In [81]:
import torch
import torchvision
from torchvision import datasets, transforms
import torch.nn as nn
from torchvision import models
import torch.optim as optim

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

train_dataset = datasets.VOCDetection(root='data', year='2007', image_set='train', download=True, transform=transform)
val_dataset = datasets.VOCDetection(root='data', year='2007', image_set='val', download=True, transform=transform)

VOC_CLASSES = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car',
               'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike',
               'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor']

def multi_label_target(annotation):
    labels = [0] * len(VOC_CLASSES)
    for obj in annotation['annotation']['object']:
        class_name = obj['name']
        if class_name in VOC_CLASSES:
            labels[VOC_CLASSES.index(class_name)] = 1
    return torch.tensor(labels, dtype=torch.float32)

class VOCDataset(torch.utils.data.Dataset):
    def __init__(self, voc_dataset):
        self.voc_dataset = voc_dataset

    def __len__(self):
        return len(self.voc_dataset)

    def __getitem__(self, idx):
        image, target = self.voc_dataset[idx]
        labels = multi_label_target(target)
        return image, labels

train_dataset = VOCDataset(train_dataset)
val_dataset = VOCDataset(val_dataset)


Using downloaded and verified file: data/VOCtrainval_06-Nov-2007.tar
Extracting data/VOCtrainval_06-Nov-2007.tar to data
Using downloaded and verified file: data/VOCtrainval_06-Nov-2007.tar
Extracting data/VOCtrainval_06-Nov-2007.tar to data


In [91]:

class MyResNet(nn.Module):
    def __init__(self, num_classes):
        super(MyResNet, self).__init__()
        self.resnet = models.resnet50(pretrained=True)
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.resnet(x)
        return x

In [98]:
from tqdm import tqdm
import torch.optim as optim
import torch.nn as nn
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

model1 = MyResNet(num_classes=len(VOC_CLASSES))
optimizer = optim.Adam(model1.parameters(), lr=3e-4)
loss_fn = nn.CrossEntropyLoss()
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=16, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model1.to(device)

for epoch in range(3):
    print(f"Epoch {epoch + 1}:")
    model1.train()
    for batch_item in tqdm(train_dataloader, desc="training"):
        images = batch_item[0].to(device)
        labels = batch_item[1].to(device)

        # Forward pass
        res = model1(images)
        loss = loss_fn(res, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Evaluation loop
    predicted = []
    gts = []
    model1.eval()


print("Finished Training")


Epoch 1:


training: 100%|██████████| 157/157 [00:29<00:00,  5.36it/s]


Epoch 2:


training: 100%|██████████| 157/157 [00:26<00:00,  5.82it/s]


Epoch 3:


training: 100%|██████████| 157/157 [00:27<00:00,  5.81it/s]

Finished Training



