In [142]:
#!g1.1

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as T
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision.models import resnet18, resnet50

from PIL import Image

import numpy as np
import csv
import os

from tqdm.notebook import tqdm
import wandb

import gc

In [143]:
#!g1.1
def empty_cache():
    """Free unreachable Python objects, then release PyTorch's cached CUDA memory.

    Returns:
        None.
    """
    # Collect first: objects that are only kept alive by reference cycles are
    # destroyed here, so the memory they held can be released by the call below
    # instead of staying cached until the next invocation.
    gc.collect()
    torch.cuda.empty_cache()

In [144]:
#!g1.1
# Single seed shared by the random number generators seeded below.
SEED = 42
# Seed NumPy's global generator.
np.random.seed(SEED)
# Seed PyTorch's default generator. As the last expression of the cell, its
# return value (the generator object) is what the cell echoes as output.
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f4ca4482910>

# Datasets and data loaders

In [145]:
#!g1.1

class ImagesDataset(Dataset):
    """Labelled image dataset backed by files on disk.

    Args:
        img_dir: Directory that the file names in ``labels`` are joined onto.
        labels: Iterable of entries indexed as ``entry[0]`` (file name) and
            ``entry[1]`` (class id, converted with ``int``).
        transform: Optional callable applied to the loaded RGB image.
    """

    def __init__(self, img_dir, labels, transform=None):
        # Paths and integer targets are resolved once, up front.
        self.samples = [
            (os.path.join(img_dir, entry[0]), int(entry[1]))
            for entry in labels
        ]
        self.transform = transform

    def __len__(self):
        """Number of (path, label) pairs."""
        return len(self.samples)

    def __getitem__(self, index):
        """Load sample ``index`` and return ``(image, label)``."""
        path, target = self.samples[index]
        image = Image.open(path).convert('RGB')

        if not self.transform:
            return image, target
        return self.transform(image), target

In [146]:
#!g1.1

class ImagesTestDataset(Dataset):
    """Unlabelled image dataset built from the regular files directly inside ``img_dir``.

    Args:
        img_dir: Directory to scan (not recursive).
        transform: Optional callable applied to the loaded RGB image.

    Each item is ``(image, filename)`` so predictions can be matched back to files.
    """

    def __init__(self, img_dir, transform=None):
        self.samples = []

        # os.listdir returns entries in arbitrary order; sorting makes the sample
        # order reproducible from run to run.
        for filename in sorted(os.listdir(img_dir)):
            img_path = os.path.join(img_dir, filename)
            # Skip sub-directories (e.g. a stray ".ipynb_checkpoints"), which
            # would otherwise fail when opened as an image.
            if os.path.isfile(img_path):
                self.samples.append((img_path, filename))

        self.transform = transform

    def __len__(self):
        """Number of files found."""
        return len(self.samples)

    def __getitem__(self, index):
        """Load sample ``index`` and return ``(image, filename)``."""
        img_path, filename = self.samples[index]
        img = Image.open(img_path).convert('RGB')

        if self.transform is not None:
            img = self.transform(img)

        return img, filename

In [147]:
#!g1.1
# Root of the mounted dataset; every other location is resolved relative to it.
data_dir = '/home/jupyter/mnt/datasets/bhw1'
# Labelled images, test images and the labels CSV, in that order.
trainval_img_dir, test_img_dir, labels_filename = (
    os.path.join(data_dir, relative)
    for relative in ('trainval/trainval', 'test/test', 'labels.csv')
)

In [148]:
#!g1.1

def loaders(trainval_img_dir, labels_filename, train_transform, test_transform, batch_size=64, val_size=0.3, num_workers=0, seed=None):
    """Build the training and validation data loaders from a labels CSV.

    Args:
        trainval_img_dir: Directory containing the labelled images.
        labels_filename: CSV file whose first row is skipped (presumably a
            header) and whose remaining rows are read as (file name, class id).
        train_transform: Transform applied to training images.
        test_transform: Transform applied to validation images.
        batch_size: Batch size for both loaders.
        val_size: Fraction of the rows assigned to the validation split.
        num_workers: Worker processes per loader; 0 loads in the main process.
        seed: If given, the split uses a dedicated generator seeded with this
            value instead of PyTorch's global generator, so it no longer
            depends on how much randomness was consumed beforehand.

    Returns:
        ``(train_loader, val_loader)``; only the training loader shuffles.
    """
    # newline='' is the mode the csv module documents for files given to csv.reader.
    with open(labels_filename, "r", newline="") as labels_file:
        rows = list(csv.reader(labels_file))
    labels = np.array(rows)[1:]

    split_kwargs = {}
    if seed is not None:
        split_kwargs["generator"] = torch.Generator().manual_seed(seed)

    # Split row indices (not the rows themselves) so each subset can be built
    # with its own transform.
    train_idx, val_idx = random_split(np.arange(len(labels)), (1 - val_size, val_size), **split_kwargs)

    train_dataset = ImagesDataset(trainval_img_dir, labels[train_idx.indices], transform=train_transform)
    val_dataset = ImagesDataset(trainval_img_dir, labels[val_idx.indices], transform=test_transform)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    return train_loader, val_loader

# Model checkpointing (saving and restoring state)

In [149]:
#!g1.1
class SaveBestModel:
    """Callable that writes a checkpoint whenever the validation loss improves.

    Args:
        best_val_loss: Loss to beat; defaults to infinity so the first call saves.
    """

    def __init__(self, best_val_loss=np.inf):
        self.best_val_loss = best_val_loss

    def __call__(self, val_loss, epoch, model, optimizer, scheduler=None, model_path='model/best_model.pth'):
        """Save a checkpoint to ``model_path`` if ``val_loss`` is a new minimum.

        The checkpoint holds the epoch plus the model, optimizer and (when a
        scheduler is given) scheduler state dicts. Nothing is written otherwise.
        """
        if val_loss < self.best_val_loss:
            self.best_val_loss = val_loss

            # torch.save does not create missing directories, so a fresh
            # workspace without the target folder would fail here.
            target_dir = os.path.dirname(model_path)
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)

            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict() if scheduler is not None else {}
                }, model_path
            )
            print('New best model with loss {:.5f} is saved'.format(val_loss))

def save_model(epoch, model, optimizer, model_path='model/final_model.pth', scheduler=None):
    """Write a checkpoint unconditionally.

    Args:
        epoch: Epoch number stored in the checkpoint.
        model: Module whose ``state_dict`` is saved.
        optimizer: Optimizer whose ``state_dict`` is saved.
        model_path: Destination file; its directory is created if missing.
        scheduler: Optional LR scheduler whose ``state_dict`` is saved; an
            empty dict is stored when omitted.
    """
    target_dir = os.path.dirname(model_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # The optimizer state is always stored; it must not depend on whether a
    # scheduler happens to exist.
    torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict() if scheduler is not None else {}
                }, model_path
    )
    print('Model is saved')
    
def load_model(model, optimizer, scheduler=None, model_path='model/best_model.pth', map_location=None):
    """Restore model, optimizer and (optionally) scheduler state from a checkpoint.

    Args:
        model: Module to load ``model_state_dict`` into.
        optimizer: Optimizer to load ``optimizer_state_dict`` into.
        scheduler: Optional scheduler; restored only when the checkpoint
            carries a non-empty ``scheduler_state_dict``.
        model_path: Checkpoint file to read.
        map_location: Forwarded to ``torch.load``; pass e.g. ``'cpu'`` to open
            a checkpoint saved on a GPU that is not available here.

    Returns:
        ``(model, optimizer, epoch, scheduler)`` — the objects passed in, plus
        the epoch stored in the checkpoint.
    """
    # NOTE: torch.load unpickles the file; only open checkpoints you trust.
    checkpoint = torch.load(model_path, map_location=map_location)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']

    if scheduler is not None:
        # The key can be absent or hold an empty dict (checkpoint written
        # without a scheduler); there is nothing to restore in either case.
        scheduler_state = checkpoint.get('scheduler_state_dict')
        if scheduler_state:
            scheduler.load_state_dict(scheduler_state)

    return model, optimizer, epoch, scheduler

# Training/Validation procedures

In [150]:
#!g1.1

def train_epoch(model, optimizer, criterion, train_loader, device, tqdm_desc):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Network to train; switched to train mode.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(logits, labels)``.
        train_loader: Iterable of ``(images, labels)`` batches with a ``dataset`` attribute.
        device: Device the batches are moved to.
        tqdm_desc: Caption for the progress bar.

    Returns:
        ``(accuracy, loss)``, both divided by ``len(train_loader.dataset)``.
    """
    model.train()
    correct_total = 0.0
    loss_total = 0.0

    for batch_images, batch_targets in tqdm(train_loader, desc=tqdm_desc):
        batch_images, batch_targets = batch_images.to(device), batch_targets.to(device)

        optimizer.zero_grad()
        outputs = model(batch_images)
        batch_loss = criterion(outputs, batch_targets)
        batch_loss.backward()
        optimizer.step()

        predicted = outputs.argmax(dim=1)
        correct_total += (predicted == batch_targets).sum().item()
        # Weight by batch size before summing (assumes the criterion returns a
        # per-batch mean — TODO confirm for non-default reductions).
        loss_total += batch_loss.item() * batch_targets.shape[0]

    n_samples = len(train_loader.dataset)
    correct_total /= n_samples
    loss_total /= n_samples

    return correct_total, loss_total


@torch.no_grad()
def val_epoch(model, criterion, val_loader, device, tqdm_desc):
    """Evaluate ``model`` on ``val_loader`` with gradients disabled.

    Args:
        model: Network to evaluate; switched to eval mode.
        criterion: Loss called as ``criterion(logits, labels)``.
        val_loader: Iterable of ``(images, labels)`` batches with a ``dataset`` attribute.
        device: Device the batches are moved to.
        tqdm_desc: Caption for the progress bar.

    Returns:
        ``(accuracy, loss)``, both divided by ``len(val_loader.dataset)``.
    """
    model.eval()
    correct_total = 0.0
    loss_total = 0.0

    for batch_images, batch_targets in tqdm(val_loader, desc=tqdm_desc):
        batch_images, batch_targets = batch_images.to(device), batch_targets.to(device)

        outputs = model(batch_images)
        batch_loss = criterion(outputs, batch_targets)

        predicted = outputs.argmax(dim=1)
        correct_total += (predicted == batch_targets).sum().item()
        # Weight by batch size before summing (assumes the criterion returns a
        # per-batch mean — TODO confirm for non-default reductions).
        loss_total += batch_loss.item() * batch_targets.shape[0]

    n_samples = len(val_loader.dataset)
    correct_total /= n_samples
    loss_total /= n_samples

    return correct_total, loss_total


def train(model, optimizer, criterion, scheduler, train_loader, val_loader, device, num_epochs, model_saver, continue_training=True, model_path='model/best_model.pth', start_epoch=0):
    """Train and validate for epochs ``start_epoch + 1`` through ``num_epochs``.

    Args:
        model, optimizer, criterion: Objects forwarded to the epoch helpers.
        scheduler: LR scheduler or ``None``.
        train_loader, val_loader: Data loaders for the two phases.
        device: Device the batches are moved to.
        num_epochs: Number of the last epoch to run (inclusive).
        model_saver: Callable invoked after every epoch as
            ``model_saver(val_loss, epoch, model, optimizer, scheduler, model_path)``.
        continue_training: When true, state is first restored from
            ``model_path`` and ``start_epoch`` is replaced by the stored epoch.
        model_path: Checkpoint path used for both resuming and saving.
        start_epoch: Epoch already completed; used only when not resuming.
    """
    if continue_training:
        # NOTE(review): model_saver's best loss is not stored in the checkpoint,
        # so a freshly constructed saver will overwrite the file after the first
        # resumed epoch regardless of the earlier best — confirm this is intended.
        model, optimizer, start_epoch, scheduler = load_model(model, optimizer, scheduler, model_path)

    for epoch in range(start_epoch + 1, num_epochs + 1):
        train_acc, train_loss = train_epoch(model, optimizer, criterion, train_loader, device, f'Training epoch {epoch}/{num_epochs}')
        val_acc, val_loss = val_epoch(model, criterion, val_loader, device, f'Validating epoch {epoch}/{num_epochs}')

        if scheduler is not None:
            # Only ReduceLROnPlateau takes the monitored metric; for the other
            # schedulers the positional argument of step() is not a metric.
            if isinstance(scheduler, optim.lr_scheduler.ReduceLROnPlateau):
                scheduler.step(val_loss)
            else:
                scheduler.step()

        wandb.log({'train_loss': train_loss, 'train_acc': train_acc, 'val_loss': val_loss, 'val_acc': val_acc})
        model_saver(val_loss, epoch, model, optimizer, scheduler, model_path)

# Models

In [151]:
#!g1.1
class TestNet(nn.Module):
    """
        Model for checking that training pipeline is working

        A single linear layer applied to the flattened input.

        Args:
            in_features: Size of one flattened sample; defaults to a
                3 x 64 x 64 image.
            num_classes: Number of output logits.
    """
    def __init__(self, in_features=3 * 64 * 64, num_classes=200):
        super().__init__()

        self.fc = nn.Linear(in_features, num_classes)

    def forward(self, x):
        # flatten (unlike view) also accepts non-contiguous inputs.
        x = torch.flatten(x, start_dim=1)
        return self.fc(x)
    
class Resnet18(nn.Module):
    """torchvision ResNet-18 without pretrained weights and with a replaced classifier.

    Args:
        num_classes: Number of output logits. When omitted, the notebook-level
            ``n_classes`` is read at construction time.
    """
    def __init__(self, num_classes=None):
        super().__init__()
        if num_classes is None:
            num_classes = n_classes

        # No arguments: the constructor's default is no pretrained weights, and
        # this avoids the deprecated ``pretrained=`` keyword.
        self.model = resnet18()
        # Take the width from the backbone rather than hard-coding it.
        self.model.fc = nn.Linear(in_features=self.model.fc.in_features, out_features=num_classes)

    def forward(self, x):
        return self.model(x)
    
class Resnet50(nn.Module):
    """torchvision ResNet-50 without pretrained weights and with a four-layer MLP head.

    Args:
        num_classes: Number of output logits. When omitted, the notebook-level
            ``n_classes`` is read at construction time.
    """
    def __init__(self, num_classes=None):
        super().__init__()
        if num_classes is None:
            num_classes = n_classes

        # No arguments: the constructor's default is no pretrained weights, and
        # this avoids the deprecated ``pretrained=`` keyword.
        self.model = resnet50()
        # Layer order inside the Sequential is unchanged so parameter names in
        # existing checkpoints still match.
        self.model.fc = nn.Sequential(
            nn.Linear(in_features=self.model.fc.in_features, out_features=1024),
            nn.ReLU(),
            nn.Linear(in_features=1024, out_features=512),
            nn.ReLU(),
            nn.Linear(in_features=512, out_features=256),
            nn.ReLU(),
            nn.Linear(in_features=256, out_features=num_classes)
        )

    def forward(self, x):
        return self.model(x)
    
class EnhancedResnet50(nn.Module):
    """torchvision ResNet-50 without pretrained weights and with a four-layer MLP head.

    NOTE(review): the architecture is the same as ``Resnet50`` defined just
    above; either let this class diverge or fold the two together.

    Args:
        num_classes: Number of output logits. When omitted, the notebook-level
            ``n_classes`` is read at construction time.
    """
    def __init__(self, num_classes=None):
        super().__init__()
        if num_classes is None:
            num_classes = n_classes

        # No arguments: the constructor's default is no pretrained weights, and
        # this avoids the deprecated ``pretrained=`` keyword.
        self.model = resnet50()
        # Layer order inside the Sequential is unchanged so parameter names in
        # existing checkpoints still match.
        self.model.fc = nn.Sequential(
            nn.Linear(in_features=self.model.fc.in_features, out_features=1024),
            nn.ReLU(),
            nn.Linear(in_features=1024, out_features=512),
            nn.ReLU(),
            nn.Linear(in_features=512, out_features=256),
            nn.ReLU(),
            nn.Linear(in_features=256, out_features=num_classes)
        )

    def forward(self, x):
        return self.model(x)

# Training

In [158]:
#!g1.1
# Start a Weights & Biases run in the "bhw1" project; train() reports its
# per-epoch metrics to this run through wandb.log.
wandb.init(project="bhw1")
# One-time authentication shell command, left disabled. Keep the real API key
# out of the notebook.
#!wandb login SOME_API_KEY

0,1
train_acc,▁▂▂▄▃▅▆▇▇█
train_loss,█▇▆▄▆▃▃▂▂▁
val_acc,▁▁▂▃▃▅▄▅▇█
val_loss,█▇▅▅▅▃▄▄▂▁

0,1
train_acc,0.1046
train_loss,4.16675
val_acc,0.11967
val_loss,4.05928


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666989893333266, max=1.0)…

In [159]:
#!g1.1
# Run configuration: GPU index, number of target classes, loader batch size.
CUDA = 0
n_classes = 200
batch_size = 64

# Training pipeline: random crop and flip for augmentation, then tensor
# conversion and per-channel normalisation.
train_transform = T.Compose(
    [
        T.RandomResizedCrop(size=224, scale=(0.5, 1.0)),
        T.RandomHorizontalFlip(p=0.5),
        #T.RandomApply([T.RandomRotation(degrees=30)], p=0.5),
        T.ToTensor(),
        T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ]
)

# Evaluation pipeline: deterministic resize and centre crop, same normalisation.
test_transform = T.Compose(
    [
        T.Resize(size=256),
        T.CenterCrop(size=224),
        T.ToTensor(),
        T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ]
)

# 70 % of the labelled rows go to training, 30 % to validation.
train_loader, val_loader = loaders(
    trainval_img_dir,
    labels_filename,
    train_transform,
    test_transform,
    batch_size=batch_size,
    val_size=0.3,
)

In [None]:
#!g1.1
# Fresh saver: its best loss starts at infinity, so the first epoch always writes a checkpoint.
model_saver = SaveBestModel()

# Use the GPU selected by CUDA when one is available, otherwise fall back to the CPU.
device = torch.device(f'cuda:{CUDA}' if torch.cuda.is_available() else 'cpu')
model = EnhancedResnet50().to(device)

criterion = nn.CrossEntropyLoss()

# SGD with momentum; the learning rate is cut by 10x when the value passed to
# scheduler.step (the validation loss, see train()) stops improving.
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=0.0001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=5)
# Alternative configuration, kept disabled:
#optimizer = optim.Adam(model.parameters(), lr=1e-4)
#scheduler = None

n_epochs = 20
# Train from scratch (no checkpoint is loaded) and save the best model to model/best_model2.pth.
train(model, optimizer, criterion, scheduler, train_loader, val_loader, device, n_epochs, model_saver, continue_training=False, model_path='model/best_model2.pth')



Training epoch 1/20:   0%|          | 0/1094 [00:00<?, ?it/s]

Validating epoch 1/20:   0%|          | 0/469 [00:00<?, ?it/s]

New best model with loss 5.11582 is saved


Training epoch 2/20:   0%|          | 0/1094 [00:00<?, ?it/s]

In [113]:
#!g1.1
# Resume from the checkpoint at train()'s default model_path and continue up to epoch 50.
# NOTE(review): the progress bars recorded below read ".../60", so that output
# appears to come from an earlier run of this cell with a different epoch
# count — re-run the cell to refresh it.
train(model, optimizer, criterion, scheduler, train_loader, val_loader, device, 50, model_saver, continue_training=True)

Training epoch 35/60:   0%|          | 0/1094 [00:00<?, ?it/s]

Validating epoch 35/60:   0%|          | 0/469 [00:00<?, ?it/s]

Training epoch 36/60:   0%|          | 0/1094 [00:00<?, ?it/s]

Validating epoch 36/60:   0%|          | 0/469 [00:00<?, ?it/s]

Training epoch 37/60:   0%|          | 0/1094 [00:00<?, ?it/s]

Validating epoch 37/60:   0%|          | 0/469 [00:00<?, ?it/s]

Training epoch 38/60:   0%|          | 0/1094 [00:00<?, ?it/s]

Validating epoch 38/60:   0%|          | 0/469 [00:00<?, ?it/s]

Training epoch 39/60:   0%|          | 0/1094 [00:00<?, ?it/s]

Validating epoch 39/60:   0%|          | 0/469 [00:00<?, ?it/s]

Training epoch 40/60:   0%|          | 0/1094 [00:00<?, ?it/s]

Validating epoch 40/60:   0%|          | 0/469 [00:00<?, ?it/s]

Training epoch 41/60:   0%|          | 0/1094 [00:00<?, ?it/s]

Validating epoch 41/60:   0%|          | 0/469 [00:00<?, ?it/s]

New best model with loss 2.06619 is saved


Training epoch 42/60:   0%|          | 0/1094 [00:00<?, ?it/s]

Validating epoch 42/60:   0%|          | 0/469 [00:00<?, ?it/s]

Training epoch 43/60:   0%|          | 0/1094 [00:00<?, ?it/s]

Validating epoch 43/60:   0%|          | 0/469 [00:00<?, ?it/s]

Training epoch 44/60:   0%|          | 0/1094 [00:00<?, ?it/s]

Validating epoch 44/60:   0%|          | 0/469 [00:00<?, ?it/s]

Training epoch 45/60:   0%|          | 0/1094 [00:00<?, ?it/s]

Validating epoch 45/60:   0%|          | 0/469 [00:00<?, ?it/s]

Training epoch 46/60:   0%|          | 0/1094 [00:00<?, ?it/s]

Validating epoch 46/60:   0%|          | 0/469 [00:00<?, ?it/s]

Training epoch 47/60:   0%|          | 0/1094 [00:00<?, ?it/s]

Validating epoch 47/60:   0%|          | 0/469 [00:00<?, ?it/s]

Training epoch 48/60:   0%|          | 0/1094 [00:00<?, ?it/s]

Validating epoch 48/60:   0%|          | 0/469 [00:00<?, ?it/s]

Training epoch 49/60:   0%|          | 0/1094 [00:00<?, ?it/s]

Validating epoch 49/60:   0%|          | 0/469 [00:00<?, ?it/s]

Training epoch 50/60:   0%|          | 0/1094 [00:00<?, ?it/s]

Validating epoch 50/60:   0%|          | 0/469 [00:00<?, ?it/s]

Training epoch 51/60:   0%|          | 0/1094 [00:00<?, ?it/s]

Validating epoch 51/60:   0%|          | 0/469 [00:00<?, ?it/s]

Training epoch 52/60:   0%|          | 0/1094 [00:00<?, ?it/s]

Validating epoch 52/60:   0%|          | 0/469 [00:00<?, ?it/s]

Training epoch 53/60:   0%|          | 0/1094 [00:00<?, ?it/s]

Validating epoch 53/60:   0%|          | 0/469 [00:00<?, ?it/s]

Training epoch 54/60:   0%|          | 0/1094 [00:00<?, ?it/s]

Validating epoch 54/60:   0%|          | 0/469 [00:00<?, ?it/s]

Training epoch 55/60:   0%|          | 0/1094 [00:00<?, ?it/s]

KeyboardInterrupt: 

# Inference

In [114]:
#!g1.1
# Restore the best checkpoint into the existing model/optimizer objects
# (best_model is the same object as model, not a copy).
# Only the first three return values are kept: the fourth echoes the scheduler
# argument, which is None here, and unpacking it would overwrite the notebook's
# `scheduler` so that later training cells silently run without one.
# NOTE(review): this reads load_model's default path (model/best_model.pth),
# while the from-scratch training cell saves to model/best_model2.pth — confirm
# which checkpoint is intended.
best_model, optimizer, start_epoch = load_model(model, optimizer)[:3]

In [115]:
#!g1.1
# Unlabelled test images, preprocessed with the same deterministic pipeline used for validation.
test_dataset = ImagesTestDataset(test_img_dir, transform=test_transform)
# No shuffling: batches follow the order of test_dataset.samples.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [116]:
#!g1.1
@torch.no_grad()
def predict(model, test_loader, device):
    """Predict a class for every image yielded by ``test_loader``.

    Gradients are disabled for the whole call, matching ``val_epoch``; without
    this, every forward pass would also record an autograd graph.

    Args:
        model: Network to run; switched to eval mode.
        test_loader: Iterable of ``(images, filenames)`` batches.
        device: Device the image batches are moved to.

    Returns:
        A list of rows starting with the header ``["Id", "Label"]`` followed by
        one ``[filename, predicted_class]`` row per image.
    """
    model.eval()
    labels = [["Id", "Label"]]

    for images, filenames in tqdm(test_loader, desc='Testing'):
        images = images.to(device)

        logits = model(images)
        # One transfer per batch instead of one .item() call per prediction.
        preds = logits.argmax(dim=1).tolist()

        labels.extend([filename, pred] for filename, pred in zip(filenames, preds))

    return labels

In [117]:
#!g1.1
# Submission rows: a ["Id", "Label"] header followed by one [filename, class] row per test image.
labels = predict(best_model, test_loader, device)

Testing:   0%|          | 0/157 [00:00<?, ?it/s]

In [118]:
#!g1.1
# Write the predictions (header row included) to the submission file.
# newline='' is the mode the csv module documents for files given to
# csv.writer; without it, platforms that translate line endings emit an extra
# blank line after every row.
with open('labels_test.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(labels)

In [105]:
#!g1.1
#print(val_epoch(best_model, criterion, val_loader, device, "Validating"))