In [34]:
#!g1.1

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as T
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision.models import resnet18, resnet50
from torch.optim.swa_utils import AveragedModel

from PIL import Image

import numpy as np
import csv
import os

from tqdm import tqdm
import wandb

import gc

from skimage.io import imshow
import matplotlib.pyplot as plt

import sys

# Fix the global RNG seeds of PyTorch and NumPy so that runs are repeatable.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

#!g1.1
def empty_cache():
    """Release memory that is no longer referenced, on the host and on the GPU.

    The Python garbage collector runs first so that tensors kept alive only by
    reference cycles are destroyed; only then is PyTorch's CUDA caching
    allocator asked to give back its unused blocks. Doing it the other way
    round would leave the memory of freshly collected tensors in the cache.
    """
    gc.collect()
    torch.cuda.empty_cache()

class ImagesDataset(Dataset):
    """Labelled image dataset built from ``(filename, label)`` rows.

    Args:
        img_dir: Directory the filenames are relative to.
        labels: Iterable of rows; item 0 is the image filename and item 1 is
            the class id (anything ``int()`` accepts).
        transform: Optional callable applied to the loaded PIL image.
    """

    def __init__(self, img_dir, labels, transform=None):
        self.transform = transform
        # Only paths and integer labels are kept; images are opened on access.
        self.samples = [
            (os.path.join(img_dir, row[0]), int(row[1])) for row in labels
        ]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        """Return ``(image, label)`` for the sample stored at ``index``."""
        path, target = self.samples[index]
        image = Image.open(path).convert('RGB')

        if not self.transform:
            return image, target
        return self.transform(image), target
    
#!g1.1

class ImagesTestDataset(Dataset):
    """Unlabelled image dataset covering every entry of a directory.

    Args:
        img_dir: Directory whose entries (as returned by ``os.listdir``) are
            treated as images.
        transform: Optional callable applied to the loaded PIL image.
    """

    def __init__(self, img_dir, transform=None):
        self.transform = transform
        # Keep the bare filename next to the full path: it is what gets
        # returned alongside the image.
        self.samples = [
            (os.path.join(img_dir, entry), entry) for entry in os.listdir(img_dir)
        ]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        """Return ``(image, filename)`` for the sample stored at ``index``."""
        path, name = self.samples[index]
        image = Image.open(path).convert('RGB')

        if not self.transform:
            return image, name
        return self.transform(image), name
    
def read_labels(labels_filename):
    """Load a label CSV and return its rows without the first (header) line.

    Args:
        labels_filename: Path to the CSV file.

    Returns:
        NumPy array of the parsed rows after the first one; cells are the
        strings produced by ``csv.reader``.
    """
    with open(labels_filename, "r") as labels_file:
        rows = list(csv.reader(labels_file))
    # Convert first and slice afterwards so the header is dropped from the array.
    return np.array(rows)[1:]

#!g1.1
def loaders(trainval_img_dir, labels_filename, train_transform, test_transform, batch_size=64, val_size=0.3):
    """Build the training and validation data loaders from one labelled folder.

    Args:
        trainval_img_dir: Directory containing the labelled images.
        labels_filename: CSV file read with ``read_labels``.
        train_transform: Transform applied to training images.
        test_transform: Transform applied to validation images.
        batch_size: Batch size of both loaders.
        val_size: Fraction of the rows assigned to validation.

    Returns:
        ``(train_loader, val_loader)``; only the training loader shuffles.
    """
    labels = read_labels(labels_filename)
    n_total = len(labels)

    # Explicit integer lengths: the form the original marks as working on
    # "torch 12". On "torch 13" fractions could be passed instead:
    #   random_split(np.arange(len(labels)), (1 - val_size, val_size))
    n_val = int(val_size * n_total)
    n_train = n_total - n_val
    train_part, val_part = random_split(np.arange(n_total), (n_train, n_val))

    # The split yields row indices, which select the label rows of each part.
    train_dataset = ImagesDataset(trainval_img_dir, labels[train_part.indices], transform=train_transform)
    val_dataset = ImagesDataset(trainval_img_dir, labels[val_part.indices], transform=test_transform)

    return (
        DataLoader(train_dataset, batch_size=batch_size, shuffle=True),
        DataLoader(val_dataset, batch_size=batch_size, shuffle=False),
    )

def subset_loaders(trainval_img_dir, labels_filename, train_transform, test_transform, batch_size=64, val_size=0.3, ratio=0.5):
    """Build train/validation loaders over a reduced share of each split.

    Intended for quick experiments: the data is split exactly like a full
    train/validation split, and then only the first ``ratio`` share of each
    part is kept.

    Args:
        trainval_img_dir: Directory containing the labelled images.
        labels_filename: CSV file read with ``read_labels``.
        train_transform: Transform applied to training images.
        test_transform: Transform applied to validation images.
        batch_size: Batch size of both loaders.
        val_size: Fraction of the rows assigned to validation before reduction.
        ratio: Fraction of each split that is kept.

    Returns:
        ``(train_loader, val_loader)``; only the training loader shuffles.
    """
    labels = read_labels(labels_filename)

    # Explicit integer lengths (the "torch 12"-compatible form of random_split).
    num_val_labels = int(val_size * len(labels))
    train_split, val_split = random_split(
        np.arange(len(labels)), (len(labels) - num_val_labels, num_val_labels)
    )

    # Slice the shuffled row indices of each split directly. Wrapping the
    # splits in another Subset and reading ``.indices`` would yield positions
    # 0..k-1 instead of row indices, and the validation part must be cut from
    # the validation split (not the training one) to keep both disjoint.
    train_indices = train_split.indices[:int(len(train_split) * ratio)]
    val_indices = val_split.indices[:int(len(val_split) * ratio)]

    train_dataset = ImagesDataset(trainval_img_dir, labels[train_indices], transform=train_transform)
    val_dataset = ImagesDataset(trainval_img_dir, labels[val_indices], transform=test_transform)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

    return train_loader, val_loader

#!g1.1
class SaveBestModel:
    """Checkpoint writer that saves only when the validation loss improves.

    Args:
        best_val_loss: Loss a candidate has to beat; infinity by default so the
            first call always saves.
    """

    def __init__(self, best_val_loss=np.inf):
        self.best_val_loss = best_val_loss

    def __call__(self, val_loss, epoch, model, optimizer, scheduler=None, model_path='model/best_model.pth'):
        """Write a checkpoint to ``model_path`` if ``val_loss`` is a new minimum.

        The checkpoint holds the epoch plus the model, optimizer and scheduler
        state dicts (an empty dict when no scheduler is given).
        """
        improved = val_loss < self.best_val_loss
        if not improved:
            return

        self.best_val_loss = val_loss
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict() if scheduler else {},
        }
        torch.save(checkpoint, model_path)
        print('New best model with loss {:.5f} is saved'.format(val_loss))

def save_model(epoch, model, optimizer, scheduler=None, model_path='model/final_model.pth'):
    """Unconditionally write a training checkpoint to ``model_path``.

    Args:
        epoch: Epoch number stored in the checkpoint.
        model: Module whose ``state_dict`` is saved.
        optimizer: Optimizer whose ``state_dict`` is saved.
        scheduler: Optional scheduler; an empty dict is stored when missing.
        model_path: Destination file.
    """
    scheduler_state = scheduler.state_dict() if scheduler else {}
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler_state,
    }
    torch.save(checkpoint, model_path)
    print('Model is saved')
    
def load_model(model, optimizer, scheduler=None, model_path='model/best_model.pth'):
    """Restore training state from a checkpoint with the four keys
    ``epoch``, ``model_state_dict``, ``optimizer_state_dict`` and
    ``scheduler_state_dict``.

    Args:
        model: Module that receives the saved weights (updated in place).
        optimizer: Optimizer that receives the saved state (updated in place).
        scheduler: Optional scheduler to restore as well.
        model_path: Checkpoint file to read.

    Returns:
        ``(model, optimizer, epoch, scheduler)`` where ``epoch`` is the epoch
        number stored in the checkpoint.
    """
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    # Deserialising to CPU lets a checkpoint written on a GPU be read on any
    # host; load_state_dict then copies the values onto whatever device the
    # model and optimizer parameters already live on.
    checkpoint = torch.load(model_path, map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    # A checkpoint written without a scheduler stores an empty dict, which
    # composite schedulers cannot load; keep the scheduler untouched then.
    scheduler_state = checkpoint.get('scheduler_state_dict')
    if scheduler is not None and scheduler_state:
        scheduler.load_state_dict(scheduler_state)

    return model, optimizer, checkpoint['epoch'], scheduler

#!g1.1

def train_epoch(model, optimizer, criterion, train_loader, device, tqdm_desc):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Network to train; switched to training mode.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(logits, labels)``.
        train_loader: Loader yielding ``(images, labels)`` batches.
        device: Device the batches are moved to.
        tqdm_desc: Caption of the progress bar.

    Returns:
        ``(accuracy, loss)`` averaged over ``len(train_loader.dataset)``.
    """
    model.train()
    n_correct, loss_sum = 0.0, 0.0

    for batch_images, batch_labels in tqdm(train_loader, desc=tqdm_desc):
        batch_images, batch_labels = batch_images.to(device), batch_labels.to(device)

        optimizer.zero_grad()
        outputs = model(batch_images)
        batch_loss = criterion(outputs, batch_labels)
        batch_loss.backward()
        optimizer.step()

        predicted = outputs.argmax(dim=1)
        n_correct += (predicted == batch_labels).sum().item()
        # Weight the batch loss by the batch size (presumably the criterion is
        # mean-reduced) so the final division gives a per-sample average.
        loss_sum += batch_loss.item() * batch_labels.shape[0]

    n_samples = len(train_loader.dataset)
    return n_correct / n_samples, loss_sum / n_samples


def val_epoch(model, criterion, val_loader, device, tqdm_desc):
    """Evaluate ``model`` on ``val_loader`` without tracking gradients.

    Args:
        model: Network to evaluate; switched to evaluation mode.
        criterion: Loss called as ``criterion(logits, labels)``.
        val_loader: Loader yielding ``(images, labels)`` batches.
        device: Device the batches are moved to.
        tqdm_desc: Caption of the progress bar.

    Returns:
        ``(accuracy, loss)`` averaged over ``len(val_loader.dataset)``.
    """
    with torch.no_grad():
        model.eval()
        n_correct, loss_sum = 0.0, 0.0

        for batch_images, batch_labels in tqdm(val_loader, desc=tqdm_desc):
            batch_images, batch_labels = batch_images.to(device), batch_labels.to(device)

            outputs = model(batch_images)
            batch_loss = criterion(outputs, batch_labels)

            predicted = outputs.argmax(dim=1)
            n_correct += (predicted == batch_labels).sum().item()
            # Weighted by batch size so the division below is per sample.
            loss_sum += batch_loss.item() * batch_labels.shape[0]

        n_samples = len(val_loader.dataset)
        return n_correct / n_samples, loss_sum / n_samples


def train(model, optimizer, criterion, scheduler, train_loader, val_loader, device, num_epochs, model_saver, continue_training=True, model_path='model/best_model.pth', start_epoch=0):
    """Train and validate for epochs ``start_epoch + 1`` through ``num_epochs``.

    Args:
        model, optimizer, criterion: Passed to ``train_epoch`` / ``val_epoch``.
        scheduler: Optional LR scheduler, stepped once per epoch without
            arguments.
        train_loader, val_loader: Data loaders for the two phases.
        device: Device the batches are moved to.
        num_epochs: Number of the last epoch to run (inclusive).
        model_saver: Callable invoked after each epoch as
            ``model_saver(val_loss, epoch, model, optimizer, scheduler, model_path)``.
        continue_training: When true, state and ``start_epoch`` are first
            restored from ``model_path`` via ``load_model``.
        model_path: Checkpoint location used for resuming and saving.
        start_epoch: Epoch already completed; overridden when resuming.
    """
    if continue_training:
        model, optimizer, start_epoch, scheduler = load_model(model, optimizer, scheduler, model_path)

    first_epoch = start_epoch + 1
    for epoch in range(first_epoch, num_epochs + 1):
        train_acc, train_loss = train_epoch(
            model, optimizer, criterion, train_loader, device,
            f'Training epoch {epoch}/{num_epochs}',
        )
        val_acc, val_loss = val_epoch(
            model, criterion, val_loader, device,
            f'Validating epoch {epoch}/{num_epochs}',
        )

        # A metric-driven scheduler would need scheduler.step(val_loss) here.
        if scheduler is not None:
            scheduler.step()

        metrics = {'train_loss': train_loss, 'train_acc': train_acc, 'val_loss': val_loss, 'val_acc': val_acc}
        print({'epoch': epoch, **metrics})
        wandb.log(metrics)
        model_saver(val_loss, epoch, model, optimizer, scheduler, model_path)

def params_split_for_wd(model, verbose=False):
    """Split parameters into a weight-decay group and a no-decay group.

    Only ``nn.Linear``, ``nn.Conv2d``, ``nn.BatchNorm2d`` and
    ``nn.BatchNorm1d`` modules are inspected. Weights of linear and
    convolution layers go to the decay group; every bias and the batch-norm
    weights go to the no-decay group. Parameters of other modules appear in
    neither list.

    Args:
        model: Module whose parameters are partitioned.
        verbose: When true, print the parameter names of both groups.

    Returns:
        ``(wd_params, no_wd_params)`` as lists of parameter tensors.
    """
    decay_types = (nn.Linear, nn.Conv2d)
    norm_types = (nn.BatchNorm2d, nn.BatchNorm1d)
    inspected_types = decay_types + norm_types

    white_list_wd, black_list_wd = [], []
    for module_name, module in model.named_modules():
        if not isinstance(module, inspected_types):
            continue
        for param_name, _ in module.named_parameters():
            # The root module has an empty name, so no prefix is added for it.
            full_name = f"{module_name}.{param_name}" if module_name else param_name

            if full_name.endswith("bias"):
                black_list_wd.append(full_name)
            elif full_name.endswith("weight"):
                if isinstance(module, decay_types):
                    white_list_wd.append(full_name)
                elif isinstance(module, norm_types):
                    black_list_wd.append(full_name)

    if verbose:
        print("White list WD")
        print(white_list_wd)
        print("Black list WD")
        print(black_list_wd)

    # Resolve the collected names back to the parameter objects.
    params_by_name = dict(model.named_parameters())
    wd_params = [params_by_name[key] for key in white_list_wd]
    no_wd_params = [params_by_name[key] for key in black_list_wd]

    return wd_params, no_wd_params

#!g1.1
class TestNet(nn.Module):
    """
        Minimal single-layer model used to check that the training pipeline runs.

        Each input is flattened to one vector per sample and mapped by a single
        linear layer from 3 * 224 * 224 features to 200 outputs.
    """
    def __init__(self):
        super().__init__()

        flat_features = 3 * 224 * 224
        self.fc = nn.Linear(flat_features, 200)

    def forward(self, x):
        batch = x.size(0)
        flattened = x.view(batch, -1)
        return self.fc(flattened)
    
class Resnet18(nn.Module):
    """Randomly initialised ResNet-18 with its final layer resized to the task.

    Args:
        num_classes: Number of output classes. When omitted, the module-level
            ``n_classes`` is read at construction time, as before.
    """
    def __init__(self, num_classes=None):
        super().__init__()
        if num_classes is None:
            num_classes = n_classes
        # Building without arguments gives untrained weights and avoids the
        # ``pretrained=`` keyword, which recent torchvision releases deprecate.
        self.model = resnet18()
        # Take the input width from the stock head rather than hard-coding it.
        self.model.fc = nn.Linear(in_features=self.model.fc.in_features, out_features=num_classes)

    def forward(self, x):
        return self.model(x)
    
class Resnet50(nn.Module):
    """Randomly initialised ResNet-50 whose final layer is replaced by an MLP head.

    The head narrows the backbone features through 1024, 512 and 256 units
    (ReLU after each) before the classification layer.

    Args:
        num_classes: Number of output classes. When omitted, the module-level
            ``n_classes`` is read at construction time, as before.
    """
    def __init__(self, num_classes=None):
        super().__init__()
        if num_classes is None:
            num_classes = n_classes
        # Building without arguments gives untrained weights and avoids the
        # ``pretrained=`` keyword, which recent torchvision releases deprecate.
        self.model = resnet50()
        # Take the input width from the stock head rather than hard-coding it.
        backbone_features = self.model.fc.in_features
        self.model.fc = nn.Sequential(
            nn.Linear(in_features=backbone_features, out_features=1024),
            nn.ReLU(),
            nn.Linear(in_features=1024, out_features=512),
            nn.ReLU(),
            nn.Linear(in_features=512, out_features=256),
            nn.ReLU(),
            nn.Linear(in_features=256, out_features=num_classes)
        )

    def forward(self, x):
        return self.model(x)
    
class EnhancedResnet50(nn.Module):
    """Randomly initialised ResNet-50 with a batch-normalised MLP head.

    Same layout as the plain MLP head (1024 -> 512 -> 256 -> classes) with a
    ``BatchNorm1d`` inserted between every hidden linear layer and its ReLU.
    The layer order is kept fixed so existing checkpoints keep matching the
    ``state_dict`` keys.

    Args:
        num_classes: Number of output classes. When omitted, the module-level
            ``n_classes`` is read at construction time, as before.
    """
    def __init__(self, num_classes=None):
        super().__init__()
        if num_classes is None:
            num_classes = n_classes
        # Building without arguments gives untrained weights and avoids the
        # ``pretrained=`` keyword, which recent torchvision releases deprecate.
        self.model = resnet50()
        # Take the input width from the stock head rather than hard-coding it.
        backbone_features = self.model.fc.in_features
        self.model.fc = nn.Sequential(
            nn.Linear(in_features=backbone_features, out_features=1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Linear(in_features=1024, out_features=512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Linear(in_features=512, out_features=256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Linear(in_features=256, out_features=num_classes)
        )

    def forward(self, x):
        return self.model(x)

def train_model():
    """Train the module-level ``model`` from epoch 1 and checkpoint the best epoch.

    Reads the module-level configuration (``model``, ``device``, data paths,
    transforms, ``batch_size``). Uses SGD with weight decay applied only to
    the group returned first by ``params_split_for_wd``, a linear warm-up
    followed by cosine annealing, label-smoothed cross entropy, and logs to
    Weights & Biases. The best checkpoint is written to
    ``model/best_model.pth``.
    """
    model_saver = SaveBestModel()

    print("Get loaders...")
    train_loader, val_loader = loaders(trainval_img_dir, labels_filename, train_transform, test_transform, batch_size=batch_size, val_size=0.3)
    # For a quick smoke run on a small share of the data use instead:
    #train_loader, val_loader = subset_loaders(trainval_img_dir, labels_filename, train_transform, test_transform, batch_size=batch_size, val_size=0.3, ratio=0.01)

    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

    wd_params, no_wd_params = params_split_for_wd(model, verbose=False)
    optimizer = torch.optim.SGD([
            {"params": wd_params, "weight_decay": 2e-05},
            {"params": no_wd_params, "weight_decay": 0.0},
        ], lr=1e-3, momentum=0.9)

    # Alternative setup that was tried:
    #scheduler = None
    #optimizer = optim.AdamW(model.parameters(), lr=1e-4)

    n_epochs = 100
    lr_warmup_epochs = 5
    lr_warmup_decay = 0.01

    # Warm up linearly from lr * lr_warmup_decay, then anneal to zero over the
    # remaining epochs.
    warmup_lr_scheduler = optim.lr_scheduler.LinearLR(optimizer, start_factor=lr_warmup_decay, total_iters=lr_warmup_epochs)
    main_lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=n_epochs - lr_warmup_epochs, eta_min=0)

    scheduler = optim.lr_scheduler.SequentialLR(
        optimizer, schedulers=[warmup_lr_scheduler, main_lr_scheduler], milestones=[lr_warmup_epochs], verbose=True
    )

    #scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=5)

    print("Training start...")
    # SECURITY: an API key is hard-coded as the fallback below. Prefer
    # providing it through the WANDB_API_KEY environment variable, and rotate
    # the committed key.
    wandb.login(key=os.environ.get("WANDB_API_KEY", "91898ab676432e8d5689a2ce4a88f7131dc1e45c"))
    wandb.init(project="bhw1")

    # exist_ok: re-running the cell must not fail because the folder is there.
    os.makedirs('model', exist_ok=True)
    train(model, optimizer, criterion, scheduler, train_loader, val_loader, device, n_epochs, model_saver, continue_training=False, model_path='model/best_model.pth')


def predict(model, test_loader, device):
    """Predict a class for every item yielded by ``test_loader``.

    Args:
        model: Network to run; switched to evaluation mode.
        test_loader: Loader yielding ``(images, filenames)`` batches.
        device: Device the images are moved to.

    Returns:
        List of rows starting with the header ``["Id", "Label"]`` followed by
        one ``[filename, predicted_class]`` row per image.
    """
    with torch.no_grad():
        model.eval()
        rows = [["Id", "Label"]]

        for batch_images, batch_names in tqdm(test_loader, desc='Testing'):
            scores = model(batch_images.to(device))
            # The arg-max over dim 1 of the output is the predicted class id.
            class_ids = scores.argmax(dim=1).tolist()
            rows.extend([name, class_id] for name, class_id in zip(batch_names, class_ids))

        return rows

def test_model():
    """Run the module-level ``model`` on the test images and write the result.

    Builds a loader over ``test_img_dir`` with ``test_transform``, collects
    the predictions with ``predict`` and writes them, header row included, to
    ``labels_test.csv`` in the working directory.
    """
    print("Get test loader...")
    test_dataset = ImagesTestDataset(test_img_dir, transform=test_transform)
    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

    print("Predicting...")
    labels = predict(model, test_loader, device)

    print("Writing to file...")
    # newline='' lets the csv module control line endings itself; without it
    # an extra blank line follows every row on platforms that translate '\n'.
    with open('labels_test.csv', 'w', newline='') as csvfile:
        csv.writer(csvfile).writerows(labels)
    print("Finished!")

# Start from a clean memory state before anything is allocated.
empty_cache()

# Dataset layout: labelled images, unlabelled test images and the label table
# all live under one root directory.
data_dir = '../input/bhw-1-deep-learning/bhw1-dataset/'
trainval_img_dir, test_img_dir, labels_filename = (
    os.path.join(data_dir, leaf) for leaf in ('trainval/', 'test/', 'labels.csv')
)

#!g1.1
n_classes = 200
batch_size = 64

# Per-channel normalisation statistics.
# NOTE(review): the evaluation statistics differ slightly from the training
# ones; the training values are kept below as a commented alternative.
# Confirm which set is intended.
_TRAIN_MEAN = (0.5695764, 0.5449682, 0.4936079)
_TRAIN_STD = (0.24523072, 0.2391582, 0.25806385)
_TEST_MEAN = (0.5611811, 0.53794473, 0.48733008)
_TEST_STD = (0.24465169, 0.23830907, 0.25577575)

# Training pipeline: random crop/flip/augmentation on the PIL image, then
# tensor conversion, random erasing and normalisation.
train_transform = T.Compose([
    T.RandomResizedCrop(224, scale=(0.1, 1.0)),
    T.RandomHorizontalFlip(),
    T.TrivialAugmentWide(),
    # Disabled augmentations:
    #T.RandomGrayscale(p=0.1),
    #T.RandomApply([T.RandomRotation(degrees=30)], p=0.5),
    T.ToTensor(),
    T.RandomErasing(p=0.1),
    T.Normalize(mean=_TRAIN_MEAN, std=_TRAIN_STD),
])

# Evaluation pipeline: resize, central 224 crop, tensor conversion, normalisation.
test_transform = T.Compose([
    T.Resize(256),
    T.CenterCrop(224),
    T.ToTensor(),
    T.Normalize(mean=_TEST_MEAN, std=_TEST_STD)
    #T.Normalize(mean=(0.5695764, 0.5449682, 0.4936079), std=(0.24523072, 0.2391582, 0.25806385))
])

# Index of the CUDA device to use; could also be taken from the command line:
#CUDA = 0 if len(sys.argv) == 1 else int(sys.argv[1])
CUDA = 0

print(f'Trying working on device cuda:{CUDA}...')
# Fall back to the CPU when CUDA is not available.
device = torch.device(f'cuda:{CUDA}' if torch.cuda.is_available() else 'cpu')
print(device)

print("Load model...")
model = EnhancedResnet50().to(device)
# Map the checkpoint onto the device chosen above rather than a fixed
# 'cuda:N' string, so loading also works after the CPU fallback.
model.load_state_dict(torch.load('../input/model077/best_model4.pth', map_location=device)['model_state_dict'])
train_model()
# Earlier experiments, kept for reference:
#model.model.fc = nn.Linear(in_features=2048, out_features=n_classes)
#model = model.to(device)
#model = EnhancedResnet50().to(device)

Trying working on device cuda:0...
cuda:0
Load model...
Get test loader...
Predicting...


Testing: 100%|██████████| 10000/10000 [01:40<00:00, 99.26it/s]

Writing to file...
Finished!



