In [12]:
import numpy as np
import torch 
import torchvision
from torchvision import datasets, models, transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
#from torchsummary import summary
import matplotlib.pyplot as plt
import os
import time
import copy
from datetime import datetime
from collections import OrderedDict
import json
from torch.utils.tensorboard import SummaryWriter
import random
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split

In [13]:
data_dir = 'train_images'

time_run = datetime.now().strftime('%Y%m%d_%H%M%S')
save_dir = 'results/' + time_run
os.makedirs(save_dir, exist_ok=True)

transform_mean = transform_std = np.array([None, None, None])

In [14]:
df = pd.read_csv("train_labels.csv", names=["name", "label"], header=None)
df["image_path"] = df["name"].apply(lambda x: os.path.join(data_dir, x))
df = df[["image_path", "label"]]
df.head()

Unnamed: 0,image_path,label
0,train_images\train_0001.png,2012
1,train_images\train_0002.png,2003
2,train_images\train_0003.png,1994
3,train_images\train_0004.png,2014
4,train_images\train_0005.png,2003


In [15]:
df["image_path"][0]

'train_images\\train_0001.png'

In [16]:
n_epoch = 10
batch_size = 16
image_size = 256
input_size = 224
n_classes = 40

In [17]:
if transform_mean.any() == None:
    transform_mean = np.array([0.4914, 0.4822, 0.4465])
if transform_std.any() == None:
    transform_std = np.array([0.2470, 0.2435, 0.2616])


class ImageDataset(torch.utils.data.Dataset):
    def __init__(self, df, data_dir, augment=True):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        
        if augment:
            transform = transforms.Compose([
                transforms.RandomCrop(128, padding=4),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize(mean=transform_mean, std=transform_std)
            ])
        else:
            transform = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize(mean=transform_mean, std=transform_std)
            ])

        self.labels = df["label"]
        self.data_dir = data_dir
        self.transform = transform
        
    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_name = df["image_path"][idx]
        image = Image.open(img_name).convert("RGB")   
        image = self.transform(image)
        
        label = df["label"][idx] - 1979
        sample = {'image': image, 'label': label}
        return sample


train_df, val_df = train_test_split(
                    df, test_size=0.2, random_state=42, shuffle=True, stratify=df["label"]
                    )
    
    

train_dataset = ImageDataset(train_df, data_dir, augment=True)
val_dataset = ImageDataset(val_df, data_dir, augment=False)


train_loader = torch.utils.data.DataLoader(train_dataset, 
                                           batch_size=batch_size, 
                                           shuffle=True, 
                                           num_workers=0,
#                                           pin_memory=True,
                                           drop_last=True
                                          )

val_loader = torch.utils.data.DataLoader(val_dataset, 
                                           batch_size=batch_size, 
                                           shuffle=False, 
                                           num_workers=0,
#                                           pin_memory=True,
                                           drop_last=False
                                         )

train_dataset_size = len(train_dataset)
val_dataset_size = len(val_dataset)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
#device = torch.device('cpu')

In [18]:
""" 
try writing from scratch the Shake-Shake regularization (Gastaldi, 2017), SOTA error rate 2.86%
partly referred to https://github.com/hysts/pytorch_shake_shake when I had no idea
I did not implement all the features mentioned in the paper, 
but I consider I have included most juice out of it.
"""
######################## This part function not verified ###################
def initialize_weights(module):
    if isinstance(module, nn.Conv2d):
        nn.init.kaiming_normal_(module.weight.data, mode='fan_out')
    elif isinstance(module, nn.BatchNorm2d):
        module.weight.data.fill_(1)
        module.bias.data.zero_()
    elif isinstance(module, nn.Linear):
        module.bias.data.zero_()
############################################################################

class ResPath(nn.Module):
    def __init__(self, in_chan, out_chan, stride):
        super(ResPath, self).__init__()
        self.conv1 = nn.Conv2d(in_chan, out_chan, 3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_chan)
        self.conv2 = nn.Conv2d(out_chan, out_chan, 3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_chan)
        
    def forward(self, x):
        x = self.bn1(self.conv1(F.relu(x, inplace=False)))
        x = self.bn2(self.conv2(F.relu(x, inplace=False)))
        return x

class DownSamplePath(nn.Module):
    def __init__(self, in_chan, out_chan):
        super(DownSamplePath, self).__init__()
        self.conv1 = nn.Conv2d(in_chan, in_chan, 1, 1, 0, bias=False)
        self.conv2 = nn.Conv2d(in_chan, in_chan, 1, 1, 0, bias=False) # although identical, but still need to separate otherwise parameters will be mixed? NEED VERIFY
        self.bn1 = nn.BatchNorm2d(out_chan)
        
    def forward(self, x):
        x = F.relu(x, inplace=False)
        
        x1 = F.avg_pool2d(x, 1, stride=2, padding=0)
        x1 = self.conv1(x1)
        
        x2 = F.pad(x[:, :, 1:, 1:], (0, 1, 0, 1))   # not familiar with this
        x2 = F.avg_pool2d(x2, 1, stride=2, padding=0)
        x2 = self.conv2(x2)
        
        y = torch.cat([x1, x2], dim=1)
        y = self.bn1(y)
        return y
    
class ShakeFunc(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x1, x2, alpha, beta):
        ctx.save_for_backward(x1, x2, alpha, beta)
        y = x1 * alpha + x2 * (1 - alpha)
        return y
    
    @staticmethod
    def backward(ctx, grad_output):
        x1, x2, alpha, beta = ctx.saved_tensors
        grad_x1 = grad_x2 = grad_alpha = grad_beta = None
        grad_x1 = grad_output * beta
        grad_x2 = grad_output * (1 - beta)
        """ For better efficiency, use instead:
        if ctx.needs_input_grad[0]:  
            grad_x1 = grad_output * beta
        if ctx.needs_input_grad[1]:
            grad_x2 = grad_output * (1 - beta)
        """
        return grad_x1, grad_x2, grad_alpha, grad_beta
ShakeFunc = ShakeFunc.apply
    
class ResBlock(nn.Module):
    def __init__(self, in_chan, out_chan, stride):
        super(ResBlock, self).__init__()
        self.path1 = ResPath(in_chan, out_chan, stride)
        self.path2 = ResPath(in_chan, out_chan, stride)
        
        self.down_sample = nn.Sequential()
        if in_chan != out_chan:
            self.down_sample.add_module("DownSamplePath", DownSamplePath(in_chan, out_chan))
        
    def get_alpha_beta(self, batch_size, is_training, device):
        """only Shake-Shake-Image is implemented here"""
        if is_training:
            alpha = torch.rand((batch_size, 1, 1, 1))
            beta = torch.rand((batch_size, 1, 1, 1))
        else:
            alpha = torch.ones((batch_size, 1, 1, 1)) * 0.5
            beta = torch.ones((batch_size, 1, 1, 1)) * 0.5 
            #alpha = torch.FloatTensor([0.5])
            #beta = torch.FloatTensor([0.5])
        alpha = alpha.to(device)
        beta = beta.to(device)
        return alpha, beta
        
    def forward(self, x):
        x1 = self.path1(x)
        x2 = self.path2(x)
#        is_training = True if model.train() else False
        alpha, beta = self.get_alpha_beta(x.size(0), self.training, x.device)
        y = ShakeFunc(x1, x2, alpha, beta)
        return self.down_sample(x) + y    
    
class ResStage(nn.Module):
    """Might be a bit redundant to define a new class for this? I made this class just for clarity"""
    def __init__(self, in_chan, out_chan, stride, n_blocks=4):
        super(ResStage, self).__init__()
        self.stage = nn.Sequential()
        for idx in range(n_blocks):
            if idx == 0:
                self.stage.add_module("block{}".format(idx+1), ResBlock(in_chan, out_chan, stride=stride))
            else:
                self.stage.add_module("block{}".format(idx+1), ResBlock(out_chan, out_chan, stride=1))      

    def forward(self, x):
        x = self.stage(x) # this looks a bit stupid...
        return x
    
class Shakeshake(nn.Module):
    def __init__(self, n_channels, n_classes):
        super().__init__()
        self.conv1 = nn.Conv2d(3, n_channels[0], 3, 1, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(n_channels[0])
        self.stage1 = ResStage(n_channels[0], n_channels[0], stride=1)
        self.stage2 = ResStage(n_channels[0], n_channels[1], stride=2)
        self.stage3 = ResStage(n_channels[1], n_channels[2], stride=2)
        self.fc1 = nn.Linear(n_channels[2], n_classes)
    
        # initialize weights
        self.apply(initialize_weights)
    
    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)), inplace=True)
        x = self.stage1(x)
        x = self.stage2(x)
        x = self.stage3(x)
        x = F.adaptive_max_pool2d(x, 1)
        x = x.view(x.size(0), -1)
        x = self.fc1(x)
        return x

In [19]:
base_chan = 32
n_channels = [base_chan, base_chan * 2, base_chan * 4]

#model = Network()
model = Shakeshake(n_channels, n_classes)
model = model.to(device)

loss_func = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4, nesterov=True)

# Decay LR by a factor of 0.1 every 7 epochs
#scheduler = lr_scheduler.StepLR(optimizer, step_size=9, gamma=0.33)

T_max = n_epoch * train_dataset_size
scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max, eta_min=0, last_epoch=-1)

global_step = 0

In [20]:
## auxilliary functions

def get_config():
    config = OrderedDict({'name':time_run,
                          'nepoch':n_epoch,
                          'base_chan':base_chan,
                          'loss_func':str(loss_func),
                          'optimizer':str(optimizer),
                          'scheduler':str(scheduler.__dict__),
                          'model':str(model),
                          'train_loader':str(train_loader.__dict__), 
                          'val_loader':str(val_loader.__dict__)})
    return config

In [21]:
def train(epoch, model, loss_func, train_loader, optimizer, scheduler, writer):
    global global_step
    
    since = time.time()

    model.train()
    
    running_loss = 0.0            
    running_correct = 0

    for i, data in enumerate(train_loader):
        global_step +=1
        inputs, labels = data["image"], data["label"]

#        if i == 0:
#            image = torchvision.utils.make_grid(
#                inputs, normalize=True, scale_each=True)
#            writer.add_image('Train/Image', image, epoch)
        
        scheduler.step()        
        
        writer.add_scalar('Train/LearningRate',
                            scheduler.get_lr()[0], global_step)
        
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        
        outputs = model(inputs)
        loss = loss_func(outputs, labels)
        _, predicted = torch.max(outputs.data, 1)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        running_correct += np.double(torch.sum(predicted == labels.data))
        
        writer.add_scalar('Train/RunningLoss', running_loss , global_step)
        writer.add_scalar('Train/RunningAccuracy', running_correct , global_step)
        
    train_loss = running_loss / train_dataset_size
    train_acc = running_correct / train_dataset_size

    elapsed = time.time() - since
    
    writer.add_scalar('Train/Loss', train_loss, epoch)
    writer.add_scalar('Train/Accuracy', train_acc, epoch)
    writer.add_scalar('Train/Time', elapsed, epoch)

    train_log = OrderedDict({
    'epoch':
    epoch,
    'train':
    OrderedDict({
        'loss': train_loss,
        'accuracy': train_acc,
        'time': elapsed,
        }),
    })
      
    print('Train Loss:{:.6f} Accuracy:{:.4f} \nTime:{:.1f}s'.format(train_loss, train_acc, elapsed))        
        
    return train_log, train_acc

def val(epoch, model, loss_func, val_loader, writer):
    global global_step
    since = time.time()
    
    model.eval()
    
    running_loss = 0.0            
    running_correct = 0
    
    with torch.no_grad():
        for i, data in enumerate(val_loader):
            global_step +=1
            inputs, labels = data["image"], data["label"]
            
#            if i == 0:
#                image = torchvision.utils.make_grid(
#                    inputs, normalize=True, scale_each=True)
#                writer.add_image('Test/Image', image, epoch)
            
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            loss = loss_func(outputs, labels)
            _, predicted = torch.max(outputs.data, 1)

            running_loss += loss.item()
            running_correct += np.double(torch.sum(predicted == labels.data))
    val_loss = running_loss / val_dataset_size
    val_acc = running_correct / val_dataset_size

    elapsed = time.time() - since

    if epoch > 0:
        writer.add_scalar('Val/Loss', val_loss, epoch)
    writer.add_scalar('Val/Accuracy', val_acc, epoch)
    writer.add_scalar('Val/Time', elapsed, epoch)    
    
    val_log = OrderedDict({
    'epoch':
    epoch,
    'val':
    OrderedDict({
        'loss': val_loss,
        'accuracy':val_acc,
        'time': elapsed,
        }),
    })
    
    print('Val Loss:{:.6f} Accuracy:{:.4f} \nTime:{:.1f}s'.format(val_loss, val_acc, elapsed))  
    
    return val_log, val_acc



def main():
    print(time_run)
    
    config = get_config()
    with open(save_dir+'/{}_config.json'.format(time_run), 'w') as fout:
        json.dump(config, fout, indent=2)
        
#    summary(model, (3, 32, 32))
#    print('no. epoch:', nepoch, "batch size:", BATCH_SIZE, "optimizer:", optimizer)
    train_since = time.time()
    
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

######################## This part function not verified ###################    
    # set random seed
    seed = 17 # arbitrary
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
###########################################################################    
    writer = SummaryWriter(save_dir)
    
    # run test before start training? whats that??????????
    val(0, model, loss_func, val_loader, writer)
    
    epoch_logs = []
    train_accs = []
    val_accs = []
    
    for epoch in range(n_epoch):
        print('Epoch {}/{}'.format(epoch, n_epoch - 1))
        print('-' * 10)
        
        train_log, train_acc = train(epoch, model, loss_func, train_loader, optimizer, scheduler, writer)
        val_log, val_acc = val(epoch, model, loss_func, val_loader, writer)
        
        if val_acc > best_acc:
            best_acc = val_acc
            best_model_wts = copy.deepcopy(model.state_dict())
        

        print()
        train_accs.append(train_acc)
        val_accs.append(val_acc)
        epoch_log = train_log.copy()
        epoch_log.update(val_log)
        epoch_logs.append(epoch_log)
        with open(save_dir+'/{}_log.json'.format(time_run), 'w') as fout:
            json.dump(epoch_logs, fout, indent=2)
        
        if epoch % 50 == 0 and epoch !=0:
            torch.save(model.state_dict(), save_dir+'/{}_ep{}_acc{:.4f}.pth'.format(time_run, epoch, val_acc))
    
    accuracies = {'train_accuracy':np.array(train_accs), 'val_accuracy':np.array(val_accs)}
    np.save(save_dir+'/{}_accs.npy'.format(time_run), accuracies)
    
    train_elapsed = time.time() - train_since
    print('Training complete in {:.0f}m {:.0f}s'.format(train_elapsed // 60, train_elapsed % 60))
    print('Best val Acc: {:.4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    torch.save(model.state_dict(), save_dir+'/{}_bestacc{:.4f}.pth'.format(time_run, best_acc))

In [22]:
if __name__ == '__main__':
    main()

20191228_215408
Val Loss:0.604396 Accuracy:0.0149 
Time:15.3s
Epoch 0/9
----------
Train Loss:0.297156 Accuracy:0.0555 
Time:75.3s
Val Loss:0.218177 Accuracy:0.0583 
Time:15.3s

Epoch 1/9
----------
Train Loss:0.219833 Accuracy:0.0582 
Time:75.9s
Val Loss:0.215056 Accuracy:0.0740 
Time:15.5s

Epoch 2/9
----------
Train Loss:0.216995 Accuracy:0.0763 
Time:75.1s
Val Loss:0.211241 Accuracy:0.0852 
Time:15.3s

Epoch 3/9
----------
Train Loss:0.215322 Accuracy:0.0776 
Time:75.7s
Val Loss:0.208127 Accuracy:0.0897 
Time:15.3s

Epoch 4/9
----------
Train Loss:0.213207 Accuracy:0.0836 
Time:75.5s
Val Loss:0.209501 Accuracy:0.0919 
Time:15.3s

Epoch 5/9
----------
Train Loss:0.212324 Accuracy:0.0869 
Time:75.5s
Val Loss:0.204070 Accuracy:0.1181 
Time:15.3s

Epoch 6/9
----------
Train Loss:0.211274 Accuracy:0.0883 
Time:75.3s
Val Loss:0.202441 Accuracy:0.1151 
Time:15.3s

Epoch 7/9
----------
Train Loss:0.211023 Accuracy:0.0864 
Time:75.3s
Val Loss:0.202221 Accuracy:0.1076 
Time:15.3s

Epoch 8/9
