In [1]:
import torch
import random
import torchvision
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import argparse,os,time
import os
import time
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
num_gpus=4

In [2]:
data = pd.read_csv('train_augmented.csv', index_col=0)
val_data=pd.read_csv('val_augmented.csv', index_col=0)

In [3]:
y_data=data["digit"].values
y_data_val=val_data["digit"].values

In [4]:
y_data.shape

(81900,)

In [5]:
x_data=data.loc[:,"0":"783"].values
x_data_val=val_data.loc[:,"0":"783"].values

In [6]:
x_data_train=x_data
x_data_train.shape

(81900, 784)

In [7]:
x_data_test=x_data_val
x_data_test.shape

(20500, 784)

In [8]:
x_data_train=x_data_train/x_data_train.max()
x_data_test=x_data_test/x_data_test.max()
x_data_train.shape

(81900, 784)

In [9]:
class CustomDataset(Dataset):
    def __init__(self,x_dat,y_dat):
        x = x_dat
        y = y_dat
        self.len = x.shape[0]
        y=y.astype('int')
        x=x.astype('float32')
        self.x_data = torch.tensor(x)
        self.y_data = torch.tensor(y)

    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        return self.len

In [10]:
batch_size=256
train_dataset = CustomDataset(x_data_train,y_data)
train_loader = DataLoader(dataset=train_dataset,pin_memory=True,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=60,drop_last=True)
test_dataset = CustomDataset(x_data_test,y_data_val)
test_loader = DataLoader(dataset=test_dataset,pin_memory=True,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=60,drop_last=True)

In [11]:
# https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py

In [12]:
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=dilation, groups=groups, bias=False, dilation=dilation)


def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)

In [13]:
class Bottleneck(nn.Module):

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super(Bottleneck, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        width = int(planes * (base_width / 64.)) * groups
        # Both self.conv2 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv1x1(inplanes, width)
        self.bn1 = norm_layer(width)
        self.conv2 = conv3x3(width, width, stride, groups, dilation)
        self.bn2 = norm_layer(width)
        self.conv3 = conv1x1(width, planes * self.expansion)
        self.bn3 = norm_layer(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out


In [14]:
class ResNet(nn.Module):

    def __init__(self, block, layers, num_classes=10, zero_init_residual=False,
                 groups=1, width_per_group=64, replace_stride_with_dilation=None,
                 norm_layer=None):
        super(ResNet, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer

        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation is None:
            # each element in the tuple indicates if we should replace
            # the 2x2 stride with a dilated convolution instead
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group
        self.conv1 = nn.Conv2d(1, self.inplanes, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
                                       dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
                                       dilate=replace_stride_with_dilation[1])
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
                                       dilate=replace_stride_with_dilation[2])
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck):
                    nn.init.constant_(m.bn3.weight, 0)
                elif isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        norm_layer = self._norm_layer
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                norm_layer(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
                            self.base_width, previous_dilation, norm_layer))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes, groups=self.groups,
                                base_width=self.base_width, dilation=self.dilation,
                                norm_layer=norm_layer))

        return nn.Sequential(*layers)

    def _forward_impl(self, x):
        # See note [TorchScript super()]
        x = x.view(batch_size,1,28,28)
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = x.view(batch_size,-1)
        x = self.fc(x)

        return x

    def forward(self, x):
        return self._forward_impl(x)

In [15]:
model = ResNet(block=Bottleneck, layers=[3, 8, 36, 3])
model=model.cuda()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),weight_decay=0.001)

In [16]:
trn_loss_list = []
val_loss_list = []
total_epoch=200
model_char="res152"
model_name=""
patience=5
start_early_stop_check=0
saving_start_epoch=10

for epoch in range(total_epoch):
    trn_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data
        if torch.cuda.is_available():
            inputs=inputs.cuda()
            labels=labels.cuda()
        # grad init
        optimizer.zero_grad()
        # forward propagation
        output= model(inputs)
        # calculate loss
        loss=criterion(output, labels)
        # back propagation 
        loss.backward()
        # weight update
        optimizer.step()
        
        # trn_loss summary
        trn_loss += loss.item()
        # del (memory issue)
        del loss
        del output
    with torch.no_grad():
        val_loss = 0.0
        mis_match = 0
        for j, val in enumerate(test_loader):
            val_x, val_label = val
            if torch.cuda.is_available():
                val_x = val_x.cuda()
                val_label =val_label.cuda()
            val_output = model(val_x)
            v_loss = criterion(val_output, val_label)
            val_loss += v_loss
            _, predicted=torch.max(val_output,1)
            mis_match+=np.count_nonzero(predicted.cpu().detach()==val_label.cpu().detach())
    del val_output
    del v_loss
    del predicted
    
    
    
    trn_loss_list.append(trn_loss/len(train_loader))
    val_loss_list.append(val_loss/len(test_loader))
    val_acc=mis_match/(len(test_loader)*batch_size)
    now = time.localtime()
    print ("%04d/%02d/%02d %02d:%02d:%02d" % (now.tm_year, now.tm_mon, now.tm_mday, now.tm_hour, now.tm_min, now.tm_sec))

    print("epoch: {}/{} | trn loss: {:.4f} | val loss: {:.4f} | val accuracy: {:.4f}% \n".format(
                epoch+1, total_epoch, trn_loss / len(train_loader), val_loss / len(test_loader), val_acc*100
            ))
    
    
    if epoch+1>2:
        if val_loss_list[-1]>val_loss_list[-2]:
            start_early_stop_check=1
    else:
        val_loss_min=val_loss_list[-1]
        
    if start_early_stop_check:
        early_stop_temp=val_loss_list[-patience:]
        if all(early_stop_temp[i]<early_stop_temp[i+1] for i in range (len(early_stop_temp)-1)):
            print("Early stop!")
            break
            
    if epoch+1>saving_start_epoch:
        if val_loss_list[-1]<val_loss_min:
            if os.path.isfile(model_name):
                os.remove(model_name)
            val_loss_min=val_loss_list[-1]
            model_name="RESNET_"+model_char+"_{:.3f}".format(val_loss_min)
            torch.save(model, model_name)
            print("Model replaced and saved as ",model_name)

2020/09/24 10:20:39
epoch: 1/200 | trn loss: 2.0826 | val loss: 1.8636 | val accuracy: 36.1426% 

2020/09/24 10:21:46
epoch: 2/200 | trn loss: 1.6363 | val loss: 1.4597 | val accuracy: 48.7256% 

2020/09/24 10:22:54
epoch: 3/200 | trn loss: 1.2699 | val loss: 1.2570 | val accuracy: 56.9678% 

2020/09/24 10:24:01
epoch: 4/200 | trn loss: 0.9399 | val loss: 1.0427 | val accuracy: 64.8535% 

2020/09/24 10:25:07
epoch: 5/200 | trn loss: 0.6664 | val loss: 0.9707 | val accuracy: 68.5352% 

2020/09/24 10:26:14
epoch: 6/200 | trn loss: 0.4955 | val loss: 0.8761 | val accuracy: 72.3682% 

2020/09/24 10:27:21
epoch: 7/200 | trn loss: 0.3906 | val loss: 0.8126 | val accuracy: 76.4941% 

2020/09/24 10:28:27
epoch: 8/200 | trn loss: 0.3222 | val loss: 0.7649 | val accuracy: 77.9346% 

2020/09/24 10:29:33
epoch: 9/200 | trn loss: 0.2801 | val loss: 0.6995 | val accuracy: 79.4238% 

2020/09/24 10:30:39
epoch: 10/200 | trn loss: 0.2552 | val loss: 0.7444 | val accuracy: 79.2529% 

2020/09/24 10:31:46

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


Model replaced and saved as  RESNET_res152_0.710
2020/09/24 10:32:52
epoch: 12/200 | trn loss: 0.2245 | val loss: 0.6579 | val accuracy: 82.0410% 

Model replaced and saved as  RESNET_res152_0.658
2020/09/24 10:33:58
epoch: 13/200 | trn loss: 0.2094 | val loss: 0.6284 | val accuracy: 82.9785% 

Model replaced and saved as  RESNET_res152_0.628
2020/09/24 10:35:05
epoch: 14/200 | trn loss: 0.2025 | val loss: 0.6849 | val accuracy: 82.0508% 

2020/09/24 10:36:11
epoch: 15/200 | trn loss: 0.1918 | val loss: 0.5923 | val accuracy: 84.2578% 

Model replaced and saved as  RESNET_res152_0.592
2020/09/24 10:37:17
epoch: 16/200 | trn loss: 0.1847 | val loss: 0.5622 | val accuracy: 84.5508% 

Model replaced and saved as  RESNET_res152_0.562
2020/09/24 10:38:23
epoch: 17/200 | trn loss: 0.1728 | val loss: 0.6531 | val accuracy: 83.3057% 

2020/09/24 10:39:29
epoch: 18/200 | trn loss: 0.1647 | val loss: 0.6235 | val accuracy: 84.2285% 

2020/09/24 10:40:35
epoch: 19/200 | trn loss: 0.1712 | val los

In [17]:
fin_name="RESNET_fin"
torch.save(model, fin_name)
print("Fin model saved", fin_name)

Fin model saved RESNET_fin
