In [10]:
import torchvision.models as torch_models
import os

from helper import *

In [2]:
# Project root on the training machine; the data locations below are derived from it.
path = Path('/home/jupyter/Kaggle/kaggle_grapheme')
# Directory handed to the image list as its base path.
TRAIN_IMG_FILES = path / 'data'
# Label table that sits in the same data directory.
TRAIN_LABELS = TRAIN_IMG_FILES / 'train.csv'

In [3]:
# Load the training labels CSV (TRAIN_LABELS) into a DataFrame.
df_train = pd.read_csv(TRAIN_LABELS)

In [4]:
# Preview the first rows to check which columns the label table provides.
df_train.head()

Unnamed: 0,image_id,grapheme_root,vowel_diacritic,consonant_diacritic,grapheme
0,Train_0,15,9,5,ক্ট্রো
1,Train_1,159,0,0,হ
2,Train_2,22,3,5,খ্রী
3,Train_3,53,2,2,র্টি
4,Train_4,71,9,5,থ্রো


In [6]:
# Distinct values per column. The counts for grapheme_root, vowel_diacritic and
# consonant_diacritic are the class counts of the three targets used as labels below.
df_train.nunique()

image_id               200840
grapheme_root             168
vowel_diacritic            11
consonant_diacritic         7
grapheme                 1295
dtype: int64

In [7]:
# Augmentation setup: flipping is off and the warp / lighting magnitudes are zeroed;
# every other get_transforms option keeps its library default.
tfms = get_transforms(
    do_flip=False,
    max_warp=0.,
    max_lighting=0.,
    p_lighting=0.,
)
# Single-channel (mean, std) pair handed to .normalize() when the databunch is built.
stats = ([0.0692], [0.2051])
# Mini-batch size.
bs = 64

In [13]:
# Build the databunch: single-channel images named by the first column of df_train,
# a seeded random train/validation split, and the three label columns as targets.
data = (ImageList
        # cols=0 -> first column of df_train supplies the file stem; '.png' is appended
        # and convert_mode='L' is forwarded for loading the images.
        .from_df(df_train,path=TRAIN_IMG_FILES,cols=0,convert_mode='L',suffix='.png')
        # Fixed seed keeps the validation split identical between runs.
        .split_by_rand_pct(seed=42)
        .label_from_df(cols=['grapheme_root','vowel_diacritic','consonant_diacritic'])
        .transform(tfms,size=(128,128),padding_mode='zeros')
        # Use the configured batch size rather than a second hard-coded 64, and guard
        # against os.cpu_count() returning None (it may when the count is undetermined).
        .databunch(bs=bs,num_workers=(os.cpu_count() or 1)*4)
        .normalize(stats)
       )

In [11]:
#data.show_batch(rows=3,figsize=(12,12))

In [14]:
#Mish - "Mish: A Self Regularized Non-Monotonic Neural Activation Function"
#https://arxiv.org/abs/1908.08681v1
#implemented for PyTorch / FastAI by lessw2020 
#github: https://github.com/lessw2020/mish
class Mish(nn.Module):
    """Mish activation, computed element-wise as ``x * tanh(softplus(x))``.

    The module holds no parameters or state, so the default ``nn.Module``
    constructor is sufficient.
    """

    def forward(self, x):
        # Deliberately a single expression: the original author measured roughly
        # 1 second per epoch saved (V100 GPU) compared with going through a temporary.
        return x.mul(F.softplus(x).tanh())

# Model 

1. Change the input conv_2d to take 1 channel grey scale image
2. Transfer learning: copy the first conv_2d weights into the new conv_2d, aggregating (summing) the 3 channel weights as suggested in Stack Overflow / PyTorch forum posts. (Note: dividing by 3, i.e. '(R+G+B)/3', should not have an effect if you think about back-propagation, since it is only a constant factor? However, I am not sure about the math.)
3. Multi - head: 

1. Inspired by APTOS and RetinaNet, it should be possible to produce multiple outputs from the same backbone
2. As suggested by DrHB https://www.kaggle.com/c/bengaliai-cv19/discussion/123432, the additional conv2d layer is to give each head a possibility to update weights before pooling
3. Used AdaptiveConcatpool and standard fastai head structure
4. Used the Mish activation function instead of ReLU

In [15]:
class Model_Head(nn.Module):
    """One classification head that turns a backbone feature map into class scores.

    Layer order: Mish -> conv2d(ni, ni) -> batchnorm_2d -> AdaptiveConcatPool2d ->
    Flatten -> bn_drop_lin(ni*2, 512, Mish) -> bn_drop_lin(512, nc).
    """
    def __init__(self,ni,nc,ps=0.25):
        '''
        ni : number of input filters (channels of the incoming feature map)
        nc : number of output classes
        ps : dropout rate of the hidden linear block; the output block uses ps*2
        '''
        super().__init__()
        # Head-specific activation, conv and batch norm applied before any pooling.
        conv_stage = [Mish(), conv2d(ni, ni), batchnorm_2d(ni)]
        # Pool and flatten; the first linear block below is sized for ni*2 features
        # (presumably because the concat pool stacks two pooled copies - verify in fastai).
        pool_stage = [AdaptiveConcatPool2d(), Flatten()]
        hidden_block = bn_drop_lin(ni*2, 512, p=ps, actn=Mish())
        output_block = bn_drop_lin(512, nc, p=ps*2)
        self.head = nn.Sequential(*conv_stage, *pool_stage, *hidden_block, *output_block)

    def forward(self,xb):
        """Run the batch of feature maps ``xb`` through the head."""
        scores = self.head(xb)
        return scores
    
class Resnet_1ch(nn.Module):
    """Backbone adapted to 1-channel input, followed by three classification heads.

    arch       : model constructor accepting ``pretrained=`` (called as arch(pretrained=...))
    nc         : class counts for the grapheme, vowel and consonant heads, in that order
                 (168, 11, 7 come from the number of unique labels)
    pretrained : forwarded unchanged to ``arch``
    """
    def __init__(self,arch,nc=[168,11,7],pretrained=True):
        super().__init__()
        # Backbone = every child module of the architecture except the last two.
        backbone_children = list(arch(pretrained=pretrained).children())
        self.body = nn.Sequential(*backbone_children[:-2])

        # Swap the first layer for a 1-input-channel conv whose kernel is the sum of
        # the original kernel over its input-channel dimension.
        stem = self.body[0]
        n_filters, _, kernel_h, _ = stem.weight.shape
        summed_weight = stem.weight.sum(dim=1, keepdim=True)
        # NOTE(review): only the kernel size is passed on; if conv2d's default stride /
        # padding differ from the original first layer's, the new stem changes the
        # spatial resolution of the backbone - confirm this is intended.
        gray_stem = conv2d(1, n_filters, ks=kernel_h)
        gray_stem.weight.data = summed_weight
        self.body[0] = gray_stem

        # One head per target, each fed with the backbone's output feature count.
        n_features = num_features_model(self.body)
        self.head_grapheme = Model_Head(n_features, nc[0])
        self.head_vowel = Model_Head(n_features, nc[1])
        self.head_consonant = Model_Head(n_features, nc[2])

    def forward(self,x):
        """Return a 3-tuple of head outputs: (grapheme, vowel, consonant)."""
        features = self.body(x)
        heads = (self.head_grapheme, self.head_vowel, self.head_consonant)
        return tuple(head(features) for head in heads)
    
# replace all relu layer with Mish
def to_mish(model):
    """Recursively replace every ``nn.ReLU`` child of ``model`` with ``Mish``, in place.

    Walks the module tree depth-first: a direct child that is an ``nn.ReLU`` is
    overwritten under the same attribute name, any other child is descended into.
    Returns None.
    """
    for child_name, submodule in model.named_children():
        if not isinstance(submodule, nn.ReLU):
            # Not an activation to swap - look for ReLUs further down this branch.
            to_mish(submodule)
            continue
        setattr(model, child_name, Mish())

In [16]:
class Loss_multi_head(nn.Module):
    """Weighted sum of three cross-entropy losses, one per model head.

    weights : three scalars; weights[i] multiplies the cross-entropy of head i
              against target column i. Defaults to equal weighting.

    Raises ValueError if ``weights`` does not contain exactly three entries.
    """
    def __init__(self,weights=(1,1,1)):
        super().__init__()
        weights = tuple(weights)
        if len(weights) != 3:
            raise ValueError('expected 3 loss weights, got {}'.format(len(weights)))
        # forward() reads the weights from the instance, so they must be kept here;
        # a tuple copy also protects against later mutation of the caller's list.
        self.weights = weights

    def forward(self,preds,target,reduction='mean'):
        """Compute the combined loss.

        preds     : sequence of exactly three logit tensors (one per head)
        target    : tensor indexed as target[:, i] for head i
        reduction : forwarded to F.cross_entropy for every head
        """
        outp_1,outp_2,outp_3 = preds
        # Cast the logits to float32 and the targets to int64 before cross_entropy.
        outp_1,outp_2,outp_3 = outp_1.float(),outp_2.float(),outp_3.float()
        target = target.long()
        w_1,w_2,w_3 = self.weights
        return (
            w_1 * F.cross_entropy(outp_1,target[:,0],reduction=reduction)
            + w_2 * F.cross_entropy(outp_2,target[:,1],reduction=reduction)
            + w_3 * F.cross_entropy(outp_3,target[:,2],reduction=reduction)
               )

In [17]:
# One metric factory per target: Metric_idx with its first positional argument bound
# to 0, 1 and 2 respectively (presumably the head / label-column index - verify in helper).
Metric_grapheme, Metric_vowel, Metric_consonant = (
    partial(Metric_idx, target_idx) for target_idx in range(3)
)

In [20]:
# Build the 1-channel, three-head network on a torchvision resnet34 (pretrained by default).
# NOTE(review): to_mish() is defined above but is not applied to this model in the
# cells shown here - confirm whether the backbone ReLUs are meant to be replaced.
model = Resnet_1ch(torch_models.resnet34)

Downloading: "https://download.pytorch.org/models/resnet34-333f7ec4.pth" to /home/jupyter/.cache/torch/checkpoints/resnet34-333f7ec4.pth
100%|██████████| 83.3M/83.3M [00:02<00:00, 39.6MB/s]


In [18]:
# Intended per-head loss weights, in head order (grapheme root, vowel, consonant):
# the grapheme-root term is meant to count twice as much as each diacritic term.
loss_func = Loss_multi_head([0.5,0.25,0.25])

In [21]:
# Assemble the learner: one metric per head plus the combined Metric_tot, with
# checkpoints stored under <path>/models/resnet.
learn = Learner(
    data,
    model,
    loss_func=loss_func,
    metrics=[Metric_grapheme(), Metric_vowel(), Metric_consonant(), Metric_tot()],
    model_dir=path/'models/resnet',
)
# Switch to mixed-precision training; to_fp16() hands back the learner, so rebinding
# keeps `learn` pointing at the fp16-enabled object.
learn = learn.to_fp16()