In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/contradictory-my-dear-watson/sample_submission.csv
/kaggle/input/contradictory-my-dear-watson/train.csv
/kaggle/input/contradictory-my-dear-watson/test.csv


In [2]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


## Downloading Data

In [3]:
train = pd.read_csv("../input/contradictory-my-dear-watson/train.csv")

In [4]:
labels, frequencies = np.unique(train.language.values, return_counts = True)

In [5]:
device = torch.device("cuda:0")



In [6]:
premises = list(train['premise'])
hypotheses = list(train['hypothesis'])
labels = list(train['label'])

In [7]:
class Model(nn.Module):

    def __init__(self, lmbda=1e-7):
        super(Model, self).__init__()
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased').to(device)
        self.net = nn.Linear(768, 3).to(device)
        self.sm = nn.Softmax(dim=1)
        
        self.optim = optim.SGD(self.net.parameters(), lr=0.01, momentum=0.9)
        
        self.lmbda = lmbda
        
        self.alphas = []
        self.ws = []
        self.zs = []
        self.dz_dalphas = []
        self.bert_orig = []
        for p in self.bert.parameters():
            self.alphas.append(torch.zeros_like(p.data, requires_grad=True) + 5)
            self.alphas[-1].retain_grad()
            self.ws.append(torch.zeros_like(p.data))
            self.zs.append(torch.zeros_like(p.data))
            self.dz_dalphas.append(torch.zeros_like(p.data))
            self.bert_orig.append(torch.zeros_like(p.data).copy_(p))
        
    def z_func(self, alpha, l=-1.5, r=1.5):
        u = torch.zeros_like(alpha).uniform_().clamp_(0.0001, 0.9999)
        s = (torch.sigmoid(u.log() - (1 - u).log() + alpha)).detach()
        u = s * (r - l) + l
        t = u.clamp(0, 1000)
        z = t.clamp(-1000, 1)
        dz_dt = (t < 1).float().to(alpha.device).detach()
        dt_du = (u > 0).float().to(alpha.device).detach()
        du_ds = r - l
        ds_dalpha = (s * (1 - s)).detach()
        dz_dalpha = dz_dt * dt_du * du_ds * ds_dalpha
        return z.detach(), dz_dalpha.detach()
        
    def step(self, lr=0.01):
        bert_params = list(self.bert.parameters())
        for i in range(len(bert_params)):
            grad = bert_params[i].grad
            grad_w = self.zs[i] * grad
            grad_alpha = self.ws[i] * self.dz_dalphas[i] * grad
            self.ws[i] -= lr * grad_w
            self.alphas[i] -= lr * (grad_alpha + self.lmbda * self.alphas[i].grad) 
            z, dz_da = self.z_func(self.alphas[i])
            self.zs[i] = z
            self.dz_dalphas[i] = dz_da
            bert_params[i].data = bert_params[i].data.copy_(self.bert_orig[i]) + z * self.ws[i]
            
            bert_params[i].grad.detach_()
            bert_params[i].grad.zero_()
            self.alphas[i].grad.detach_()
            self.alphas[i].grad.zero_()
            
            self.optim.step()
            self.optim.zero_grad()
    
            
    
    def apply(self, premises, hypotheses):
        tokenized = [self.tokenizer.encode(prem, hyp, add_special_tokens=True) for prem, hyp in zip(premises, hypotheses)]
        max_len = 0
        for i in tokenized:
            if len(i) > max_len:
                max_len = len(i)
        padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized])
        attention_mask = np.where(padded != 0, 1, 0)

        input_ids = torch.tensor(padded).to(device)
        attention_mask = torch.tensor(attention_mask).to(device)
#         print(torch.cuda.memory_stats(device)['allocated_bytes.all.current']/1024/1024/1024)
        
        last_hidden_states = self.bert(input_ids, attention_mask=attention_mask)
        
#         print(torch.cuda.memory_stats(device)['allocated_bytes.all.current']/1024/1024/1024)
        hidden = last_hidden_states[0][:,0,:]
        return self.sm(self.net(hidden))
    
    
    def alpha_reg(self):
        for a in self.alphas:
            torch.sigmoid(a).sum().backward()

In [8]:
model = Model()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




In [9]:
loss_f = nn.CrossEntropyLoss()
batch_size = 64

for epoch in range(10):
    means = []
    for i in range(len(premises) // batch_size + 1):
        output = model.apply(premises[i * batch_size: (i+1) * batch_size], hypotheses[i * batch_size: (i+1) * batch_size])
        y = torch.Tensor(labels[i * batch_size: (i+1) * batch_size]).type(torch.LongTensor).to(device)
        loss = loss_f(output, y)
        means.append(torch.mean((torch.argmax(output, dim=1)==y).type(torch.FloatTensor)))
        loss.backward()
        model.alpha_reg()
        model.step()
        if(i%10 == 0):
            print(epoch, i, ": ", np.mean(means[-100:]))

0 0 :  0.15625
0 10 :  0.328125
0 20 :  0.33035713
0 30 :  0.3266129
0 40 :  0.3285061
0 50 :  0.33180147
0 60 :  0.33478484
0 70 :  0.33758804
0 80 :  0.33738425
0 90 :  0.3372253
0 100 :  0.34046876
0 110 :  0.3421875
0 120 :  0.34234375
0 130 :  0.345625
0 140 :  0.34515625
0 150 :  0.34609374
0 160 :  0.3446875
0 170 :  0.3459375
0 180 :  0.34875
1 0 :  0.3125
1 10 :  0.36789772
1 20 :  0.3705357
1 30 :  0.3719758
1 40 :  0.37271342
1 50 :  0.37683824
1 60 :  0.38755122
1 70 :  0.3912852
1 80 :  0.3940972
1 90 :  0.4002404
1 100 :  0.40421876
1 110 :  0.41
1 120 :  0.41546875
1 130 :  0.42171875
1 140 :  0.43046874
1 150 :  0.4359375
1 160 :  0.43390626
1 170 :  0.43859375
1 180 :  0.443125
2 0 :  0.375
2 10 :  0.44460228
2 20 :  0.43973213
2 30 :  0.45060483
2 40 :  0.45541158
2 50 :  0.4574142
2 60 :  0.46849385
2 70 :  0.4740317
2 80 :  0.47878087
2 90 :  0.48094094
2 100 :  0.483125
2 110 :  0.48875
2 120 :  0.494375
2 130 :  0.498125
2 140 :  0.5053125
2 150 :  0.5101563
2 160

In [10]:
test = pd.read_csv("../input/contradictory-my-dear-watson/test.csv")

In [11]:
result = []

In [12]:
for i in range(len(test.premise.values) // batch_size + 1):
    print(i, '/', len(test.premise.values) // batch_size)
    with torch.no_grad():
        output = model.apply(test.premise.values[i * batch_size: (i+1) * batch_size], test.hypothesis.values[i * batch_size: (i+1) * batch_size])
    result += list((torch.argmax(output, dim=1)).cpu().numpy())

0 / 81
1 / 81
2 / 81
3 / 81
4 / 81
5 / 81
6 / 81
7 / 81
8 / 81
9 / 81
10 / 81
11 / 81
12 / 81
13 / 81
14 / 81
15 / 81
16 / 81
17 / 81
18 / 81
19 / 81
20 / 81
21 / 81
22 / 81
23 / 81
24 / 81
25 / 81
26 / 81
27 / 81
28 / 81
29 / 81
30 / 81
31 / 81
32 / 81
33 / 81
34 / 81
35 / 81
36 / 81
37 / 81
38 / 81
39 / 81
40 / 81
41 / 81
42 / 81
43 / 81
44 / 81
45 / 81
46 / 81
47 / 81
48 / 81
49 / 81
50 / 81
51 / 81
52 / 81
53 / 81
54 / 81
55 / 81
56 / 81
57 / 81
58 / 81
59 / 81
60 / 81
61 / 81
62 / 81
63 / 81
64 / 81
65 / 81
66 / 81
67 / 81
68 / 81
69 / 81
70 / 81
71 / 81
72 / 81
73 / 81
74 / 81
75 / 81
76 / 81
77 / 81
78 / 81
79 / 81
80 / 81
81 / 81


In [13]:
submission = test.id.copy().to_frame()
submission['prediction'] = result

In [14]:
submission.head()

Unnamed: 0,id,prediction
0,c6d58c3f69,1
1,cefcc82292,1
2,e98005252c,0
3,58518c10ba,1
4,c32b0d16df,2


In [15]:
submission.to_csv("submission.csv", index = False)