In [2]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler

!pip install transformers
!pip install wget
!pip install jsonlines

import transformers
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import wget
import json
import jsonlines
import os

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/48/35/ad2c5b1b8f99feaaf9d7cdadaeef261f098c6e1a6a2935d4d07662a6b780/transformers-2.11.0-py3-none-any.whl (674kB)
[K     |████████████████████████████████| 675kB 312kB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 1.7MB/s 
Collecting tokenizers==0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/14/e5/a26eb4716523808bb0a799fcfdceb6ebf77a18169d9591b2f46a9adb87d9/tokenizers-0.7.0-cp36-cp36m-manylinux1_x86_64.whl (3.8MB)
[K     |████████████████████████████████| 3.8MB 5.9MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |█████

Using TensorFlow backend.


In [6]:
#########   Setting GPU device   #########

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print(device)


cuda


In [3]:
##########   Download SNLI dataset   ############

snli_url = "https://nlp.stanford.edu/projects/snli/snli_1.0.zip" 
if not os.path.exists('./snli_1.0.zip'):
    wget.download(snli_url, './snli_1.0.zip')

# Unzip the dataset
if not os.path.exists('./snli_1.0/'):
    !unzip snli_1.0.zip

def processed_snli_data(path):
    hyp_prem = []
    labels = []
    with jsonlines.open(path, "r") as f:
        for line in f.iter():
            json_string = json.dumps(line)
            ex = json.loads(json_string)
            if ex['gold_label'] != "-":
                hyp_prem.append(ex['sentence1'] + " " + ex['sentence2'])
                labels.append(ex['gold_label'])

    # hyp_prem = hyp_prem[:160000]
    # labels = labels[:160000]

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                            do_lower_case=True)
    
    print(hyp_prem[0])
    print(tokenizer.tokenize(hyp_prem[0]))

    sent = []
    for sentence in hyp_prem:
        s = tokenizer.encode(sentence, add_special_tokens=True)
        sent.append(s)

    print(sent[0])
    return sent, labels

def snli_dataloader(sentence, lab, turn):
    max_len = 112
    labels = []
    for y in lab:
        if y == 'neutral':
            labels.append(0)
        elif y == 'entailment':
            labels.append(1)
        elif y == 'contradiction':
            labels.append(2)

    sentence = pad_sequences(sentence, maxlen=max_len, dtype="long",
                        value=0, truncating="post", padding="post")
    print(sentence[0])
    
    mask = []
    for s in sentence:
        m = [int(word_id > 0) for word_id in s]
        mask.append(m)
    print(mask[2])  # contains 1 for original words, 0 for padded words
    test_size = 0.1 if turn == "train" else 0.01

    train_set, test_set, train_labels, test_labels = train_test_split(
        sentence, labels, random_state=2018, test_size=test_size
    )
    
    train_mask, test_mask, _, _ = train_test_split(mask, labels, 
                                                random_state=2018, 
                                                test_size=test_size)

    print("train set size",len(train_set), len(train_labels))  # 0.6 times original num of sentences
    print("test set size", len(test_set))   # 0.4 times original num of sentences
    print()

    ########   convert NumPy arrays to Tensor data  ########
    print(train_set.shape)
    print(train_labels[:10])
    train_labels = np.array(train_labels)
    test_labels = np.array(test_labels)

    train_set = torch.tensor(train_set)
    test_set = torch.tensor(test_set)
    train_labels = torch.tensor(train_labels)
    test_labels = torch.tensor(test_labels)
    train_mask = torch.tensor(train_mask)
    test_mask = torch.tensor(test_mask)

    print("train data shape", train_set.shape)
    print("test data shape", train_labels.shape)
    print("train mask shape", train_mask.shape)

    #######   Dataloader to load train and test data in batches  ########

    batch_size = 32  # Recommended in paper
    train = TensorDataset(train_set, train_mask, train_labels)
    sampler = RandomSampler(train)
    train_loader = DataLoader(train, batch_size=batch_size, sampler=sampler)

    test = TensorDataset(test_set, test_mask, test_labels)
    samp = RandomSampler(test)
    test_loader = DataLoader(test, batch_size=batch_size, sampler=samp)

    return train_loader, test_loader


Archive:  snli_1.0.zip
   creating: snli_1.0/
  inflating: snli_1.0/.DS_Store      
   creating: __MACOSX/
   creating: __MACOSX/snli_1.0/
  inflating: __MACOSX/snli_1.0/._.DS_Store  
 extracting: snli_1.0/Icon           
  inflating: __MACOSX/snli_1.0/._Icon  
  inflating: snli_1.0/README.txt     
  inflating: __MACOSX/snli_1.0/._README.txt  
  inflating: snli_1.0/snli_1.0_dev.jsonl  
  inflating: snli_1.0/snli_1.0_dev.txt  
  inflating: snli_1.0/snli_1.0_test.jsonl  
  inflating: snli_1.0/snli_1.0_test.txt  
  inflating: snli_1.0/snli_1.0_train.jsonl  
  inflating: snli_1.0/snli_1.0_train.txt  
  inflating: __MACOSX/._snli_1.0     


In [4]:
train_sent, labels = processed_snli_data("./snli_1.0/snli_1.0_train.jsonl")
print(len(train_sent))
print('Max sentence length: ', max([len(sen) for sen in train_sent]))
snli_trainloader, _ = snli_dataloader(train_sent, labels, "train")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…


A person on a horse jumps over a broken down airplane. A person is training his horse for a competition.
['a', 'person', 'on', 'a', 'horse', 'jumps', 'over', 'a', 'broken', 'down', 'airplane', '.', 'a', 'person', 'is', 'training', 'his', 'horse', 'for', 'a', 'competition', '.']
[101, 1037, 2711, 2006, 1037, 3586, 14523, 2058, 1037, 3714, 2091, 13297, 1012, 1037, 2711, 2003, 2731, 2010, 3586, 2005, 1037, 2971, 1012, 102]
549367
Max sentence length:  124
[  101  1037  2711  2006  1037  3586 14523  2058  1037  3714  2091 13297
  1012  1037  2711  2003  2731  2010  3586  2005  1037  2971  1012   102
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0 

In [7]:
#######   define the model   ###########
bert_model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    output_attentions = False,
    output_hidden_states = False,
)


bert_model = bert_model.to(device)


In [8]:
########  define optimizer, write accuracy fn  ###########

optimizer = AdamW(bert_model.parameters(), lr=2e-5, eps=1e-8)
num_epochs = 4
loss_arr = []

def compute_accuracy(preds, targets):
    return (torch.argmax(preds, dim=1) == targets).float().mean().item()
    

In [11]:
###############   TRAINING OF BERT MODEL  ##################

for ep in range(num_epochs):
    bert_model.train()
    epoch_loss = 0

    for i, data in enumerate(snli_trainloader):
        batch_data = data[0].to(device).long()
        batch_mask = data[1].to(device).long()
        batch_labels = data[2].to(device).long()

        bert_model.zero_grad()
        pred = bert_model(batch_data,
                             token_type_ids=None,
                             attention_mask=batch_mask,
                             labels=batch_labels)
        
        # As we call the model with labels, it returns the loss in a tuple
        loss = pred[0]  
        epoch_loss += loss.item()
        loss.backward()  # Backprpagation

        # Clip Gradient norm to mitigate exploding of gradients
        torch.nn.utils.clip_grad_norm_(bert_model.parameters(), 1.0)
        optimizer.step()

    epoch_loss /= len(snli_trainloader)
    print("train loss after %d epochs is %f " %(ep+1, epoch_loss))
    loss_arr.append(epoch_loss)



In [12]:
##############  Load test data  ####################

test_sent, labels = processed_snli_data("./snli_1.0/snli_1.0_test.jsonl")
print(len(test_sent))
print('Max sentence length: ', max([len(sen) for sen in test_sent]))
snli_testloader, _ = snli_dataloader(test_sent, labels, "test")

print(len(snli_testloader))
test_acc = 0.0
steps = 0


This church choir sings to the masses as they sing joyous songs from the book at a church. The church has cracks in the ceiling.
['this', 'church', 'choir', 'sings', 'to', 'the', 'masses', 'as', 'they', 'sing', 'joy', '##ous', 'songs', 'from', 'the', 'book', 'at', 'a', 'church', '.', 'the', 'church', 'has', 'cracks', 'in', 'the', 'ceiling', '.']
[101, 2023, 2277, 6596, 10955, 2000, 1996, 11678, 2004, 2027, 6170, 6569, 3560, 2774, 2013, 1996, 2338, 2012, 1037, 2277, 1012, 1996, 2277, 2038, 15288, 1999, 1996, 5894, 1012, 102]
9824
Max sentence length:  75
[  101  2023  2277  6596 10955  2000  1996 11678  2004  2027  6170  6569
  3560  2774  2013  1996  2338  2012  1037  2277  1012  1996  2277  2038
 15288  1999  1996  5894  1012   102     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
  

In [None]:
#########   Performance over Test set  ##############

bert_model.eval()
for batch in snli_testloader:
    batch = tuple(t.to(device) for t in batch)
    batch_data, batch_mask, batch_labels = batch

    with torch.no_grad():
        preds = bert_model(batch_data,
                                token_type_ids=None,
                                attention_mask=batch_mask)

    logits = preds[0]
    logits = logits.detach().cpu()
    targets = batch_labels.to('cpu')
    
    acc = compute_accuracy(logits, targets)
    test_acc += acc
    steps += 1

print("final test set accuracy is ", (test_acc / steps))
