In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 4.4 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 76.7 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 59.6 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [2]:
# Importing the libraries needed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import  RobertaModel, RobertaTokenizer
import logging
logging.basicConfig(level=logging.ERROR)

In [3]:
# Setting up the device for GPU usage
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [4]:
!wget https://nlp.stanford.edu/projects/snli/snli_1.0.zip

--2022-12-22 08:29:43--  https://nlp.stanford.edu/projects/snli/snli_1.0.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 94550081 (90M) [application/zip]
Saving to: ‘snli_1.0.zip’


2022-12-22 08:29:46 (35.2 MB/s) - ‘snli_1.0.zip’ saved [94550081/94550081]



In [5]:
from zipfile import ZipFile
# specifying the zip file name
file_name = "snli_1.0.zip"
# opening the zip file in READ mode
with ZipFile(file_name, 'r') as zip:
    # printing all the contents of the zip file
    zip.printdir()
    # extracting all the files
    print('Extracting all the files now...')
    zip.extractall()
    print('Done!')

File Name                                             Modified             Size
snli_1.0/                                      2015-08-29 08:57:10            0
snli_1.0/.DS_Store                             2015-08-29 08:57:16         6148
__MACOSX/                                      2015-08-29 09:00:04            0
__MACOSX/snli_1.0/                             2015-08-29 09:00:04            0
__MACOSX/snli_1.0/._.DS_Store                  2015-08-29 08:57:16          120
snli_1.0/Icon                                 2015-05-21 16:21:08            0
__MACOSX/snli_1.0/._Icon                      2015-05-21 16:21:08       340709
snli_1.0/README.txt                            2015-08-29 08:59:48         5828
__MACOSX/snli_1.0/._README.txt                 2015-08-29 08:59:48          171
snli_1.0/snli_1.0_dev.jsonl                    2015-08-17 10:34:22      9745714
snli_1.0/snli_1.0_dev.txt                      2015-08-17 10:34:24      7565773
snli_1.0/snli_1.0_test.jsonl            

In [6]:
import pandas as pd
#load dataset
train_dataset = pd.read_csv('snli_1.0/snli_1.0_train.txt', sep='\t')
valid_dataset = pd.read_csv('snli_1.0/snli_1.0_dev.txt', sep='\t')
test_dataset = pd.read_csv('snli_1.0/snli_1.0_test.txt', sep='\t')

In [7]:
#Get neccesary columns
# label, premise, hypothesis
df_train = train_dataset[['gold_label','sentence1','sentence2']]
df_dev = valid_dataset[['gold_label','sentence1','sentence2']]
df_test = test_dataset[['gold_label','sentence1','sentence2']]

df_train = df_train[:150000]
print(df_train.shape)
print(df_dev.shape)
print(df_test.shape)

(150000, 3)
(10000, 3)
(10000, 3)


In [8]:
# filtering the rows where label is not valid
df_train = df_train[~df_train['gold_label'].str.contains('-')]
df_dev = df_dev[~df_dev['gold_label'].str.contains('-')]
df_test = df_test[~df_test['gold_label'].str.contains('-')]

print(df_train.shape)
print(df_dev.shape)
print(df_test.shape)

(149823, 3)
(9842, 3)
(9824, 3)


In [9]:
#Check longest string in Phrase
print(df_train.sentence1.str.len().max())
print(df_train.sentence2.str.len().max())

print(df_dev.sentence1.str.len().max())
print(df_dev.sentence2.str.len().max())

print(df_test.sentence1.str.len().max())
print(df_test.sentence2.str.len().max())

402
204.0
300
232
265
159


In [10]:
# remove all strings greater than 64
df_train = df_train[~df_train['sentence1'].str.len().ge(64)]
df_train = df_train[~df_train['sentence2'].str.len().ge(64)]

df_dev = df_dev[~df_dev['sentence1'].str.len().ge(64)]
df_dev = df_dev[~df_dev['sentence2'].str.len().ge(64)]

df_test = df_test[~df_test['sentence1'].str.len().ge(64)]
df_test = df_test[~df_test['sentence2'].str.len().ge(64)]

print(df_train.shape)
print(df_dev.shape)
print(df_test.shape)


(81057, 3)
(4678, 3)
(4639, 3)


In [11]:
# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()
  
# Encode labels in column for gold_label
df_train['gold_label']= label_encoder.fit_transform(df_train['gold_label'])
print(df_train['gold_label'].unique())
# Encode labels in column for gold_label
df_dev['gold_label']= label_encoder.fit_transform(df_dev['gold_label'])
print(df_dev['gold_label'].unique())
# Encode labels in column for gold_label
df_test['gold_label']= label_encoder.fit_transform(df_test['gold_label'])
print(df_test['gold_label'].unique())

[2 0 1]
[1 0 2]
[2 1 0]


In [12]:
# from transformers import GPT2Tokenizer, GPT2Model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [13]:
import string

def trim_sentence(sent):
    try:
        sent = sent.split()
        sent = sent[:64]
        return " ".join(sent)
    except:
        return sent

def NLIData( dataframe, tokenizer, max_len):
    df = pd.DataFrame()

    for index in dataframe.index:

        premise = str(dataframe['sentence1'][index])
        premise = trim_sentence(premise.translate(str.maketrans('', '', string.punctuation))) + " . "
        
        hypothesis = str(dataframe['sentence2'][index])
        hypothesis = trim_sentence(hypothesis.translate(str.maketrans('', '', string.punctuation)))
        label = dataframe['gold_label'][index]

        # tokenize input
        tokenized_input_seq_pair = tokenizer(
            premise,
            hypothesis,
            max_length=max_len,
            pad_to_max_length= True,
            return_token_type_ids=True,
            return_tensors='pt',
            truncation=True,
            )
    

        ids = tokenized_input_seq_pair['input_ids']
        mask = tokenized_input_seq_pair['attention_mask']
        token_type_ids = tokenized_input_seq_pair["token_type_ids"]
        text = premise + hypothesis
      
        df = df.append({
            'text': text, 
            'ids': ids.flatten(),
            'mask': mask.flatten(),
            'tti': token_type_ids.flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }, ignore_index=True)
    return df

In [14]:
#Check longest string in Phrase
print(df_train.sentence1.str.len().max())
print(df_train.sentence2.str.len().max())

print(df_dev.sentence1.str.len().max())
print(df_dev.sentence2.str.len().max())

print(df_test.sentence1.str.len().max())
print(df_test.sentence2.str.len().max())

63
63.0
63
63
63
63


In [15]:
print(df_train[df_train['sentence1'] == ' '].index)
print(df_train[df_train['sentence2'] == ' '].index)

print(df_dev[df_dev['sentence1'] == ' '].index)
print(df_dev[df_dev['sentence2'] == ' '].index)

print(df_test[df_test['sentence1'] == ' '].index)
print(df_test[df_test['sentence2'] == ' '].index)

Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')


In [16]:
# Defining some key variables that will be used later on in the training
import warnings
warnings.filterwarnings("ignore")

MAX_LEN = 128
LEARNING_RATE = 5e-5

# get the sets 
train_set = NLIData(df_train, tokenizer, MAX_LEN)
valid_set = NLIData(df_dev, tokenizer, MAX_LEN)
test_set = NLIData(df_test, tokenizer, MAX_LEN)

print("FULL Dataset: {}".format(train_set.shape))
print("TRAIN Dataset: {}".format(valid_set.shape))
print("TEST Dataset: {}".format(test_set.shape))

FULL Dataset: (81057, 5)
TRAIN Dataset: (4678, 5)
TEST Dataset: (4639, 5)


In [17]:
#Convert dataframe to dataset
from torch.utils.data import Dataset

class PandasDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        row = self.dataframe.iloc[index]
        ids, mask, label, tti= row["ids"], row["mask"], row["labels"], row["tti"]
        return {"ids": ids, 
                "mask":mask, 
                "label":label,
                "tti": tti
                }

train_dataset = PandasDataset(train_set)
valid_dataset = PandasDataset(valid_set)
test_dataset = PandasDataset(test_set)

In [18]:
TRAIN_BATCH_SIZE = 24
VALID_BATCH_SIZE = 24

train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

val_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

train_loader = DataLoader(train_dataset, **train_params)
valid_loader = DataLoader(valid_dataset, **val_params)
test_loader = DataLoader(test_dataset, **test_params)

In [19]:
class RobertaClass(torch.nn.Module):
    def __init__(self):
        super(RobertaClass, self).__init__()
        self.l1 = RobertaModel.from_pretrained("roberta-large")
        self.classifier = torch.nn.Linear(1024, 3)

    def forward(self, input_ids, attention_mask):
        embedded = self.l1(input_ids=input_ids, attention_mask=attention_mask)[1]
        output = self.classifier(embedded)
        return output

In [20]:
model = RobertaClass()
model.to(device)

Downloading:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RobertaClass(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNo

In [21]:
# Creating the loss function and optimizer
EPOCHS = 1

loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [22]:
# Function for a single training iteration
def train_epoch(model, training_loader, loss_fn, optimizer, device, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0
    step = 0
    for d in tqdm(training_loader):
        step = step+1
        input_ids = d["ids"].to(device)
        attention_mask = d["mask"].to(device)
        targets = d["label"].to(device)
        #tti = d["tti"].to(device)
        
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            #token_type_ids=tti
        )
        
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())
        
        # Backward prop
        loss.backward()
        
        # Gradient Descent
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        #scheduler.step()
        optimizer.zero_grad()
    
    
    return correct_predictions.double() / n_examples, np.mean(losses)

In [23]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()
    
    losses = []
    correct_predictions = 0
    
    with torch.no_grad():
        for d in tqdm(data_loader):
            input_ids = d["ids"].to(device)
            attention_mask = d["mask"].to(device)
            targets = d["label"].to(device)
            #tti = d["tti"].to(device)
            
            # Get model ouptuts
            outputs = model(
                input_ids,
                attention_mask,
                #tti
            )
            
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)
            
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())
            
    return correct_predictions.double() / n_examples, np.mean(losses)

In [24]:
%%time
import warnings
warnings.filterwarnings("ignore")

from collections import defaultdict

history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):
    
    # Show details 
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    print("-" * 10)
    
    train_acc, train_loss = train_epoch(
        model,
        train_loader,
        loss_fn,
        optimizer,
        device,
        len(train_set)
    )
    
    print(f"Train loss {train_loss} accuracy {train_acc}")
    
    # Get model performance (accuracy and loss)
    val_acc, val_loss = eval_model(
        model,
        valid_loader,
        loss_fn,
        device,
        len(valid_set)
    )
    
    print(f"Val   loss {val_loss} accuracy {val_acc}")
    print()
    
    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    
    # If we beat prev performance
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'RoBERTa_NLI_Model')
        best_accuracy = val_acc

Epoch 1/1
----------


100%|██████████| 3378/3378 [23:23<00:00,  2.41it/s]


Train loss 1.1064736497797438 accuracy 0.33271648346225496


100%|██████████| 195/195 [00:26<00:00,  7.40it/s]


Val   loss 1.1032189460901114 accuracy 0.3428815733219324

CPU times: user 17min 31s, sys: 6min 25s, total: 23min 56s
Wall time: 23min 51s


In [25]:
# download checkpoint file
from google.colab import files
files.download('RoBERTa_NLI_Model')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [26]:
#Times
# BERT
# RoBERTa
# GPT2
# GPTNeo

In [27]:
# model.load_state_dict(torch.load('RoBERTa_NLI_Model'))
# model = model.to(device)

In [28]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [29]:
PATH = F"/content/gdrive/My Drive/Models/BERT_NLI_Model"
torch.save(model.state_dict(), PATH)

In [30]:
# Get model performance (accuracy and loss)
test_acc, test_loss = eval_model(
  model,
  test_loader,
  loss_fn,
  device,
  len(test_set)
)
test_acc.item()

100%|██████████| 194/194 [00:26<00:00,  7.39it/s]


0.3459797370122871

In [31]:
#0.346

In [32]:
def predict_inference(premise, hypothesis, model, device):
  
    premise = trim_sentence(premise.translate(str.maketrans('', '', string.punctuation))) + " . " 
    hypothesis = trim_sentence(hypothesis.translate(str.maketrans('', '', string.punctuation)))

    tokenized_input_seq_pair = tokenizer(
        premise, 
        hypothesis,
        pad_to_max_length = True,
        max_length=MAX_LEN,
        return_token_type_ids=True,
        truncation=True,
        return_tensors='pt'
    )
    
    ids = tokenized_input_seq_pair['input_ids']
    mask = tokenized_input_seq_pair['attention_mask']
    #tti = tokenized_input_seq_pair["token_type_ids"]
    text = premise + ". " + hypothesis
  
    LABEL = ['contradiction', 'entailment','neutral']
    model.eval()

    with torch.no_grad():
        sequence = ids.to(device)
        attn_mask = mask.to(device)
        #tti = tti.to(device)
        prediction = model(sequence, attn_mask)
        prediction = prediction.argmax(dim=-1).item()
    return LABEL[prediction]

In [33]:
premise = 'Children smiling and waving at camera'
hypothesis = 'There are children present'
predict_inference(premise, hypothesis, model, device)

'entailment'

In [34]:
premise = 'I am using mobile phone.'
hypothesis = 'I have mobile in my hand.'

predict_inference(premise, hypothesis, model, device)

'entailment'