In [30]:
# Importing the libraries needed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Config, GPT2Model, GPT2Tokenizer
import logging
logging.basicConfig(level=logging.ERROR)

In [31]:
# Setting up the device for GPU usage
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
import pandas as pd
#load dataset
train_dataset = pd.read_csv('../Dataset/NLI_dataset/snli_1.0_train.txt', sep='\t')
valid_dataset = pd.read_csv('../Dataset/NLI_dataset/snli_1.0_dev.txt', sep='\t')
test_dataset = pd.read_csv('../Dataset/NLI_dataset/snli_1.0_test.txt', sep='\t')

In [4]:
#Get neccesary columns
# label, premise, hypothesis
df_train = train_dataset[['gold_label','sentence1','sentence2']]
df_dev = valid_dataset[['gold_label','sentence1','sentence2']]
df_test = test_dataset[['gold_label','sentence1','sentence2']]

df_train = df_train[:300000]
print(df_train.shape)
print(df_dev.shape)
print(df_test.shape)

(300000, 3)
(10000, 3)
(10000, 3)


In [5]:
# filtering the rows where label is not valid
df_train = df_train[~df_train['gold_label'].str.contains('-')]
df_dev = df_dev[~df_dev['gold_label'].str.contains('-')]
df_test = df_test[~df_test['gold_label'].str.contains('-')]

print(df_train.shape)
print(df_dev.shape)
print(df_test.shape)

(299619, 3)
(9842, 3)
(9824, 3)


In [6]:
#Check longest string in Phrase
print(df_train.sentence1.str.len().max())
print(df_train.sentence2.str.len().max())

print(df_dev.sentence1.str.len().max())
print(df_dev.sentence2.str.len().max())

print(df_test.sentence1.str.len().max())
print(df_test.sentence2.str.len().max())

402
279.0
300
232
265
159


In [7]:
# remove all strings greater than 64
df_train = df_train[~df_train['sentence1'].str.len().ge(64)]
df_train = df_train[~df_train['sentence2'].str.len().ge(64)]

df_dev = df_dev[~df_dev['sentence1'].str.len().ge(64)]
df_dev = df_dev[~df_dev['sentence2'].str.len().ge(64)]

df_test = df_test[~df_test['sentence1'].str.len().ge(64)]
df_test = df_test[~df_test['sentence2'].str.len().ge(64)]

print(df_train.shape)
print(df_dev.shape)
print(df_test.shape)


(162333, 3)
(4678, 3)
(4639, 3)


In [8]:
# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()
  
# Encode labels in column for gold_label
df_train['gold_label']= label_encoder.fit_transform(df_train['gold_label'])
print(df_train['gold_label'].unique())
# Encode labels in column for gold_label
df_dev['gold_label']= label_encoder.fit_transform(df_dev['gold_label'])
print(df_dev['gold_label'].unique())
# Encode labels in column for gold_label
df_test['gold_label']= label_encoder.fit_transform(df_test['gold_label'])
print(df_test['gold_label'].unique())

[2 0 1]
[1 0 2]
[2 1 0]


In [9]:
# from transformers import GPT2Tokenizer, GPT2Model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

tokenizer.pad_token = tokenizer.eos_token

In [10]:
import string

def trim_sentence(sent):
    try:
        sent = sent.split()
        sent = sent[:64]
        return " ".join(sent)
    except:
        return sent

def NLIData( dataframe, tokenizer, max_len):
    df = pd.DataFrame()

    for index in dataframe.index:

        premise = str(dataframe['sentence1'][index])
        premise = trim_sentence(premise.translate(str.maketrans('', '', string.punctuation))) + " . "
        
        hypothesis = str(dataframe['sentence2'][index])
        hypothesis = trim_sentence(hypothesis.translate(str.maketrans('', '', string.punctuation)))
        label = dataframe['gold_label'][index]

        # tokenize input
        tokenized_input_seq_pair = tokenizer(
            premise,
            hypothesis,
            max_length=max_len,
            pad_to_max_length= True,
            return_token_type_ids=True,
            return_tensors='pt',
            truncation=True,
            )
    

        ids = tokenized_input_seq_pair['input_ids']
        mask = tokenized_input_seq_pair['attention_mask']
        token_type_ids = tokenized_input_seq_pair["token_type_ids"]
        text = premise + hypothesis
      
        df = df.append({
            'text': text, 
            'ids': ids.flatten(),
            'mask': mask.flatten(),
            'tti': token_type_ids.flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }, ignore_index=True)
    return df

In [11]:
#Check longest string in Phrase
print(df_train.sentence1.str.len().max())
print(df_train.sentence2.str.len().max())

print(df_dev.sentence1.str.len().max())
print(df_dev.sentence2.str.len().max())

print(df_test.sentence1.str.len().max())
print(df_test.sentence2.str.len().max())

63
63.0
63
63
63
63


In [12]:
print(df_train[df_train['sentence1'] == ' '].index)
print(df_train[df_train['sentence2'] == ' '].index)

print(df_dev[df_dev['sentence1'] == ' '].index)
print(df_dev[df_dev['sentence2'] == ' '].index)

print(df_test[df_test['sentence1'] == ' '].index)
print(df_test[df_test['sentence2'] == ' '].index)

Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')


In [13]:
# Defining some key variables that will be used later on in the training
import warnings
warnings.filterwarnings("ignore")

MAX_LEN = 128
LEARNING_RATE = 5e-5

# get the sets 
train_set = NLIData(df_train, tokenizer, MAX_LEN)
valid_set = NLIData(df_dev, tokenizer, MAX_LEN)
test_set = NLIData(df_test, tokenizer, MAX_LEN)

print("FULL Dataset: {}".format(train_set.shape))
print("TRAIN Dataset: {}".format(valid_set.shape))
print("TEST Dataset: {}".format(test_set.shape))

FULL Dataset: (162333, 5)
TRAIN Dataset: (4678, 5)
TEST Dataset: (4639, 5)


In [14]:
#Convert dataframe to dataset
from torch.utils.data import Dataset

class PandasDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        row = self.dataframe.iloc[index]
        ids, mask, label, tti= row["ids"], row["mask"], row["labels"], row["tti"]
        return {"ids": ids, 
                "mask":mask, 
                "label":label,
                "tti": tti
                }

train_dataset = PandasDataset(train_set)
valid_dataset = PandasDataset(valid_set)
test_dataset = PandasDataset(test_set)

In [15]:
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16

train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

val_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

train_loader = DataLoader(train_dataset, **train_params)
valid_loader = DataLoader(valid_dataset, **val_params)
test_loader = DataLoader(test_dataset, **test_params)

In [16]:
class GPT2Class(torch.nn.Module):
    def __init__(self):
        super(GPT2Class, self).__init__()
        self.l1 = GPT2Model.from_pretrained("gpt2")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 3)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [17]:
model = GPT2Class()
model.to(device)

GPT2Class(
  (l1): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inp

In [18]:
# Creating the loss function and optimizer
EPOCHS = 1

loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [19]:
# Function for a single training iteration
def train_epoch(model, training_loader, loss_fn, optimizer, device, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0
    step = 0
    for d in tqdm(training_loader):
        step = step+1
        input_ids = d["ids"].to(device)
        attention_mask = d["mask"].to(device)
        targets = d["label"].to(device)
        tti = d["tti"].to(device)
        
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=tti
        )
        
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())
        
        # Backward prop
        loss.backward()
        
        # Gradient Descent
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        #scheduler.step()
        optimizer.zero_grad()
    
    
    return correct_predictions.double() / n_examples, np.mean(losses)

In [20]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()
    
    losses = []
    correct_predictions = 0
    
    with torch.no_grad():
        for d in tqdm(data_loader):
            input_ids = d["ids"].to(device)
            attention_mask = d["mask"].to(device)
            targets = d["label"].to(device)
            tti = d["tti"].to(device)
            
            # Get model ouptuts
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=tti
            )
            
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)
            
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())
            
    return correct_predictions.double() / n_examples, np.mean(losses)

In [21]:
%%time
import warnings
warnings.filterwarnings("ignore")

from collections import defaultdict

history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):
    
    # Show details 
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    print("-" * 10)
    
    train_acc, train_loss = train_epoch(
        model,
        train_loader,
        loss_fn,
        optimizer,
        device,
        len(train_set)
    )
    
    print(f"Train loss {train_loss} accuracy {train_acc}")
    
    # Get model performance (accuracy and loss)
    val_acc, val_loss = eval_model(
        model,
        valid_loader,
        loss_fn,
        device,
        len(valid_set)
    )
    
    print(f"Val   loss {val_loss} accuracy {val_acc}")
    print()
    
    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    
    # If we beat prev performance
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'GPT2_NLI_Model')
        best_accuracy = val_acc

Epoch 1/1
----------


100%|██████████| 10146/10146 [21:50<00:00,  7.74it/s]


Train loss 1.1023213736949389 accuracy 0.336918556300937


100%|██████████| 293/293 [00:10<00:00, 27.13it/s]


Val   loss 1.0985935369042406 accuracy 0.3430953398888414

CPU times: user 12min 41s, sys: 9min 28s, total: 22min 9s
Wall time: 22min 2s


In [39]:
device

device(type='cuda')

In [40]:
model.load_state_dict(torch.load('GPT2_NLI_Model'))
model = model.to(device)

In [26]:
# Get model performance (accuracy and loss)
test_acc, test_loss = eval_model(
  model,
  test_loader,
  loss_fn,
  device,
  len(test_set)
)
test_acc.item()

100%|██████████| 290/290 [00:10<00:00, 28.06it/s]


0.34490191851692176

In [46]:
def predict_inference(premise, hypothesis, model, device):
  
    premise = trim_sentence(premise.translate(str.maketrans('', '', string.punctuation))) + " . " 
    hypothesis = trim_sentence(hypothesis.translate(str.maketrans('', '', string.punctuation)))

    tokenized_input_seq_pair = tokenizer(
        premise, 
        hypothesis,
        pad_to_max_length = True,
        max_length=MAX_LEN,
        return_token_type_ids=True,
        truncation=True,
        return_tensors='pt'
    )
    
    ids = tokenized_input_seq_pair['input_ids']
    mask = tokenized_input_seq_pair['attention_mask']
    tti = tokenized_input_seq_pair["token_type_ids"]
    text = premise + ". " + hypothesis
  
    LABEL = ['contradiction', 'entailment','neutral']
    model.eval()

    with torch.no_grad():
        sequence = ids.to(device)
        attn_mask = mask.to(device)
        tti = tti.to(device)
        prediction = model(sequence, attn_mask, tti)
        prediction = prediction.argmax(dim=-1).item()
    return LABEL[prediction]

In [47]:
premise = 'Children smiling and waving at camera'
hypothesis = 'There are children present'
predict_inference(premise, hypothesis, model, device)

'contradiction'

In [48]:
premise = 'I am using mobile phone.'
hypothesis = 'I have mobile in my hand.'

predict_inference(premise, hypothesis, model, device)

'entailment'