In [1]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from transformers import BertModel
from transformers import BertTokenizer

# Dataset

In [5]:
# Load the unpaired Humicroedit training CSV into a DataFrame and preview its first rows.
data = pd.read_csv('data/created_datasets/humicroedit_unpaired_train.csv')
data.head()

Unnamed: 0,text,meanGrade,score_std,basic_score,scaled_mean
0,Trump cites fake boob to make the case that su...,1.166667,0.687184,0.0,0.388889
1,Louisiana school district : All students must ...,1.0,0.632456,0.0,0.333333
2,Green receives standing ovation at ‘ The Colo...,1.6,1.2,1.0,0.533333
3,Judge Orders State Department To Provide Withh...,1.0,0.632456,0.0,0.333333
4,ATT Loses Another 1.36 Million Pay TV Subscrib...,1.4,0.8,0.0,0.466667


In [6]:
data.shape

(21990, 5)

In [7]:
# Pull out the text of the first row so a sample sentence can be inspected.
sent = data.loc[0].text
sent

'Trump cites fake boob to make the case that support for impeachment is falling'

# Load the pretrained model and try it on a sentence

In [8]:
# Load the pretrained 'bert-base-uncased' encoder and its matching tokenizer.
# These two notebook-level objects are used by run_pretrained_for_sentence below.
bert_model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




In [9]:
def run_pretrained_for_sentence(sent, len_sent = 25):
    """Run a single sentence through the pretrained BERT model.

    Uses the notebook-level ``tokenizer`` and ``bert_model`` objects.

    Args:
        sent: Raw sentence to encode.
        len_sent: Length (including [CLS] and [SEP]) that the token sequence
            is padded to. Sequences that are already longer are left as they
            are; no truncation is performed.

    Returns:
        Tuple ``(hidden_reps, cls_head)`` holding the first and second
        outputs of ``bert_model`` for a batch of one sentence.
    """
    tokens = ['[CLS]'] + tokenizer.tokenize(sent) + ['[SEP]']
    # A negative shortfall yields an empty range, so over-long inputs get no padding
    padded_tokens = tokens + ['[PAD]' for _ in range(len_sent - len(tokens))]
    attn_mask = [1 if token != '[PAD]' else 0 for token in padded_tokens]

    # Get the BERT vocabulary index for each token (done once; the result is the model input)
    token_ids = tokenizer.convert_tokens_to_ids(padded_tokens)
    # Convert to tensors and add a leading batch dimension of size 1
    token_ids = torch.tensor(token_ids).unsqueeze(0)
    attn_mask = torch.tensor(attn_mask).unsqueeze(0)
    # Index the outputs positionally instead of tuple-unpacking: this works both
    # when the model returns a plain tuple and when it returns a ModelOutput
    # (unpacking a ModelOutput would iterate over its keys, not its tensors).
    outputs = bert_model(token_ids, attention_mask = attn_mask)
    hidden_reps, cls_head = outputs[0], outputs[1]

    return hidden_reps, cls_head

In [10]:
%timeit run_pretrained_for_sentence("Test this function")

74.5 ms ± 5.3 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


# Training the model

Note: this section follows the tutorial at https://medium.com/swlh/painless-fine-tuning-of-bert-in-pytorch-b91c14912caa

In [11]:
data.head()

Unnamed: 0,text,meanGrade,score_std,basic_score,scaled_mean
0,Trump cites fake boob to make the case that su...,1.166667,0.687184,0.0,0.388889
1,Louisiana school district : All students must ...,1.0,0.632456,0.0,0.333333
2,Green receives standing ovation at ‘ The Colo...,1.6,1.2,1.0,0.533333
3,Judge Orders State Department To Provide Withh...,1.0,0.632456,0.0,0.333333
4,ATT Loses Another 1.36 Million Pay TV Subscrib...,1.4,0.8,0.0,0.466667


In [12]:
class HumorDataset(Dataset):
    """Dataset of (token ids, attention mask, score) triples read from a CSV file.

    The CSV must provide a ``text`` column (the sentence) and a
    ``scaled_mean`` column (the regression target). Every sentence is
    tokenized with the 'bert-base-uncased' tokenizer, wrapped in
    [CLS] ... [SEP], and padded or cut to exactly ``maxlen`` tokens.
    """

    def __init__(self, filename, maxlen):
        """Read ``filename`` with pandas and prepare the BERT tokenizer."""
        self.df = pd.read_csv(filename)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.maxlen = maxlen

    def __len__(self):
        """Number of rows in the underlying data frame."""
        return self.df.shape[0]

    def __getitem__(self, index):
        """Return ``(token_ids, attention_mask, label)`` for the row labelled ``index``."""
        # Look up the sentence and its target score
        row_text = self.df.loc[index, 'text']
        target = self.df.loc[index, 'scaled_mean']

        # Word pieces framed by the special tokens BERT expects
        pieces = ['[CLS]', *self.tokenizer.tokenize(row_text), '[SEP]']

        shortfall = self.maxlen - len(pieces)
        if shortfall > 0:
            # Too short: fill up with padding tokens
            pieces.extend(['[PAD]'] * shortfall)
        else:
            # Long enough: keep the first maxlen-1 pieces and close with [SEP]
            pieces = pieces[:self.maxlen - 1]
            pieces.append('[SEP]')

        # Vocabulary ids as a tensor
        ids = torch.tensor(self.tokenizer.convert_tokens_to_ids(pieces))

        # 1 wherever the id is non-zero (id 0 is treated as padding), 0 elsewhere
        mask = ids.ne(0).long()

        return ids, mask, target

In [13]:
#Creating instances of training and validation set
# Create the training and validation datasets; HumorDataset pads or truncates every example to maxlen = 30 tokens
train_set = HumorDataset(filename = 'data/created_datasets/humicroedit_unpaired_train.csv', maxlen = 30)
val_set = HumorDataset(filename = 'data/created_datasets/humicroedit_unpaired_valid.csv', maxlen = 30)
# Create the training and validation dataloaders (batches of 64, all other DataLoader options left at their defaults)
# NOTE(review): train_loader is built without shuffle=True, so batches presumably follow CSV row order every epoch - confirm this is intended
train_loader = DataLoader(train_set, batch_size = 64)
val_loader = DataLoader(val_set, batch_size = 64)

In [14]:
print(train_set[0])

(tensor([  101,  8398, 17248,  8275, 22017,  2497,  2000,  2191,  1996,  2553,
         2008,  2490,  2005, 17727,  5243, 22729,  2003,  4634,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0]), 0.3888888888888889)


In [15]:
# Training function
class HumorRegressor(nn.Module):
    """BERT encoder followed by a small two-layer regression head.

    The representation at sequence position 0 (the [CLS] token) is passed
    through ``fc1`` (768 -> 5), a ReLU, and ``fc2`` (5 -> 1) to produce one
    score per input sequence.
    """

    def __init__(self, freeze_bert = True):
        """Build the model.

        Args:
            freeze_bert: When True, gradients are disabled for every BERT
                parameter so that only the regression head is trainable.
        """
        super(HumorRegressor, self).__init__()
        #Instantiating BERT model object
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')

        #Freeze bert layers
        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False

        #Regression layer
        self.fc1 = nn.Linear(768, 5)
        self.fc2 = nn.Linear(5, 1)

    def forward(self, seq, attn_masks):
        '''
        Inputs:
            -seq : Tensor of shape [B, T] containing token ids of sequences
            -attn_masks : Tensor of shape [B, T] containing attention masks to be used to avoid contribution of PAD tokens

        Returns:
            Tensor of shape [B, 1] with one predicted score per sequence
        '''

        #Feeding the input to BERT model to obtain contextualized representations.
        #The first output is taken by position rather than by tuple-unpacking so
        #that this works whether the model returns a plain tuple or a ModelOutput
        #(unpacking a ModelOutput iterates over its keys, not its tensors).
        bert_outputs = self.bert_layer(seq, attention_mask = attn_masks)
        cont_reps = bert_outputs[0]

        #Obtaining the representation of the [CLS] token (sequence position 0)
        out = cont_reps[:, 0]

        #Feeding the [CLS] representation to the regression head
        out = F.relu(self.fc1(out))
        out = self.fc2(out)

        return out

In [16]:
# Build the regressor with the BERT encoder frozen, so only the fc1/fc2 head has trainable parameters.
net = HumorRegressor(freeze_bert = True)

In [17]:
# Mean-squared error between the predicted and target scores.
criterion = nn.MSELoss()
# Hand Adam only the parameters that can actually receive gradients; frozen
# BERT weights (requires_grad == False) never get a gradient, so listing them
# in the optimizer serves no purpose.
opti = optim.Adam([p for p in net.parameters() if p.requires_grad], lr = 2e-5)

In [None]:
# Train for two epochs, running a full pass over the validation set after each one.
for ep in range(2):
    # Switch back to training mode (dropout on) after the previous epoch's validation pass
    net.train()
    loss_cum = 0.0
    for it, (seq, attn_masks, labels) in enumerate(train_loader):
        #Clear gradients
        opti.zero_grad()

        #Obtaining the predictions from the model
        preds = net(seq, attn_masks)

        #Computing loss
        loss = criterion(preds.squeeze(-1), labels.float())
        # .item() yields a plain float, so the running total does not keep
        # each batch's autograd graph alive until the next reset
        loss_cum += loss.item()

        #Backpropagating the gradients
        loss.backward()

        #Optimization step
        opti.step()

        if (it + 1) % 5 == 0:
            print("Iteration {} of epoch {} complete. Loss : {}".format(it+1, ep+1,
                                                                    loss_cum))
            loss_cum = 0.0

    # Evaluation mode disables dropout so the validation score is deterministic
    net.eval()
    sq_err_sum = 0.0
    n_val = 0
    with torch.no_grad():
        for it, (seq, attn_masks, labels) in enumerate(val_loader):
            preds = net(seq, attn_masks)
            loss = criterion(preds.squeeze(-1), labels.float())
            # criterion returns the batch mean; weight it by the batch size so
            # the final figure is a true mean over all validation examples
            sq_err_sum += loss.item() * labels.size(0)
            n_val += labels.size(0)
            if (it + 1) % 10 == 0: print('Iteration ', it)
    # max(..., 1) guards against an empty validation loader
    loss_val = sq_err_sum / max(n_val, 1)
    print('MSE on validaiton: ', loss_val)

In [None]:
# Stand-alone validation pass over val_loader.
# Evaluation mode disables dropout so the reported score is deterministic.
net.eval()
sq_err_sum = 0.0
n_val = 0
with torch.no_grad():
    for it, (seq, attn_masks, labels) in enumerate(val_loader):
        preds = net(seq, attn_masks)
        loss = criterion(preds.squeeze(-1), labels.float())
        # criterion returns the batch mean; weight it by the batch size so the
        # final figure is a true mean over all validation examples
        sq_err_sum += loss.item() * labels.size(0)
        n_val += labels.size(0)
        if (it + 1) % 10 == 0: print('Iteration ', it)
# max(..., 1) guards against an empty validation loader
loss_val = sq_err_sum / max(n_val, 1)
print('MSE on validaiton: ', loss_val)

# Look at examples from the validation dataset

In [214]:
# Load the task-1 validation split and preview its first rows.
valid_data = pd.read_csv('data/task-1/val_split.csv')
valid_data.head()

Unnamed: 0.1,Unnamed: 0,id,original,edit,grades,meanGrade
0,6000,3986,Trump Replacing Secretary of <State/> Tillerso...,Class,11000,0.4
1,6001,9504,When George W. Bush <stood/> with Hillary Clinton,knitted,11111,1.0
2,6002,13642,South Korea <hospital/> fire : dozens feared d...,camp,21000,0.6
3,6003,9371,<Trump/> predicts Patriots will win Super Bow...,gypsy,33200,1.6
4,6004,2947,Sessions announces new conditions for sanctuar...,launder,33210,1.8


In [215]:
def run_for_sentence(sentence, replacement, maxlen):
    """Encode an original/edited headline pair as BERT input tensors.

    Relies on the notebook-level ``tokenizer`` and on the helpers
    ``replace_word`` and ``drop_replacement_symbols``, which are defined
    outside this cell (presumably they substitute / strip the ``<word/>``
    marker in the headline - verify against their definitions).

    Args:
        sentence: Headline as stored in the ``original`` column.
        replacement: Edit word passed to ``replace_word``.
        maxlen: Exact number of tokens in the returned sequence.

    Returns:
        Tuple ``(token_ids, attention_mask)`` of 1-D tensors of length ``maxlen``.
    """
    edited = replace_word(sentence, replacement)
    plain = drop_replacement_symbols(sentence)

    # Layout: [CLS] original [SEP] edited [SEP]
    pieces = [
        '[CLS]',
        *tokenizer.tokenize(plain),
        '[SEP]',
        *tokenizer.tokenize(edited),
        '[SEP]',
    ]

    shortfall = maxlen - len(pieces)
    if shortfall > 0:
        # Too short: fill up with padding tokens
        pieces.extend(['[PAD]'] * shortfall)
    else:
        # Long enough: keep the first maxlen-1 pieces and close with [SEP]
        pieces = pieces[:maxlen - 1]
        pieces.append('[SEP]')

    # Vocabulary ids as a tensor
    ids = torch.tensor(tokenizer.convert_tokens_to_ids(pieces))

    # 1 wherever the id is non-zero (id 0 is treated as padding), 0 elsewhere
    mask = ids.ne(0).long()

    return ids, mask

In [216]:
# Compare model predictions with the annotated grade for the first ten validation rows.
# Evaluation mode disables dropout; without it repeated runs give different predictions.
net.eval()
for i in range(0,10):
    row = valid_data.loc[i]  # look the row up once instead of once per field
    (ids, attn) = run_for_sentence(row.original, row.edit, maxlen=50)
    with torch.no_grad():
        pred = net(ids.unsqueeze(0), attn.unsqueeze(0))
    print('Sentence: ', row.original)
    print('Alternative: ', replace_word(row.original, row.edit))
    # The training target scaled_mean appears to be meanGrade / 3 (see the
    # training CSV preview), so multiply by 3 to get back to the grade scale
    print('Prediction is ', pred[0][0]*3, ' True value: ', row.meanGrade)

Sentence:  Trump Replacing Secretary of <State/> Tillerson With CIA Director Mike Pompeo : NPR
Alternative:  Trump Replacing Secretary of Class Tillerson With CIA Director Mike Pompeo : NPR
Prediction is  tensor(0.8303)  True value:  0.4
Sentence:  When George W. Bush <stood/> with Hillary Clinton
Alternative:  When George W. Bush knitted with Hillary Clinton
Prediction is  tensor(1.1504)  True value:  1.0
Sentence:  South Korea <hospital/> fire : dozens feared dead and many injured
Alternative:  South Korea camp fire : dozens feared dead and many injured
Prediction is  tensor(0.7940)  True value:  0.6
Sentence:   <Trump/> predicts Patriots will win Super Bowl by 8 points
Alternative:   gypsy predicts Patriots will win Super Bowl by 8 points
Prediction is  tensor(0.8824)  True value:  1.6
Sentence:  Sessions announces new conditions for sanctuary cities to <get/> federal money
Alternative:  Sessions announces new conditions for sanctuary cities to launder federal money
Prediction is  t