In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

#drive_folder = "gdrive/My Drive/CS6741 Replication Project/" 
drive_folder = "gdrive/My Drive/CS6741 - Topics in Natural Language Processing and Machine Learning/CS6741 Replication Project/"

Mounted at /content/gdrive


In [2]:
import torch
import torch.nn as nn
import torchtext
from torchtext import datasets
from torchtext.legacy import data
import re
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import spacy 
import math

In [3]:
#@Glove files
glove = torchtext.vocab.GloVe(name='6B',dim=300)

print(glove.vectors.shape)

.vector_cache/glove.6B.zip: 862MB [02:39, 5.40MB/s]                          
100%|█████████▉| 399119/400000 [00:38<00:00, 10621.06it/s]

torch.Size([400000, 300])


In [4]:
#@Sentiment files (2 Class from https://github.com/clairett/pytorch-sentiment-classification)
colnames=['review', 'sentiment'] 
train=pd.read_csv(drive_folder+"data/SST2/train.tsv", sep = '\t', names=colnames, header=None)
dev=pd.read_csv(drive_folder+"data/SST2/dev.tsv", sep = '\t', names=colnames, header=None)
test=pd.read_csv(drive_folder+"data/SST2/test.tsv", sep = '\t', names=colnames, header=None)

dev.head()

Unnamed: 0,review,sentiment
0,one long string of cliches,0
1,if you 've ever entertained the notion of doin...,0
2,k 19 exploits our substantial collective fear ...,0
3,it 's played in the most straight faced fashio...,0
4,"there is a fabric of complex ideas here , and ...",1


In [6]:
#@Update preprocessing with our thoughts
contraction_dict = {
    "ain't": "is not", "aren't": "are not", "can't": "cannot",
    "'cause": "because", "could've": "could have", "couldn't": "could not",
    "didn't": "did not", "doesn't": "does not", "don't": "do not",
    "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
    "he'd": "he would", "he'll": "he will", "he's": "he is",
    "how'd": "how did", "how'd'y": "how do you", "how'll": "how will",
    "how's": "how is", "I'd": "I would", "I'd've": "I would have",
    "I'll": "I will", "I'll've": "I will have", "I'm": "I am",
    "I've": "I have", "i'd": "i would", "i'd've": "i would have",
    "i'll": "i will",  "i'll've": "i will have", "i'm": "i am",
    "i've": "i have", "isn't": "is not", "it'd": "it would",
    "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have",
    "it's": "it is", "let's": "let us", "ma'am": "madam",
    "mayn't": "may not", "might've": "might have", "mightn't": "might not",
    "mightn't've": "might not have", "must've": "must have", "mustn't": "must not",
    "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
    "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have",
    "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
    "she'd": "she would", "she'd've": "she would have", "she'll": "she will",
    "she'll've": "she will have", "she's": "she is", "should've": "should have",
    "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
    "so's": "so as", "this's": "this is", "that'd": "that would",
    "that'd've": "that would have", "that's": "that is", "there'd": "there would",
    "there'd've": "there would have", "there's": "there is", "here's": "here is",
    "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
    "they'll've": "they will have", "they're": "they are", "they've": "they have",
    "to've": "to have", "wasn't": "was not", "we'd": "we would",
    "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have",
    "we're": "we are", "we've": "we have", "weren't": "were not",
    "what'll": "what will", "what'll've": "what will have", "what're": "what are",
    "what's": "what is", "what've": "what have", "when's": "when is",
    "when've": "when have", "where'd": "where did", "where's": "where is",
    "where've": "where have", "who'll": "who will", "who'll've": "who will have",
    "who's": "who is", "who've": "who have", "why's": "why is",
    "why've": "why have", "will've": "will have", "won't": "will not",
    "won't've": "will not have", "would've": "would have", "wouldn't": "would not",
    "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
    "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have",
    "you'd": "you would", "you'd've": "you would have", "you'll": "you will",
    "you'll've": "you will have", "you're": "you are", "you've": "you have"
}

def clean_contractions(text, contraction_dict):
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
    text = ' '.join([contraction_dict[t] if t in contraction_dict else t for t in text.split(" ")])
    return text

punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
punct_dict = {
    "‘": "'",    "₹": "e",      "´": "'", "°": "",         "€": "e",
    "™": "tm",   "√": " sqrt ", "×": "x", "²": "2",        "—": "-",
    "–": "-",    "’": "'",      "_": "-", "`": "'",        '“': '"',
    '”': '"',    '“': '"',      "£": "e", '∞': 'infinity', 'θ': 'theta',
    '÷': '/',    'α': 'alpha',  '•': '.', 'à': 'a',        '−': '-',
    'β': 'beta', '∅': '',       '³': '3', 'π': 'pi'
}
def clean_special_chars(text, punct, punct_dict):
    for p in punct_dict:
        text = text.replace(p, punct_dict[p])
    
    for p in punct:
        text = text.replace(p, f' {p} ')
    
    specials = {'\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': ''}  # Other special characters that I have to deal with in last
    for s in specials:
        text = text.replace(s, specials[s])
    
    return text

#@Fix Stop words
#import nltk
#stopwords = nltk.corpus.stopwords.words('english')

def preprocess_text(text, contraction_dict, punct, punct_dict):
    clean_text=text.lower()
    clean_text=clean_contractions(clean_text, contraction_dict)
    clean_text=clean_special_chars(clean_text, punct, punct_dict)
    clean_text=re.split('\W+', clean_text)
    #clean_text=[token for token in clean_text if token not in stopwords]  
    return " ".join(clean_text)

preprocess_text("samhdbei. 2345324@@# !~~~ sdne @ dsecwAADEk. SDKM",contraction_dict, punct, punct_dict)

nlp = spacy.load('en', disable=['parser','tagger','ner'])

def tokenizer(s):
    return [w.text.lower() for w in nlp(preprocess_text(s,contraction_dict, punct, punct_dict))]

In [7]:
def get_default_device():
    """Pick GPU if available, else CPU"""
    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        return torch.device('cpu')
 
def to_device(data, device):
    """Move tensor(s) to chosen device"""
    if isinstance(data, (list,tuple)):
        return [to_device(x, device) for x in data]
    return data.to(device, non_blocking=True)   

device=get_default_device()
device

device(type='cuda')

In [98]:
#Glove + SST --> tensor batch size x review length

TEXT = torchtext.legacy.data.Field(tokenize = tokenizer)
LABEL = torchtext.legacy.data.LabelField(dtype = torch.float)

#reading again using tabular dataset
datafields=[('review', TEXT),('sentiment', LABEL)]
trn,val,tst=torchtext.legacy.data.TabularDataset.splits(path=drive_folder+"data/SST2/", train='train.tsv', validation='dev.tsv', test='test.tsv',format='tsv',skip_header=False, fields=datafields)

#including ony top 30000 words from vocab, building vocab for train data 
TEXT.build_vocab(trn,max_size=30000,vectors='glove.6B.300d', unk_init=torch.Tensor.normal_)
# extarcting these words from glove embeddings i.e. unique ids representing words should come from glove
# for unkonwn or oov words, initialize them using normal distribution
# change to higher dimensionaity vector later
LABEL.build_vocab(trn)

#@Fix Mappings (they are currently backwards, see next cell)
LABEL.vocab.stoi

#loop through trn and get a minibatch to work with - creates tensor X 
train_iterator,test_iterator, val_iterator=torchtext.legacy.data.BucketIterator.splits((trn,tst,val),batch_sizes=(10,10,10),sort_key =lambda x: len(x.review), sort_within_batch=False, device=device)
print(len(train_iterator))# train batches
print(len(val_iterator))# val batches

692
88


In [9]:
LABEL.vocab.stoi

defaultdict(None, {'0': 1, '1': 0})

In [10]:
def show_batch(dl):
    for reviews, sentiments in dl:
        print(reviews.T.shape)
        print(sentiments.shape)
        print(sentiments)
        break
        
show_batch(train_iterator)

torch.Size([10, 33])
torch.Size([10])
tensor([0., 0., 1., 0., 1., 1., 0., 0., 1., 0.], device='cuda:0')


In [11]:
class BiLSTM(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_dim, 
                 output_dim, n_layers, bidirectional, batch_first, dropout):
        
        super(BiLSTM, self).__init__()
        
        self.dropout = nn.Dropout(p=dropout)

        self.embedding = nn.Embedding(vocab_size, embedding_dim) # convert sparse 1 hot encoded vectors to embeddings (glove embedding will be used here)
        
        #lstm -> takes in 300 embedding gives 20 features (if we set hyperparamter hidden_dim = 20)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim,
                            num_layers=n_layers, batch_first=batch_first,
                            dropout=dropout, bidirectional=bidirectional)
        
        self.fc = nn.Linear(hidden_dim * 2, output_dim) # this is the W 

        
    def forward(self, text):

      embedded = self.embedding(text)

      embedded = self.dropout(embedded)

      _, (hidden, _) = self.lstm(embedded)

      #print('LSTM hidden shape', hidden.shape)
      #now hidden is this x this
      #maybe: 2xbatchxlength

      #final hidden from left/right and concatenating
      #print(hidden) 
      hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)  #hidden shape before concatenation: 4x10x100
      #print(hidden)
      #print('SHAPE',hidden.shape)

      output = self.fc(hidden) 
      #print('Linear output shape', output.shape)

      #return torch.log_softmax(output, dim=-1) #this is the final phi from class - HAS to be log_softmax
      return output

In [99]:
#@Setting configurations and instantiating model
vocab_size = len(TEXT.vocab)
embedding_dim = 300
hidden_dim = 100 # Ask Sasha
output_dim =  2
n_layers = 2
bidirectional = True
dropout = 0.5
batch_first=True

#model = BiLSTM(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, batch_first) # dropout - set to 0 by default
model = BiLSTM(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, batch_first, dropout)
model.cuda()

#optimizer = torch.optim.SGD(model.parameters(), lr=0.0002) # Based on class code
#optimizer = torch.optim.Adam(model.parameters(), lr=0.0002, weight_decay=10e-6) # Based on Bastings et al. 2020
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=10e-6) # Our choice
criterion = nn.CrossEntropyLoss()

In [100]:
#Attaching embeddings 
pretrained_embeddings = TEXT.vocab.vectors
print(pretrained_embeddings.shape)

model.embedding.weight.data.copy_(pretrained_embeddings)

unk_idx = TEXT.vocab.stoi[TEXT.unk_token]
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]

model.embedding.weight.data[unk_idx] = torch.zeros(embedding_dim)
model.embedding.weight.data[pad_idx] = torch.zeros(embedding_dim)

#print(model.embedding.weight.data)

torch.Size([13821, 300])


In [93]:
#@Training the model
def train(model, iterator, criterion, optimizer, num_epochs=10):
    
  epoch_loss = 0

  model.train()

  for epoch in range(num_epochs):

    epoch_loss = 0

    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.review.T).squeeze(1)
        loss = criterion(predictions,batch.sentiment.long())

        #ORIGINAL (based partly on class code)
        #predictions = model(batch.review.T).squeeze(1)
        #loss = -torch.index_select(predictions, 1, batch.sentiment.long()).sum()
        
        loss.backward()

        #for name, param in model.named_parameters():
        #  print(name, param.grad.abs().sum())
        
        optimizer.step()
        
        epoch_loss += loss.item()

    print(f'| Epoch: {epoch+1:02} | Train Loss: {epoch_loss:.3f}') 

  return epoch_loss
 

In [101]:
#Printing epochs
train_loss = train(model, train_iterator, criterion, optimizer)

#Saving final epoch 
torch.save(model.state_dict(), "model.pth")

| Epoch: 01 | Train Loss: 372.918
| Epoch: 02 | Train Loss: 230.684
| Epoch: 03 | Train Loss: 126.144
| Epoch: 04 | Train Loss: 69.075
| Epoch: 05 | Train Loss: 41.777
| Epoch: 06 | Train Loss: 28.985
| Epoch: 07 | Train Loss: 22.475
| Epoch: 08 | Train Loss: 21.981
| Epoch: 09 | Train Loss: 23.826
| Epoch: 10 | Train Loss: 16.299


In [102]:
#@Testing the model

#Step 1: Instantiating Model
model.load_state_dict(torch.load("model.pth"))

<All keys matched successfully>

In [103]:
# Last questions:
# - Originally, BiLSTM output log_softmax (and manually implementing loss). Our understanding is that log_softmax provides the continuity for backward(). 
# - However, it must have been incorrectly implemented in terms of getting correct class from probabilities.
# - We made it work by taking out log_softmax and using nn.CrossEntropyLoss.

# - We decided to use sigmoid for getting prediction probabilities (though theoretically, we should have been able to use softmax), for ease of dealing with types.

# - Why is it that, even though we were confident in our class model (in terms of layers), the way we structured our output from our class model caused problems down the pipeline?
#   -(a) Why did manual cross entropy loss cause huge, fluctuating training loss magnitudes?
#   -(b) Why did this lead to very poor accuracy scores (~5%)? Which didn't change even if we switched labels?

# - Please explain labelling to us (particularly, can you confirm if they are switched, and whether the model is learning the switched versions or not)?
# - Our accuracy scores as is use the so-called "switched" labels, but the acc rate is ~78%, which implies that model is learning the switched version...

# - Was the model properly loaded (and is there another way that is preferable)?


with torch.no_grad():
  acc=0
  acc_score=0
  iterator_len=0
  for batch in test_iterator:
    pred = model(batch.review.T)
    y_hat = torch.sigmoid(pred) #Turn into probabilities
    print('original',y_hat)
    y_hat = torch.argmax(y_hat,dim=1)
    print('argmaxed',y_hat)
    #print(batch.sentiment)
    #y_hat=torch.where(pred[:,0]>0.5, 1, 0) # Changing labels doesn't change accuracy?
    #y_hat=torch.where(pred[:,0]>math.log(0.5), 1, 0)
    acc=torch.where(y_hat==batch.sentiment, 1, 0).sum()
    acc_score = acc_score + acc

acc_score=100*acc_score/(10*len(test_iterator))

original tensor([[0.1243, 0.8847],
        [0.1886, 0.8154],
        [0.9698, 0.0327],
        [0.3649, 0.6387],
        [0.7603, 0.2369],
        [0.2479, 0.7601],
        [0.0906, 0.9154],
        [0.0421, 0.9606],
        [0.0188, 0.9831],
        [0.0203, 0.9817]], device='cuda:0')
argmaxed tensor([1, 1, 0, 1, 0, 1, 1, 1, 1, 1], device='cuda:0')
original tensor([[0.8679, 0.1205],
        [0.9294, 0.0624],
        [0.8810, 0.1043],
        [0.4739, 0.5241],
        [0.5924, 0.3778],
        [0.7969, 0.1821],
        [0.4417, 0.5378],
        [0.0083, 0.9923],
        [0.8962, 0.0922],
        [0.1753, 0.8339]], device='cuda:0')
argmaxed tensor([0, 0, 0, 1, 0, 0, 1, 1, 0, 1], device='cuda:0')
original tensor([[0.1349, 0.8639],
        [0.9919, 0.0087],
        [0.9459, 0.0502],
        [0.9671, 0.0318],
        [0.9896, 0.0112],
        [0.0293, 0.9725],
        [0.9980, 0.0024],
        [0.0407, 0.9605],
        [0.1030, 0.8996],
        [0.6900, 0.3067]], device='cuda:0')
argmaxed 

In [104]:
print(acc_score)
# 78% with reverse labels?

tensor(78.4153, device='cuda:0')
