# Read Me: 
The following was a project for CS6741. The goal was to build a bidirectional LSTM in PyTorch from scratch and achieve state-of-the-art accuracy for sentiment classification on SST-2. This is a work in progress; adjustments are still needed to achieve higher accuracy.

1. Import packages and mount drive
2. Bring in GloVe embeddings and sentiment data
3. Create model
4. Set hyperparameters
5. Train and test model
6. Graph epochs

Accuracy is well below the ideal at only 82%. We are considering adjusting the model to add an additional dropout layer, and performing a more rigorous hyperparameter search.

# 1. Import packages and mount drive


In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM. Every data path below is built from
# `drive_folder`, so it is derived from the absolute mount point instead of
# relying on the notebook's current working directory being /content.
MOUNT_POINT = '/content/gdrive'
drive.mount(MOUNT_POINT)

drive_folder = MOUNT_POINT + "/My Drive/CS6741 Replication Project/"
# Alternative location of the same project folder:
#drive_folder = MOUNT_POINT + "/My Drive/CS6741 - Topics in Natural Language Processing and Machine Learning/CS6741 Replication Project/"

Mounted at /content/gdrive


In [None]:
import math
import re

import altair as alt
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
import torch
import torch.nn as nn
import torchtext
from matplotlib import pyplot as plt
from torchtext import datasets
from torchtext.legacy import data

In [None]:
def get_default_device():
    """Return the CUDA device when one is available, otherwise the CPU device."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)
 
def to_device(data, device):
    """Move a tensor, or a (nested) list/tuple of tensors, onto `device`.

    Lists and tuples are processed recursively and always come back as lists;
    anything else is moved with its own `.to(device, non_blocking=True)`.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)

    moved = []
    for item in data:
        moved.append(to_device(item, device))
    return moved

# Pick the compute device once; the batch iterators built further down are
# created with `device=device`.
device=get_default_device()
# Bare expression so the notebook echoes the selected device as the cell output.
device

device(type='cuda')

# 2. Bring in GloVe embeddings and sentiment data

In [None]:
# Load the pretrained GloVe vectors ('6B' release, 300 dimensions per word)
GLOVE_NAME = '6B'
GLOVE_DIM = 300
glove = torchtext.vocab.GloVe(name=GLOVE_NAME, dim=GLOVE_DIM)
print(glove.vectors.shape)

.vector_cache/glove.6B.zip: 862MB [02:41, 5.33MB/s]                           
100%|█████████▉| 399169/400000 [00:38<00:00, 10743.72it/s]

torch.Size([400000, 300])


In [None]:
# SST-2 sentiment files (2 classes), taken from
# https://github.com/clairett/pytorch-sentiment-classification
# Each TSV has no header row: column 0 is the review text, column 1 the label.
colnames = ['review', 'sentiment']
sst2_dir = drive_folder + "data/SST2/"
train, dev, test = (
    pd.read_csv(sst2_dir + split_file, sep='\t', names=colnames, header=None)
    for split_file in ("train.tsv", "dev.tsv", "test.tsv")
)

dev.head()

Unnamed: 0,review,sentiment
0,one long string of cliches,0
1,if you 've ever entertained the notion of doin...,0
2,k 19 exploits our substantial collective fear ...,0
3,it 's played in the most straight faced fashio...,0
4,"there is a fabric of complex ideas here , and ...",1


In [None]:
# Text-preprocessing tables and helpers adapted from an external source (the
# reference was left blank in the original note -- TODO: add the link). Update
# them with our own rules as needed.
#
# contraction_dict maps an English contraction to its expanded form.
# clean_contractions looks tokens up by exact match after splitting on single
# spaces. NOTE: preprocess_text lowercases the text before expanding, so the
# capitalised "I'..." keys are never matched on that path.
contraction_dict = {
    "ain't": "is not", "aren't": "are not", "can't": "cannot",
    "'cause": "because", "could've": "could have", "couldn't": "could not",
    "didn't": "did not", "doesn't": "does not", "don't": "do not",
    "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
    "he'd": "he would", "he'll": "he will", "he's": "he is",
    "how'd": "how did", "how'd'y": "how do you", "how'll": "how will",
    "how's": "how is", "I'd": "I would", "I'd've": "I would have",
    "I'll": "I will", "I'll've": "I will have", "I'm": "I am",
    "I've": "I have", "i'd": "i would", "i'd've": "i would have",
    "i'll": "i will",  "i'll've": "i will have", "i'm": "i am",
    "i've": "i have", "isn't": "is not", "it'd": "it would",
    "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have",
    "it's": "it is", "let's": "let us", "ma'am": "madam",
    "mayn't": "may not", "might've": "might have", "mightn't": "might not",
    "mightn't've": "might not have", "must've": "must have", "mustn't": "must not",
    "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
    "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have",
    "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
    "she'd": "she would", "she'd've": "she would have", "she'll": "she will",
    "she'll've": "she will have", "she's": "she is", "should've": "should have",
    "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
    "so's": "so as", "this's": "this is", "that'd": "that would",
    "that'd've": "that would have", "that's": "that is", "there'd": "there would",
    "there'd've": "there would have", "there's": "there is", "here's": "here is",
    "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
    "they'll've": "they will have", "they're": "they are", "they've": "they have",
    "to've": "to have", "wasn't": "was not", "we'd": "we would",
    "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have",
    "we're": "we are", "we've": "we have", "weren't": "were not",
    "what'll": "what will", "what'll've": "what will have", "what're": "what are",
    "what's": "what is", "what've": "what have", "when's": "when is",
    "when've": "when have", "where'd": "where did", "where's": "where is",
    "where've": "where have", "who'll": "who will", "who'll've": "who will have",
    "who's": "who is", "who've": "who have", "why's": "why is",
    "why've": "why have", "will've": "will have", "won't": "will not",
    "won't've": "will not have", "would've": "would have", "wouldn't": "would not",
    "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
    "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have",
    "you'd": "you would", "you'd've": "you would have", "you'll": "you will",
    "you'll've": "you will have", "you're": "you are", "you've": "you have"
}

def clean_contractions(text, contraction_dict):
    """Normalise apostrophe look-alikes, then expand known contractions.

    The text is split on single spaces and each piece is looked up verbatim in
    `contraction_dict`; pieces without an entry are kept unchanged.
    """
    # Curly quotes, acute accents and backticks all become a plain apostrophe
    # so that they can match the dictionary keys.
    for quote_variant in ("’", "‘", "´", "`"):
        text = text.replace(quote_variant, "'")

    expanded = []
    for word in text.split(" "):
        expanded.append(contraction_dict.get(word, word))
    return ' '.join(expanded)

# Characters that clean_special_chars pads with a space on each side.
# The backslash in front of "×" is now written as an explicit "\\": the
# original "\×" was an invalid escape sequence that Python only tolerated with
# a warning. The resulting string value is unchanged.
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'

# Replacements applied before the padding step: typographic or unusual symbols
# are mapped to an ASCII stand-in (or removed when mapped to "").
# The original literal listed the '“' key twice with the same value; it now
# appears once.
punct_dict = {
    "‘": "'",    "₹": "e",      "´": "'", "°": "",         "€": "e",
    "™": "tm",   "√": " sqrt ", "×": "x", "²": "2",        "—": "-",
    "–": "-",    "’": "'",      "_": "-", "`": "'",        '“': '"',
    '”': '"',    "£": "e",      '∞': 'infinity',           'θ': 'theta',
    '÷': '/',    'α': 'alpha',  '•': '.', 'à': 'a',        '−': '-',
    'β': 'beta', '∅': '',       '³': '3', 'π': 'pi'
}
def clean_special_chars(text, punct, punct_dict):
    """Replace unusual symbols and pad punctuation with spaces.

    Steps, in order:
      1. every key of `punct_dict` is replaced by its mapped value;
      2. every character of `punct` is surrounded by single spaces;
      3. a few leftover special sequences are replaced or removed.
    """
    for symbol, replacement in punct_dict.items():
        text = text.replace(symbol, replacement)

    for mark in punct:
        text = text.replace(mark, f' {mark} ')

    # Other special characters that have to be dealt with last
    leftovers = (
        ('\u200b', ' '),
        ('…', ' ... '),
        ('\ufeff', ''),
        ('करना', ''),
        ('है', ''),
    )
    for target, substitute in leftovers:
        text = text.replace(target, substitute)

    return text

def preprocess_text(text, contraction_dict, punct, punct_dict):
    """Lowercase and clean `text`, returning its word tokens joined by spaces.

    The text is lowercased, contractions are expanded (clean_contractions),
    special characters are normalised (clean_special_chars), and the result is
    split on runs of non-word characters.

    Args:
        text: raw input string.
        contraction_dict: contraction -> expansion mapping.
        punct: characters to pad with spaces.
        punct_dict: symbol -> replacement mapping.

    Returns:
        A single string of tokens separated by single spaces, with no leading
        or trailing space.
    """
    clean_text = text.lower()
    clean_text = clean_contractions(clean_text, contraction_dict)
    clean_text = clean_special_chars(clean_text, punct, punct_dict)
    # Raw string: '\W' in a normal string literal is an invalid escape sequence.
    # re.split yields '' at either end when the text starts or ends with a
    # non-word character; dropping those avoids stray leading/trailing spaces.
    tokens = [token for token in re.split(r'\W+', clean_text) if token]
    #tokens=[token for token in tokens if token not in stopwords]
    return " ".join(tokens)

# Quick manual check of the preprocessing pipeline on a noisy string.
preprocess_text("samhdbei. 2345324@@# !~~~ sdne @ dsecwAADEk. SDKM",contraction_dict, punct, punct_dict)

# Load the English spaCy pipeline by its full package name: the 'en' shortcut
# is no longer accepted by spaCy 3, while the package name also works on
# spaCy 2 when the model is installed. The parser, tagger and NER components
# are disabled because only the tokens are used.
nlp = spacy.load('en_core_web_sm', disable=['parser','tagger','ner'])

def tokenizer(s):
    """Clean `s` with preprocess_text, then return the lowercased spaCy tokens as a list of strings."""
    cleaned = preprocess_text(s, contraction_dict, punct, punct_dict)
    tokens = []
    for word in nlp(cleaned):
        tokens.append(word.text.lower())
    return tokens

In [None]:
#Glove + SST --> tensor batch size x review length

# Field definitions: reviews go through our tokenizer, labels are float-valued.
TEXT = torchtext.legacy.data.Field(tokenize=tokenizer)
LABEL = torchtext.legacy.data.LabelField(dtype=torch.float)

# Read the TSV files again, this time as torchtext tabular datasets.
# The files have no header row; columns are (review, sentiment).
datafields = [('review', TEXT), ('sentiment', LABEL)]
trn, val, tst = torchtext.legacy.data.TabularDataset.splits(
    path=drive_folder + "data/SST2/",
    train='train.tsv',
    validation='dev.tsv',
    test='test.tsv',
    format='tsv',
    skip_header=False,
    fields=datafields,
)

# Build the vocabulary from the training split only, keeping the top 30000
# words, and attach the matching 300-d GloVe vectors so that word ids line up
# with GloVe rows. Words without a GloVe vector are initialised with normal_.
# (Change to a higher-dimensionality vector later.)
TEXT.build_vocab(
    trn,
    max_size=30000,
    vectors='glove.6B.300d',
    unk_init=torch.Tensor.normal_,
)
LABEL.build_vocab(trn)

#Fix Mappings (they are currently backwards, see next cell)
LABEL.vocab.stoi

# Batch iterators over the three splits (10 examples per batch). Note the
# order: the datasets are passed as (train, test, val) and unpacked the same way.
train_iterator, test_iterator, val_iterator = torchtext.legacy.data.BucketIterator.splits(
    (trn, tst, val),
    batch_sizes=(10, 10, 10),
    sort_key=lambda example: len(example.review),
    sort_within_batch=False,
    device=device,
)
print(len(train_iterator))  # train batches
print(len(val_iterator))  # val batches

692
88


In [None]:
#Checking batch size
def show_batch(dl):
    """Print the transposed review shape, the label shape and the labels of the first batch in `dl`.

    Nothing is printed when `dl` yields no batches.
    """
    batches = iter(dl)
    try:
        reviews, sentiments = next(batches)
    except StopIteration:
        return
    print(reviews.T.shape)
    print(sentiments.shape)
    print(sentiments)
        
# Sanity check: print the shapes and labels of the first training batch.
show_batch(train_iterator)

torch.Size([10, 20])
torch.Size([10])
tensor([0., 1., 1., 0., 1., 1., 0., 0., 0., 0.], device='cuda:0')


# 3. Create model

In [None]:
class BiLSTM(nn.Module):
    """LSTM sentence classifier: embedding -> dropout -> (bi)LSTM -> linear.

    The final hidden state of the last LSTM layer (forward and backward states
    concatenated when bidirectional) is passed through a linear layer. The
    output is raw, un-normalised class scores, suitable for
    nn.CrossEntropyLoss.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each word embedding (LSTM input size).
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of output scores per example.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM also runs right-to-left.
        batch_first: forwarded to nn.LSTM; the callers in this notebook pass
            True and feed (batch, seq) token-id tensors.
        dropout: dropout probability on the embeddings and between LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim,
                 output_dim, n_layers, bidirectional, batch_first, dropout):

        super(BiLSTM, self).__init__()

        # Remembered so forward() knows how many final states to combine.
        self.bidirectional = bidirectional

        self.dropout = nn.Dropout(p=dropout)

        # Token ids -> dense vectors (pretrained GloVe weights are copied in later).
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Inter-layer dropout only exists with more than one layer; passing a
        # non-zero value for a single layer merely triggers a PyTorch warning.
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim,
                            num_layers=n_layers, batch_first=batch_first,
                            dropout=dropout if n_layers > 1 else 0.0,
                            bidirectional=bidirectional)

        # The classifier input is one hidden vector per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)  # this is the W

    def forward(self, text):
        """Return the class scores for a batch of token-id sequences."""
        embedded = self.dropout(self.embedding(text))

        # hidden: (n_layers * num_directions, batch, hidden_dim)
        _, (hidden, _) = self.lstm(embedded)

        if self.bidirectional:
            # Last layer's forward (index -2) and backward (index -1) final states.
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # Unidirectional: only the last layer's final state exists.
            hidden = hidden[-1, :, :]

        # Raw scores; apply log_softmax outside if log-probabilities are needed.
        return self.fc(hidden)

# 4. Set hyperparameters & attach embeddings

In [None]:
#Setting configurations and instantiating model
vocab_size = len(TEXT.vocab)
embedding_dim = 300  # must match the 300-d GloVe vectors attached to TEXT.vocab
hidden_dim = 100 # Ask Sasha
output_dim =  2  # one score per sentiment class
n_layers = 2
bidirectional = True
dropout = 0.5
batch_first=True

model = BiLSTM(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, batch_first, dropout)
# Use the device selected earlier instead of hard-coding CUDA, so the notebook
# also runs on a CPU-only runtime and the model sits on the same device as the
# batches produced by the iterators.
model.to(device)

#Optimizer

#Option 1: SGD
#optimizer = torch.optim.SGD(model.parameters(), lr=0.0002)

#Option 2: Adam
# Candidate learning rates for a grid search (not used yet) | bastings used 0.0002
lr=np.linspace(0.0001, 0.01, 10)
# Our choice for best loss. NOTE: 10e-6 equals 1e-5.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=10e-6)

#Criterion
# CrossEntropyLoss expects raw scores and integer class targets.
criterion = nn.CrossEntropyLoss()

In [None]:
#Attaching embeddings
# Copy the vocabulary's pretrained vectors into the model's embedding table.
pretrained_embeddings = TEXT.vocab.vectors
print(pretrained_embeddings.shape)

embedding_weights = model.embedding.weight.data
embedding_weights.copy_(pretrained_embeddings)

# The unknown-word and padding rows are reset to all zeros.
unk_idx = TEXT.vocab.stoi[TEXT.unk_token]
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]

for special_idx in (unk_idx, pad_idx):
    embedding_weights[special_idx] = torch.zeros(embedding_dim)

#print(model.embedding.weight.data)

torch.Size([13821, 300])


# 5. Train and Test Model

In [None]:
#Train
def train(model, iterator, criterion, optimizer):
    """Run one training epoch and return the mean loss per batch.

    The mean (rather than the sum over batches) is returned so the value is on
    the same scale as the one returned by `evaluate`, which divides by the
    number of batches.

    Args:
        model: network called as `model(batch.review.T)`.
        iterator: iterable of batches exposing `.review` and `.sentiment`.
        criterion: loss taking (predictions, integer targets).
        optimizer: optimizer over the model's parameters.

    Returns:
        Average batch loss as a float; 0.0 if the iterator yields no batches.
    """
    model.train()  # enable dropout

    epoch_loss = 0.0
    n_batches = 0

    for batch in iterator:
        optimizer.zero_grad()

        # batch.review is transposed before being fed to the model.
        predictions = model(batch.review.T).squeeze(1)
        # Labels are stored as floats; the criterion is given integer targets.
        loss = criterion(predictions, batch.sentiment.long())

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        n_batches += 1

    return epoch_loss / n_batches if n_batches else 0.0

 

In [None]:
#Validation

def evaluate(model, iterator, criterion):
    """Return the mean per-batch loss of `model` over `iterator`, without updating weights.

    The model is switched to eval mode (dropout off) and gradients are not
    tracked. The summed batch loss is divided by `len(iterator)`.
    """
    model.eval()

    running_loss = 0
    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.review.T).squeeze(1)
            targets = batch.sentiment.long()
            running_loss += criterion(logits, targets).item()

    return running_loss / len(iterator)

In [None]:
#Comparing train/val in 10 epochs

N_EPOCHS = 10
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    #train the model
    train_loss = train(model, train_iterator, criterion, optimizer)

    #evaluate the model
    valid_loss = evaluate(model, val_iterator, criterion)

    #save the best model (lowest validation loss so far)
    # NOTE(review): this checkpoint is never loaded back in the cells shown
    # here, so later evaluation uses the last-epoch weights -- confirm intent.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model.pth')

    # Losses are plain loss values, not percentages, so no '%' suffix.
    print(f'Epoch {epoch + 1:02d}/{N_EPOCHS}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f}')

	Train Loss: 374.843%
	 Val. Loss: 0.416%
	Train Loss: 229.723%
	 Val. Loss: 0.418%
	Train Loss: 121.617%
	 Val. Loss: 0.497%
	Train Loss: 63.668%
	 Val. Loss: 0.642%
	Train Loss: 40.980%
	 Val. Loss: 0.716%
	Train Loss: 31.661%
	 Val. Loss: 0.879%
	Train Loss: 29.106%
	 Val. Loss: 1.018%
	Train Loss: 25.492%
	 Val. Loss: 0.758%
	Train Loss: 16.742%
	 Val. Loss: 1.035%
	Train Loss: 18.389%
	 Val. Loss: 1.208%


In [None]:
#Test
# Accuracy (in percent) of the current model weights on the test split.
model.eval()  # make sure dropout is off, whatever ran before this cell
with torch.no_grad():
  acc_score=0
  n_examples=0
  for batch in test_iterator:
    pred = model(batch.review.T)
    # The predicted class is the index of the larger score. No sigmoid is
    # needed: it does not change the ordering and can only create ties when
    # it saturates.
    y_hat = torch.argmax(pred,dim=1)
    acc = (y_hat==batch.sentiment.long()).sum()
    acc_score = acc_score + acc
    # Count the examples actually seen: a batch can hold fewer than 10 items
    # when the split size is not a multiple of the batch size.
    n_examples += batch.sentiment.shape[0]

acc_score=100*acc_score/n_examples

In [None]:
# Report the test accuracy (in percent) computed in the previous cell.
print(acc_score)
# 78% with reverse labels?

tensor(82.0765, device='cuda:0')


# 6. Graph epochs

In [None]:
#Graphing epochs
data=[]
epoch=[]
loss=[]
epoch=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
loss=[385, 233, 125, 63, 38, 28, 29, 24, 21, 17]
data=zip(epoch, loss)
df=pd.DataFrame(data, columns=("epoch", "loss"))
df.to_latex()

In [None]:
# Line chart of the loss recorded for each epoch.
loss_line = alt.Chart(df).mark_line()
chart = loss_line.encode(x="epoch", y="loss").properties(title="Loss Across Epochs")
chart