# Set up

## Mounting Drive to Access Data Files

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

#drive_folder = "gdrive/My Drive/CS6741 Replication Project/" # Katherine
drive_folder = "gdrive/My Drive/CS6741 - Topics in Natural Language Processing and Machine Learning/CS6741 Replication Project/" # Linda

Mounted at /content/gdrive


## Installing Packages and Setting Up GPU

In [2]:
import torch
import torch.nn as nn
import torchtext
from torchtext import datasets
from torchtext.legacy import data
import re
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import spacy 
import math
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
!pip install wandb -qqq
import wandb

[K     |████████████████████████████████| 2.1MB 7.9MB/s 
[K     |████████████████████████████████| 163kB 47.3MB/s 
[K     |████████████████████████████████| 133kB 33.9MB/s 
[K     |████████████████████████████████| 102kB 13.1MB/s 
[K     |████████████████████████████████| 71kB 10.2MB/s 
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone
  Building wheel for subprocess32 (setup.py) ... [?25l[?25hdone


In [4]:
def get_default_device():
    """Pick GPU if available, else CPU"""
    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        return torch.device('cpu')
 
def to_device(data, device):
    """Move tensor(s) to chosen device"""
    if isinstance(data, (list,tuple)):
        return [to_device(x, device) for x in data]
    return data.to(device, non_blocking=True)   

device=get_default_device()
device

device(type='cuda')

## Loading Data

In [5]:
#Glove files
glove = torchtext.vocab.GloVe(name='6B',dim=300)

print(glove.vectors.shape)

.vector_cache/glove.6B.zip: 862MB [02:49, 5.08MB/s]                           
100%|█████████▉| 399884/400000 [00:37<00:00, 10490.68it/s]

torch.Size([400000, 300])


In [6]:
#Sentiment files 
# 2 Class from https://github.com/clairett/pytorch-sentiment-classification
colnames=['review', 'sentiment'] 
train=pd.read_csv(drive_folder+"data/SST2/train.tsv", sep = '\t', names=colnames, header=None)
dev=pd.read_csv(drive_folder+"data/SST2/dev.tsv", sep = '\t', names=colnames, header=None)
test=pd.read_csv(drive_folder+"data/SST2/test.tsv", sep = '\t', names=colnames, header=None)

dev.head()

Unnamed: 0,review,sentiment
0,one long string of cliches,0
1,if you 've ever entertained the notion of doin...,0
2,k 19 exploits our substantial collective fear ...,0
3,it 's played in the most straight faced fashio...,0
4,"there is a fabric of complex ideas here , and ...",1


# Preprocessing

In [7]:
#Expanding contractions (pulled from other code)
contraction_dict = {
    "ain't": "is not", "aren't": "are not", "can't": "cannot",
    "'cause": "because", "could've": "could have", "couldn't": "could not",
    "didn't": "did not", "doesn't": "does not", "don't": "do not",
    "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
    "he'd": "he would", "he'll": "he will", "he's": "he is",
    "how'd": "how did", "how'd'y": "how do you", "how'll": "how will",
    "how's": "how is", "I'd": "I would", "I'd've": "I would have",
    "I'll": "I will", "I'll've": "I will have", "I'm": "I am",
    "I've": "I have", "i'd": "i would", "i'd've": "i would have",
    "i'll": "i will",  "i'll've": "i will have", "i'm": "i am",
    "i've": "i have", "isn't": "is not", "it'd": "it would",
    "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have",
    "it's": "it is", "let's": "let us", "ma'am": "madam",
    "mayn't": "may not", "might've": "might have", "mightn't": "might not",
    "mightn't've": "might not have", "must've": "must have", "mustn't": "must not",
    "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
    "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have",
    "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
    "she'd": "she would", "she'd've": "she would have", "she'll": "she will",
    "she'll've": "she will have", "she's": "she is", "should've": "should have",
    "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
    "so's": "so as", "this's": "this is", "that'd": "that would",
    "that'd've": "that would have", "that's": "that is", "there'd": "there would",
    "there'd've": "there would have", "there's": "there is", "here's": "here is",
    "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
    "they'll've": "they will have", "they're": "they are", "they've": "they have",
    "to've": "to have", "wasn't": "was not", "we'd": "we would",
    "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have",
    "we're": "we are", "we've": "we have", "weren't": "were not",
    "what'll": "what will", "what'll've": "what will have", "what're": "what are",
    "what's": "what is", "what've": "what have", "when's": "when is",
    "when've": "when have", "where'd": "where did", "where's": "where is",
    "where've": "where have", "who'll": "who will", "who'll've": "who will have",
    "who's": "who is", "who've": "who have", "why's": "why is",
    "why've": "why have", "will've": "will have", "won't": "will not",
    "won't've": "will not have", "would've": "would have", "wouldn't": "would not",
    "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
    "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have",
    "you'd": "you would", "you'd've": "you would have", "you'll": "you will",
    "you'll've": "you will have", "you're": "you are", "you've": "you have"
}

def clean_contractions(text, contraction_dict):
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
    text = ' '.join([contraction_dict[t] if t in contraction_dict else t for t in text.split(" ")])
    return text

punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
punct_dict = {
    "‘": "'",    "₹": "e",      "´": "'", "°": "",         "€": "e",
    "™": "tm",   "√": " sqrt ", "×": "x", "²": "2",        "—": "-",
    "–": "-",    "’": "'",      "_": "-", "`": "'",        '“': '"',
    '”': '"',    '“': '"',      "£": "e", '∞': 'infinity', 'θ': 'theta',
    '÷': '/',    'α': 'alpha',  '•': '.', 'à': 'a',        '−': '-',
    'β': 'beta', '∅': '',       '³': '3', 'π': 'pi'
}
def clean_special_chars(text, punct, punct_dict):
    for p in punct_dict:
        text = text.replace(p, punct_dict[p])
    
    for p in punct:
        text = text.replace(p, f' {p} ')
    
    specials = {'\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': ''}  # Other special characters that have to be dealt with in last
    for s in specials:
        text = text.replace(s, specials[s])
    
    return text


stopwords = nltk.corpus.stopwords.words('english')

def preprocess_text(text, contraction_dict, punct, punct_dict):
    clean_text=text.lower()
    clean_text=clean_contractions(clean_text, contraction_dict)
    clean_text=clean_special_chars(clean_text, punct, punct_dict)
    clean_text=re.split('\W+', clean_text)
    clean_text=[token for token in clean_text if token not in stopwords]  
    return " ".join(clean_text)

preprocess_text("samhdbei. 2345324@@# !~~~ sdne @ dsecwAADEk. SDKM",contraction_dict, punct, punct_dict)

nlp = spacy.load('en', disable=['parser','tagger','ner'])

def tokenizer(s):
    return [w.text.lower() for w in nlp(preprocess_text(s,contraction_dict, punct, punct_dict))]

In [8]:
#Connecting GloVe and SST together --> tensor batch size x review length 

TEXT = torchtext.legacy.data.Field(tokenize = tokenizer)
LABEL = torchtext.legacy.data.LabelField(dtype = torch.float)

#Reading again using tabular dataset
datafields=[('review', TEXT),('sentiment', LABEL)]
trn,val,tst=torchtext.legacy.data.TabularDataset.splits(path=drive_folder+"data/SST2/", train='train.tsv', validation='dev.tsv', test='test.tsv',format='tsv',skip_header=False, fields=datafields)

#Including ony top 30000 words from vocab, building vocab for train data 
# Extracting these words from glove embeddings i.e. unique ids representing words should come from glove
TEXT.build_vocab(trn,max_size=30000,vectors='glove.6B.300d', unk_init=torch.Tensor.normal_)
LABEL.build_vocab(trn)

#Fix mappings (because they are currently backwards, see next cell)
LABEL.vocab.stoi

#Loop through trn and get a minibatch to work with 
train_iterator,test_iterator, val_iterator=torchtext.legacy.data.BucketIterator.splits((trn,tst,val),batch_sizes=(10,10,10),sort_key =lambda x: len(x.review), sort_within_batch=False, device=device)
print(len(train_iterator))# train batches
print(len(val_iterator))# val batches

692
88


In [9]:
LABEL.vocab.stoi

defaultdict(None, {'0': 1, '1': 0})

In [10]:
#Checking out our batches
def show_batch(dl):
    for reviews, sentiments in dl:
        print(reviews.T.shape)
        print(sentiments.shape)
        print(sentiments)
        break
        
show_batch(train_iterator)

torch.Size([10, 18])
torch.Size([10])
tensor([0., 0., 0., 0., 0., 1., 1., 0., 1., 0.], device='cuda:0')


# Building Model

In [11]:
class BiLSTM(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_dim, 
                 output_dim, n_layers, bidirectional, batch_first, dropout):
        
        super(BiLSTM, self).__init__()
        
        self.dropout = nn.Dropout(p=dropout)

        self.embedding = nn.Embedding(vocab_size, embedding_dim) # Convert sparse 1 hot encoded vectors to embeddings (glove embedding will be used here)
        
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim,
                            num_layers=n_layers, batch_first=batch_first,
                            dropout=dropout, bidirectional=bidirectional)
        
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

        
    def forward(self, text):

      embedded = self.embedding(text)

      embedded = self.dropout(embedded)

      _, (hidden, _) = self.lstm(embedded)

      #Final hidden from going left/right and concatenating
      hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)  # shape of hidden before concatenation: 4x10x100

      output = self.fc(hidden) 

      #Option 1
      # Uses cross entropy loss, which will apply softmax during training, as our criterion 
      return output

      #Option 2
      # Allows us to calculate cross entropy loss by hand (i.e. perform softmax here and then calculate mean loss during training)
      #return torch.log_softmax(output, dim=-1) #this allows us to calculate cross-entropy by hand by doing the softmax here and then calculating the mean loss during training



# Setting Parameters

In [126]:
#Setting configurations and instantiating model
vocab_size = len(TEXT.vocab)
embedding_dim = 300
hidden_dim = 100
output_dim =  2
n_layers = 2
bidirectional = True
dropout = 0.5
batch_first=True

model = BiLSTM(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, batch_first, dropout)
model.cuda()

BiLSTM(
  (dropout): Dropout(p=0.5, inplace=False)
  (embedding): Embedding(13688, 300)
  (lstm): LSTM(300, 100, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=200, out_features=2, bias=True)
)

In [131]:
#Optimizer 

#Option 1: SGD
#optimizer = torch.optim.SGD(model.parameters(), lr=0.0002) # Based on class code

#Option 2: Adam 
#lr=np.linspace(0.0001, 0.01, 10) # Grid search (not implemented yet)
#optimizer = torch.optim.Adam(model.parameters(), lr=0.0002, weight_decay=10e-6) # Based on Bastings et al. 2020
#optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=10e-6) # Our choice for best train loss (change hidden_dim to 100)
optimizer = torch.optim.Adam(model.parameters(), lr=0.00001, weight_decay=10e-6) # Our choice for best val loss

In [132]:
#Criterion
criterion = nn.CrossEntropyLoss()

In [133]:
#Attaching embeddings 
pretrained_embeddings = TEXT.vocab.vectors
print(pretrained_embeddings.shape)

model.embedding.weight.data.copy_(pretrained_embeddings)

unk_idx = TEXT.vocab.stoi[TEXT.unk_token]
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]

model.embedding.weight.data[unk_idx] = torch.zeros(embedding_dim)
model.embedding.weight.data[pad_idx] = torch.zeros(embedding_dim)

#print(model.embedding.weight.data)

torch.Size([13688, 300])


# Training Model

In [67]:
#Train
def train(model, iterator, criterion, optimizer):
    
  epoch_loss = 0

  model.train()

  for batch in iterator:
        
      optimizer.zero_grad()
        
      #Option 1: (see BiLSTM class for description)
      predictions = model(batch.review.T).squeeze(1)
      loss = criterion(predictions,batch.sentiment.long())

      #Option 2: (see BiLSTM class for description) 
      #predictions = model(batch.review.T).squeeze(1)
      #loss = -torch.index_select(predictions, 1, batch.sentiment.long()).mean() 
        
      loss.backward()
 
      optimizer.step()
        
      epoch_loss += loss.item()

  return epoch_loss

In [68]:
#Validation

def evaluate(model, iterator, criterion):
    
    #Initialize every epoch
    epoch_loss = 0

    #Deactivate dropout layers
    model.eval()
    
    #Deactivate autograd
    with torch.no_grad():
    
        for batch in iterator:

            #Do the predictions  
            predictions = model(batch.review.T).squeeze(1)
            
            #Compute loss and accuracy
            loss = criterion(predictions,batch.sentiment.long())
            
            #Keep track of loss and accuracy
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [134]:
#Comparing train/val in 10 epochs

N_EPOCHS = 10
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    #Train the model
    train_loss = train(model, train_iterator, criterion, optimizer)
    
    #Evaluate the model
    valid_loss = evaluate(model, val_iterator, criterion)
    
    #Save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model.pth')
    
    print(f'\tTrain Loss: {train_loss:.3f}%')
    print(f'\t Val. Loss: {valid_loss:.3f}%')

	Train Loss: 594.079%
	 Val. Loss: 0.710%
	Train Loss: 496.866%
	 Val. Loss: 0.635%
	Train Loss: 438.601%
	 Val. Loss: 0.590%
	Train Loss: 410.894%
	 Val. Loss: 0.560%
	Train Loss: 381.882%
	 Val. Loss: 0.541%
	Train Loss: 375.604%
	 Val. Loss: 0.529%
	Train Loss: 374.197%
	 Val. Loss: 0.519%
	Train Loss: 358.005%
	 Val. Loss: 0.513%
	Train Loss: 355.716%
	 Val. Loss: 0.508%
	Train Loss: 344.606%
	 Val. Loss: 0.504%


# Testing Model

In [135]:
#Test
with torch.no_grad():
  acc=0
  acc_score=0
  iterator_len=0
  for batch in test_iterator:
    pred = model(batch.review.T)
    y_hat = torch.sigmoid(pred) # Turn into probabilities
    y_hat = torch.argmax(y_hat,dim=1)
    acc=torch.where(y_hat==batch.sentiment, 1, 0).sum()
    acc_score = acc_score + acc

acc_score=100*acc_score/(10*len(test_iterator))

In [136]:
print(acc_score)

tensor(76.6667, device='cuda:0')


# Weights and Biases: Visualization of Loss

This is a duplication of the code that trains our model. It takes the section of code that runs through our training/validation loops and visualizes the loss over the number of epochs we specify.  

In [137]:
run = wandb.init(project="Replication",
            
           config={
               "epoch": 10,
           })

run.watch(model)

best_valid_loss = float('inf')

for epoch in range(run.config.epoch):
     
    #Train the model
    train_loss = train(model, train_iterator, criterion, optimizer)
    
    #Evaluate the model
    valid_loss = evaluate(model, val_iterator, criterion)
    
    #Save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model.pth')
    
    run.log(dict(loss=train_loss, epoch=epoch))

run.finish()
run

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


KeyboardInterrupt: ignored