# Task definition
Implement LSTM Sentiment Tagger for imdb reviews dataset.

1. (5pt) Fill missing code below
    * 1pt implement vectorization
    * 2pt implement \_\_init\_\_ and forward methods of models
    * 2pt implement collate function
2. (4pt) Implement training loop, choose proper loss function, use clear ml for max points.
    * 2pts is a baseline for well written, working code
    * 2pts if clear ml used properly
3. (3pt) Train the models (find proper hyperparams). Make sure you are not overfitting or underfitting. Visualize training of your best model (plot training, and test loss/accuracy in time). Your model should reach at least 87% accuracy. For max points it should exceed 89%. 
    * 1pt for accuracy above 89%
    * 1pt for accuracy above 87%
    * 1pt for visualizations

Remarks:
* Use embeddings of size 50
* Use 0.5 threshold when computing accuracy.
* Use supplied dataset for training and evaluation.
* You do not have to use validation set.
* You should monitor overfitting during training.
* For max points use clear ml to store and manage logs from your experiments. 
* We encourage to use pytorch lightning library (Addtional point for using it - however the sum must not exceed 12)

[Clear ML documentation](https://clear.ml/docs/latest/docs/)

[Clear ML notebook exercise from bootcamp](https://colab.research.google.com/drive/1wtLb4gg8beLS7smcyJlOZppn6_rQvSxL?usp=sharing)

In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
import plotly
import plotly.express as px
#!pip install plotly --upgrade
plotly.__version__


'5.5.0'

In [5]:
from nltk.corpus import stopwords 
from collections import Counter
import string
import re

In [6]:
!pip install clearml

import os
from collections import defaultdict

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torchtext
from clearml import Task

import torch
from torch import nn
from torch import optim

from torch.utils.data import Dataset, DataLoader



In [7]:

web_server = 'https://app.community.clear.ml'
api_server = 'https://api.community.clear.ml'
files_server = 'https://files.community.clear.ml'
access_key = 'VL2DAD4BUIHHYAO57VXG'#@param {type:"string"}
secret_key = 'mYF9o68l0ixTF1elESQF0e5iAUnIcGTQ65S4II28onaMBZjRmk'#@param {type:"string"}

Task.set_credentials(web_host=web_server,
                     api_host=api_server,
                     files_host=files_server,
                     key=access_key,
                     secret=secret_key)

In [8]:
!pip install gdown
!gdown https://drive.google.com/uc?id=1hK-3iiRPlbePb99Fe-34LJNZ5yB-nduq
!tar -xvzf imdb_dataset.gz
data = pd.read_csv("imdb_dataset.csv")

Downloading...
From: https://drive.google.com/uc?id=1hK-3iiRPlbePb99Fe-34LJNZ5yB-nduq
To: /content/imdb_dataset.gz
100% 77.0M/77.0M [00:00<00:00, 292MB/s]
imdb_dataset.csv


In [9]:
def token_cleaner(word):
    # Remove all non-word characters
    word = re.sub(r"[^\w\s]", '', word)
    # remove digits
    word = re.sub(r"\d", '', word)
    word = re.sub(r"\d", '', word)
    return word
unimportant_words = set(stopwords.words('english')) 

In [10]:
PADDING_VALUE = 0

class NaiveVectorizer:
    def __init__(self, tokenized_data, **kwargs):
        """Converts data from string to vector of ints that represent words. 
        Prepare lookup dict (self.wv) that maps token to int. Reserve index 0 for padding.
        """
        tokenized_data = [seq.split() for seq in tokenized_data]
        list_of_words = []
        for sequence in tokenized_data:
          for word in sequence:
            word = token_cleaner(word)
            if word not in unimportant_words and word != '':
              list_of_words.append(word)
        imdb_voc_counted = Counter(list_of_words)
        frequent_words = sorted(imdb_voc_counted,key=imdb_voc_counted.get,reverse=True)[:5000]
        imdb_dic = {w:i+1 for i,w in enumerate(frequent_words)}
        self.imdb_dic = imdb_dic
        ### Your code goes here ###


        ##################################

    def vectorize(self, tokenized_seq):
        """Converts sequence of tokens into sequence of indices.
        If the token does not appear in the vocabulary(self.wv) it is ommited
        Returns torch tensor of shape (seq_len,) and type long."""
        ### Your code goes here ###
        vectorized_sequence = []
        for word in tokenized_seq:
          if word in self.imdb_dic.keys():
            vectorized_sequence.append(self.imdb_dic[word])
        return torch.tensor(vectorized_sequence)




        ##################################

class ImdbDataset(Dataset):
    SPLIT_TYPES = ["train", "test", "unsup"]

    def __init__(self, data, preprocess_fn, split="train"):
        super(ImdbDataset, self).__init__()
        if split not in self.SPLIT_TYPES:
            raise AttributeError(f"No such split type: {split}")

        self.split = split
        self.label = [i for i, c in enumerate(data.columns) if c == "sentiment"][0]
        self.data_col = [i for i, c in enumerate(data.columns) if c == "tokenized"][0]
        self.data = data[data["split"] == self.split]
        self.preprocess_fn = preprocess_fn

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        seq = self.preprocess_fn(self.data.iloc[idx, self.data_col].split())
        label = self.data.iloc[idx, self.label]
        return (seq, label)

naive_vectorizer = NaiveVectorizer(data.loc[data["split"] == "train", "tokenized"])

def get_datasets():
    train_dataset = ImdbDataset(data, naive_vectorizer.vectorize)
    test_dataset = ImdbDataset(data, naive_vectorizer.vectorize, split="test")
        
    return train_dataset, test_dataset

def custom_collate_fn(pairs):
    """This function is supposed to be used by dataloader to prepare batches
    Input: list of tuples (sequence, label)
    Output: sequences_padded_to_the_same_lenths, original_lenghts_of_sequences, lables.
    torch.nn.utils.rnn.pad_sequence might be usefull here
    """
    ### Your code goes here ###
    seqcs = []
    lengths = []
    labels = []
    for i in pairs:
      if len(i[0]> 0) :
        seqcs.append(i[0])
        lengths.append(len(i[0]))
        labels.append(i[1])
  
    seqcs = torch.nn.utils.rnn.pad_sequence(seqcs)
    


    #################################
    return torch.transpose(seqcs,0 , 1), torch.tensor(lengths), torch.tensor(labels)

In [11]:
data_generator = get_datasets()

In [12]:
train_data = data_generator[0]
test_data = data_generator[1]

In [13]:
collated_data_train = custom_collate_fn(train_data)
collated_data_test = custom_collate_fn(test_data)


In [14]:
data_train = []
for i in range(len(collated_data_train[0])):
  L = list(collated_data_train[0][i])
  #L.reverse()
  L = torch.tensor(L)
  data_train.append(tuple([L, collated_data_train[2][i], collated_data_train[1][i]]))


In [15]:
data_test = []
for i in range(len(collated_data_test[0])):
  L = list(collated_data_test[0][i])
  #L.reverse()
  L = torch.tensor(L)
  data_test.append(tuple([L,  collated_data_test[2][i], collated_data_test[1][i]]))


In [16]:
batch_size = 64
train_loader = DataLoader(data_train, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(data_test, shuffle=True, batch_size=batch_size)

In [17]:
dataiter = iter(test_loader)
sample_x, sample_y, sample_length = dataiter.next()

print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print('Sample input: \n', sample_y)
print('Sample input: \n', sample_length)

Sample input size:  torch.Size([64, 912])
Sample input: 
 tensor([[1706,   61,  474,  ...,    0,    0,    0],
        [2574,  443,  225,  ...,    0,    0,    0],
        [ 960,   48,   83,  ...,    0,    0,    0],
        ...,
        [1284, 1358,   18,  ...,    0,    0,    0],
        [2120,   40,  516,  ...,    0,    0,    0],
        [  61, 1314, 1236,  ...,    0,    0,    0]])
Sample input: 
 tensor([1., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0., 0., 1., 0.,
        1., 1., 1., 1., 1., 0., 1., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 0.,
        0., 0., 1., 1., 0., 1., 1., 1., 1., 0., 0., 1., 1., 0., 1., 0., 1., 1.,
        0., 0., 0., 0., 0., 1., 0., 1., 1., 1.], dtype=torch.float64)
Sample input: 
 tensor([ 90,  87,  53,  39,  62,  31,  61,  75, 218,  20,  98, 148, 270,  53,
         50, 138,  66, 224,  62,  41, 361,  94,  17,  69,  43,  55,  72,  99,
         56, 140, 173,  57, 179, 152,  71, 101,  87, 128, 203,  56, 126, 135,
        199,  25,  35, 134,  67,  58,  5

In [18]:
"""Implement LSTMSentimentTagger. 
The model should use a LSTM module.
Use torch.nn.utils.rnn.pack_padded_sequence to optimize processing of sequences.
When computing vocab_size of embedding layer remeber that padding_symbol counts to the vocab.
Use sigmoid activation function.
"""
class LSTMSentimentTagger(nn.Module):
  def __init__(self, hidden_dim):
    super(LSTMSentimentTagger, self).__init__()
    self.hidden_dim = hidden_dim
    self.embedding = nn.Embedding(5001, 50)
    self.lstm = nn.LSTM(input_size=50, hidden_size = self.hidden_dim, num_layers = 3, 
                            batch_first=True, dropout = 0.5)
    self.fc = nn.Linear(self.hidden_dim, 1)
    self.sig = nn.Sigmoid()
    self.dropout = nn.Dropout(0.5)

        ### Your code goes here ###



        #################################

  def forward(self, sentence, lengths, original_lengths):
      batch_size = sentence.size(0)
      #print(sentence.shape)
      #print(sentence)
      #print(sentence.shape)
      #original_lengths, perm_idx = original_lengths.sort(0, descending=True)
      #sentence = sentence[perm_idx]
      embeds = self.embedding(sentence)
      #print(sentence)
      #print(embeds)
      #print(original_lengths)
      #print(embeds.shape)
      embeds = torch.nn.utils.rnn.pack_padded_sequence(embeds, original_lengths.cpu().numpy(), batch_first=True, enforce_sorted= False)
      #print(embeds.data)
      #print(embeds.data.shape)
      output, lengths = self.lstm(embeds)
      #print(output.data)
      #print(output.data.shape)
      output, input_size = torch.nn.utils.rnn.pad_packed_sequence(output, batch_first=True)
      #print(input_size)
      #print(output.data)
      #print(output.data.shape)
      output = torch.stack([output[i, original_lengths[i]-1, :] for i in range(len(sentence))])
      #print(out.shape)
      out = self.dropout(output) 
      out = self.fc(output)
      #print(out)
      sig_out = self.sig(out)
      #print(sig_out)
      #print(sig_out)
      scores = sig_out
      #print(sig_out)
      #print(scores)

        ### Your code goes here ###



        #################################
      return scores, lengths

  def init_hidden(self):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        h0 = torch.zeros((3, 50, self.hidden_dim)).to(device)
        c0 = torch.zeros((3, 50, self.hidden_dim)).to(device)
        hidden = (h0,c0)
        return hidden

# Trainig loop and visualizations


In [19]:
is_cuda = torch.cuda.is_available()

# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU is available


In [20]:
hidden_dim = 128
model = LSTMSentimentTagger(hidden_dim)
model.to(device)
lr=0.005
criterion = nn.BCELoss()

optimizer = torch.optim.Adam(model.parameters(), lr=lr)
def acc(pred,label):
    pred = torch.round(pred.squeeze())
    return torch.sum(pred == label.squeeze()).item()

In [21]:

config = {
    'num_layers': 3,
    'lr': 0.005,
    'hidden_dim': 128,
    'dropout_lstm': 0.5,
    'dropout': 0.5,

}
task = Task.create(project_name='Homework3', task_name='auto_log_experiment')

task.mark_started()
logger = task.get_logger()
task.connect(config)


epochs = 10
epoch_tr_loss = []
epoch_tr_acc = []

epoch_tst_loss = []
epoch_tst_acc = []

for epoch in range(epochs):
    train_losses = []
    train_acc = 0.0
    model.train()
    lengths = model.init_hidden()
    for inputs, labels,  original_lenghts in train_loader:
        #print(inputs.shape)
        inputs, labels,  original_lengths = inputs.to(device), labels.to(device), original_lenghts.to(device)  

        lengths = tuple([each.data for each in lengths])
        model.zero_grad()
        output, lengths = model(inputs, lengths, original_lenghts)
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        train_losses.append(loss.item())
        print(loss.item())
        accuracy = acc(output,labels)
        train_acc += accuracy
        optimizer.step()
    test_lengths = model.init_hidden()
    test_losses = []
    test_acc = 0.0
    model.eval()
    for inputs, labels, test_original_lenghts in test_loader:
            test_lengths = tuple([each.data for each in test_lengths])

            inputs, labels, test_original_lenghts = inputs.to(device), labels.to(device), test_original_lenghts.to(device)
            output, test_lengths = model(inputs, test_lengths, test_original_lenghts)
            test_loss = criterion(output.squeeze(), labels.float())

            test_losses.append(test_loss.item())
            
            accuracy = acc(output,labels)
            test_acc += accuracy
    epoch_train_loss = np.mean(train_losses)
    epoch_train_acc = train_acc/len(train_loader.dataset)
    epoch_tr_loss.append(epoch_train_loss)
    epoch_tr_acc.append(epoch_train_acc)

    epoch_test_loss = np.mean(test_losses)
    epoch_test_acc = test_acc/len(test_loader.dataset)
    epoch_tst_loss.append(epoch_test_loss)
    epoch_tst_acc.append(epoch_test_acc)

    logger.report_scalar(title='Train Loss', series='Train', iteration=epoch, value=epoch_train_loss)
    logger.report_scalar(title='Train Accuracy', series='Train', iteration=epoch, value=epoch_train_acc*100)
    logger.report_scalar(title='Test Accuracy', series='Test', iteration=epoch, value=epoch_test_acc*100)
    print(f'Epoch {epoch+1}') 
    print(f'train_loss : {epoch_train_loss} test_loss : {epoch_test_loss}')
    print(f'train_accuracy : {epoch_train_acc*100} test_accuracy : {epoch_test_acc*100}')

task.mark_completed()
task.close()

0.6948374509811401
0.6826475858688354
0.7632557153701782
0.6907843351364136
0.6889647245407104
0.6905677318572998
0.6939468383789062
0.6905626654624939
0.6937167644500732
0.6922879219055176
0.6987571716308594
0.6960166096687317
0.6943176984786987
0.6942903995513916
0.696989893913269
0.6904969215393066
0.6821118593215942
0.6913180351257324
0.6935979723930359
0.6967881917953491
0.6742227077484131
0.6967160701751709
0.6900055408477783
0.7140141725540161
0.7024862766265869
0.6973346471786499
0.6868475079536438
0.6955761909484863
0.6891602277755737
0.6973952054977417
0.6879193186759949
0.6949561238288879
0.6849861145019531
0.6981842517852783
0.6880589723587036
0.688610315322876
0.6855719089508057
0.6893392205238342
0.689073920249939
0.6651556491851807
0.6664561629295349
0.606090784072876
0.6464054584503174
0.6779998540878296
0.7010142803192139
0.6973937749862671
0.688307523727417
0.6863394975662231
0.6822629570960999
0.6799675822257996
0.6876497268676758
0.6713131666183472
0.66993248462677


In [23]:
import plotly
import plotly.express as px

df = pd.DataFrame(epoch_tst_loss, columns = ['Test loss'])
df['epoch'] = [1,2,3,4,5,6,7,8,9,10]
df['Train loss'] = epoch_tr_loss
fig = px.line(df, x="epoch", y=["Test loss", 'Train loss'], title='Train and test loss comparison')
fig.show()

In [24]:
df = pd.DataFrame(epoch_tst_acc, columns = ['Test accuracy'])
df['epoch'] = [1,2,3,4,5,6,7,8,9,10]
df['Train accuracy'] = epoch_tr_acc
fig = px.line(df, x="epoch", y=["Test accuracy", "Train accuracy"], title='Train and test accuracy comparison')
fig.show()