[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/khetansarvesh/NLP/blob/main/unitask_downstream_nlp/Sentence-Level-Classification/Joint_Training_Movie_Review_Classification.ipynb)

In [4]:
import numpy as np
import pandas as pd
import string
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS # or use from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from itertools import islice

from torch import cuda
# Run on the GPU when CUDA reports one as available, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

# **Dataset**
The dataset used in this example is [SST2](https://nlp.stanford.edu/sentiment/index.html). It contains sentences taken from movie reviews, each labeled as either positive (label `1`) or negative (label `0`).

In [None]:
#downloading the dataset
!wget https://github.com/khetansarvesh/NLP/blob/main/Sentence-Level-Classification/SST_Dataset.csv

In [2]:
# Load the CSV into a DataFrame and discard every row that contains a missing value.
df = pd.read_csv("SST_Dataset.csv").dropna()
df

Unnamed: 0,review,label
0,bromwell high is a cartoon comedy . it ran at ...,1
1,story of a man who has unnatural feelings for ...,0
2,homelessness or houselessness as george carli...,1
3,airport starts as a brand new luxury pla...,0
4,brilliant over acting by lesley ann warren . ...,1
...,...,...
24995,i saw descent last night at the stockholm fi...,0
24996,a christmas together actually came before my t...,1
24997,some films that you pick up for a pound turn o...,0
24998,working class romantic drama from director ma...,1


In [8]:
from collections import Counter

# Glue all reviews together and split on whitespace to obtain the full token stream.
all_text = ' '.join(df['review'])
words = all_text.split()
print(f'Total no of words present in the dataset : {len(words)}')
print(f'Some sample words present are : {words[0:10]}')

# Word -> integer lookup: distinct words are ranked from most to least frequent
# (ties keep first-seen order) and numbered from 1, which leaves id 0 unused.
counts = Counter(words)
vocab = [token for token, _ in sorted(counts.items(), key=lambda pair: pair[1], reverse=True)]
vocab_to_int = dict(zip(vocab, range(1, len(vocab) + 1)))

print('')
print('Few Items of the Dictionary are : ')
for key, value in islice(vocab_to_int.items(), 10):
    print(f"{key} : {value}")

print('')
print(f"Dictionary Length : {len(vocab_to_int)}")

Total no of words present in the dataset : 6347388
Some sample words present are : ['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', '.', 'it', 'ran', 'at']

Few Items of the Dictionary are : 
the : 1
. : 2
and : 3
a : 4
of : 5
to : 6
is : 7
br : 8
it : 9
in : 10

Dictionary Length : 74073


For example, `cast : 56` means the word *cast* is assigned the integer id 56. Conceptually this corresponds to a one-hot encoded (OHE) vector of dimension 1 × 74073 in which index 56 is 1 and every other index is 0.


In [9]:
# 80 / 20 random split; the fixed random_state makes the partition repeatable.
train_rows = df.sample(frac=0.8, random_state=200)
held_out_rows = df.drop(index=train_rows.index)

# Renumber both frames from 0 so rows can be addressed as 0 .. len-1.
train_dataset = train_rows.reset_index(drop=True)
test_dataset = held_out_rows.reset_index(drop=True)

print("Full Dataset: {}".format(df.shape))
print("Train Dataset: {}".format(train_dataset.shape))
print("Test Dataset: {}".format(test_dataset.shape))

Full Dataset: (25000, 2)
Train Dataset: (20000, 2)
Test Dataset: (5000, 2)


In [10]:
class CustomDataset(Dataset):
    """Map-style dataset that turns review text into fixed-length token-id tensors.

    Every item is a dict with two entries:
      * 'review': LongTensor of shape (seq_length,). Reviews longer than
        `seq_length` tokens keep their first `seq_length` tokens; shorter ones
        are left-padded with PAD_INDEX.
      * 'label' : scalar FloatTensor holding the row's 'label' value.

    Args:
        df: DataFrame with a 'review' column (text) and a 'label' column.
        vocab_to_int: mapping word -> integer id. Ids are expected to be >= 1 so
            that PAD_INDEX (0) stays free for padding.
        seq_length: number of token ids each review is padded / truncated to.

    Raises:
        ValueError: if `seq_length` is smaller than 1.
    """

    # Id used for left padding. Words that are missing from the vocabulary are
    # mapped to it as well instead of raising KeyError.
    PAD_INDEX = 0

    # Matches <br>, <br/>, <br /> and </br> (any letter case).
    _BR_TAG = re.compile(r"<\s*/?\s*br\s*/?\s*>", flags=re.IGNORECASE)
    _PUNCTUATION = frozenset(string.punctuation)

    def __init__(self, df, vocab_to_int, seq_length=200):
        if seq_length < 1:
            raise ValueError(f"seq_length must be >= 1, got {seq_length}")
        self.df = df
        self.vocab_to_int = vocab_to_int
        self.seq_length = seq_length

    def __len__(self):
        return len(self.df)

    def _clean(self, text):
        """Replace line-break tags with a space, drop punctuation and digits, lowercase."""
        text = self._BR_TAG.sub(" ", text)
        text = "".join(
            char for char in text
            if char not in self._PUNCTUATION and not char.isdigit()
        )
        return text.lower()

    def __getitem__(self, index):
        # .iloc addresses rows by position, so the dataset also works on frames
        # whose index labels are not 0 .. len-1 (e.g. a split that was not reset).
        comment_text = self._clean(str(self.df['review'].iloc[index]))

        # skipping this step but you can add here : removing stopwords and performing stemming

        # converting text to numeric, e.g. [i love dogs] => [1, 76, 99]
        comment_num = [
            self.vocab_to_int.get(word, self.PAD_INDEX)
            for word in comment_text.split()
        ]

        # The network expects a fixed input length: keep the first `seq_length`
        # ids, then left-pad whatever is missing,
        # e.g. [117, 18, 128] => [0, 0, ..., 0, 117, 18, 128]
        comment_num = comment_num[:self.seq_length]
        padding = [self.PAD_INDEX] * (self.seq_length - len(comment_num))
        comment_num = padding + comment_num

        return {
            'review': torch.tensor(comment_num, dtype=torch.long),
            'label': torch.tensor(self.df['label'].iloc[index], dtype=torch.float),
        }


In [11]:
# Number of reviews per mini-batch, shared by both loaders.
BATCH_SIZE = 50

# training: reshuffled every epoch so the batches differ between epochs
training_set = CustomDataset(train_dataset, vocab_to_int)
training_loader = DataLoader(training_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)

# testing: order does not influence the metrics, so keep it fixed; this makes the
# sample printed during evaluation the same on every run
testing_set = CustomDataset(test_dataset, vocab_to_int)
testing_loader = DataLoader(testing_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

# **Modelling**


In [21]:
# Code for above architecture
class Sentiment_RNN(nn.Module):
    """Binary sentiment classifier: embedding -> single-layer RNN -> linear -> sigmoid.

    Args:
        vocab_size: number of rows in the embedding table (largest token id + 1).
        device: stored on `self.device` for backward compatibility. The forward
            pass does not use it; tensors follow the device of the input.
        embedding_dim: size of each learned word vector.
        hidden_dim: size of the RNN hidden state.
    """

    def __init__(self, vocab_size, device, embedding_dim=400, hidden_dim=256):
        super().__init__()
        self.device = device
        # Learned dense lookup table: token id -> embedding_dim-sized vector.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=1, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sig = nn.Sigmoid()

    def forward(self, x):
        """Return one probability per review.

        Args:
            x: LongTensor of token ids with shape (batch, seq_len).

        Returns:
            Tensor of shape (batch,) holding the sigmoid output computed from
            the RNN output at the last time step.
        """
        embeds = self.embedding(x)  # (batch, seq_len, embedding_dim)

        # Without an explicit h0 the RNN starts from an all-zero hidden state
        # created on the same device as its input.
        out, _ = self.rnn(embeds)  # (batch, seq_len, hidden_dim)

        # Only the final time step feeds the classifier, so the linear layer and
        # sigmoid are applied to that step alone rather than to every step.
        last_step = out[:, -1, :]
        return self.sig(self.fc(last_step)).squeeze(-1)

# **Training**

In [22]:
# The embedding table needs one row per vocabulary word plus one extra row for
# id 0, which is used for zero padding.
vocab_size = len(vocab_to_int) + 1
model = Sentiment_RNN(vocab_size=vocab_size, device=device)
model = model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)


In [23]:
N_EPOCHS = 20      # full passes over the training set
PRINT_EVERY = 100  # report the loss every this many batches
CLIP_NORM = 5      # maximum total gradient norm allowed before clipping

# Built once: the loss module holds no per-batch state.
criterion = nn.BCELoss()

model.train()
for e in range(N_EPOCHS):

    for step, data in enumerate(training_loader):

        # zero accumulated gradients
        optimizer.zero_grad()

        # predictions; reshape(-1) keeps the batch dimension even when the batch
        # holds a single review (squeeze() would collapse it to a 0-d tensor and
        # no longer match the target's shape)
        output = model(data['review'].to(device, dtype=torch.long)).reshape(-1)

        # actuals, as floats because BCELoss expects a float target
        target = data['label'].to(device, dtype=torch.float).reshape(-1)

        # calculate the loss and perform backprop
        loss = criterion(output, target)
        loss.backward()

        # clip the total gradient norm to limit exploding gradients in the RNN
        nn.utils.clip_grad_norm_(model.parameters(), CLIP_NORM)
        optimizer.step()

        # loss stats
        if step % PRINT_EVERY == 0:
            print(f"Epoch: {e+1}/{N_EPOCHS}...Loss: {loss.item()}...")

Epoch: 1/20...Loss: 0.7311912178993225...
Epoch: 1/20...Loss: 0.8274558782577515...
Epoch: 1/20...Loss: 0.7003806829452515...
Epoch: 1/20...Loss: 0.7026413083076477...
Epoch: 2/20...Loss: 0.6703734993934631...
Epoch: 2/20...Loss: 0.7358824014663696...
Epoch: 2/20...Loss: 0.5704975724220276...
Epoch: 2/20...Loss: 0.6489903330802917...
Epoch: 3/20...Loss: 0.5804699063301086...
Epoch: 3/20...Loss: 0.5636714100837708...
Epoch: 3/20...Loss: 0.6487445831298828...
Epoch: 3/20...Loss: 0.6269955039024353...
Epoch: 4/20...Loss: 0.5314719080924988...
Epoch: 4/20...Loss: 0.5224661827087402...
Epoch: 4/20...Loss: 0.5748705267906189...
Epoch: 4/20...Loss: 0.5312837958335876...
Epoch: 5/20...Loss: 0.5035082697868347...
Epoch: 5/20...Loss: 0.40971213579177856...
Epoch: 5/20...Loss: 0.5210629105567932...
Epoch: 5/20...Loss: 0.5348144173622131...
Epoch: 6/20...Loss: 0.5625311732292175...
Epoch: 6/20...Loss: 0.4933357834815979...
Epoch: 6/20...Loss: 0.35072243213653564...
Epoch: 6/20...Loss: 0.4630586206

# **Inference**



In [24]:
# Reverse lookup (integer id -> word), built by pairing each id with its word.
int_to_vocab = dict(zip(vocab_to_int.values(), vocab_to_int.keys()))
# id 0 never appears in vocab_to_int; give it a readable placeholder for padding.
int_to_vocab[0] = '<PAD>'

print('')
print('Few Items of the Reverse Dictionary are : ')
for key, value in islice(int_to_vocab.items(), 10):
    print(f"{key} : {value}")

print('')
print(f"Reverse Dictionary Length : {len(int_to_vocab)}")


Few Items of the Reverse Dictionary are : 
1 : the
2 : .
3 : and
4 : a
5 : of
6 : to
7 : is
8 : br
9 : it
10 : in

Reverse Dictionary Length : 74074


In [26]:
# Built once: the loss module holds no per-batch state.
criterion = nn.BCELoss()

test_losses = []   # one mean BCE value per test batch
num_correct = 0    # running count of correctly classified reviews

model.eval()
# No gradients are needed for evaluation; skipping the autograd graph saves
# memory and time.
with torch.no_grad():
    for batch_idx, data in enumerate(testing_loader):

        # predictions; reshape(-1) keeps the batch dimension even for a batch of one
        output = model(data['review'].to(device, dtype=torch.long)).reshape(-1)
        pred = torch.round(output)  # convert output probabilities to predicted class (0 or 1)

        # actuals
        target = data['label'].to(device, dtype=torch.long)

        # Show one worked example, taken from the first batch only. Keying on the
        # batch index (rather than on num_correct still being 0) guarantees it is
        # printed exactly once.
        if batch_idx == 0:
            review_ip = data['review'][0].cpu().numpy()
            print(f"Input Sentence : { ' '.join([ int_to_vocab[i] for i in review_ip ]) }")
            print(f"Predicted : {pred[0]}")
            print(f"Actual : {target[0]}")

        # calculate loss
        test_loss = criterion(output, target.float())
        test_losses.append(test_loss.item())

        # compare predictions to true label
        num_correct += pred.eq(target.view_as(pred)).sum().item()

Input Sentence : considering a star is born had been made twice already by the time the film came into production the latest remake has a freshness about it that can be attributed to the fantastic chemistry between the entire acting ensemble a viewer could be forgiven for believing that kris kristofferson barbara streisand were a couple off screen as well as on with their incredible displays of pure affection towards one another br br the film has been described in the past as a barbara streisand concert on film set to a soap opera storyline however for anyone that enjoys watching a film that takes you beyond the living room into a world where the characters seem truly alive a star is born is well worth the hiring price br br with its incredible soundtrack flawless acting and touching reality in regards to human emotions and the true frailty of life a star is born is a film that draws you into the world of esther hoffman the love of her life john norman howard br br a film for anyone t

In [27]:
# Average of the per-batch losses collected during evaluation.
mean_test_loss = np.mean(test_losses)
print("Test loss: {:.3f}".format(mean_test_loss))

Test loss: 0.758


In [28]:
# Accuracy = correctly classified reviews / total number of reviews in the test set.
n_test_reviews = len(testing_loader.dataset)
test_acc = num_correct / n_test_reviews
print("Test accuracy: {:.3f}".format(test_acc))

Test accuracy: 0.703
