[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/khetansarvesh/NLP/blob/main/unitask_downstream_nlp/Sentence-Level-Classification/Joint_Training_Movie_Review_Classification.ipynb)

In [34]:
import numpy as np
import pandas as pd
import string
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS # or use from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
device

'cuda'

# **Dataset**
The dataset used in this example is [SST2](https://nlp.stanford.edu/sentiment/index.html), which contains sentences from movie reviews, each labeled as either positive (label `1`) or negative (label `0`).

In [None]:
#downloading the dataset
!wget https://github.com/khetansarvesh/NLP/blob/main/Sentence-Level-Classification/SST_Dataset.csv

In [35]:
# Load the reviews CSV and discard every row that contains a missing value.
df = pd.read_csv("SST_Dataset.csv").dropna()
df

Unnamed: 0,review,label
0,bromwell high is a cartoon comedy . it ran at ...,1
1,story of a man who has unnatural feelings for ...,0
2,homelessness or houselessness as george carli...,1
3,airport starts as a brand new luxury pla...,0
4,brilliant over acting by lesley ann warren . ...,1
...,...,...
24995,i saw descent last night at the stockholm fi...,0
24996,a christmas together actually came before my t...,1
24997,some films that you pick up for a pound turn o...,0
24998,working class romantic drama from director ma...,1


In [36]:
# Build the vocabulary from every whitespace-separated token in the reviews.
from collections import Counter

all_text = ' '.join(df['review'])
words = all_text.split()
print(f'Total no of words present in the dataset : {len(words)}')
print(f'Some sample words present are : {words[0:10]}')

# Map each word to an integer id, most frequent word first.
# Ids start at 1, which leaves 0 free for the padding value used by the dataset class.
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}

# Show only a short preview: printing the full mapping would flood the cell
# output with one entry per vocabulary word.
print(dict(list(vocab_to_int.items())[:10]))
len(vocab_to_int)

Total no of words present in the dataset : 6347388
Some sample words present are : ['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', '.', 'it', 'ran', 'at']


74073

For example, `cast : 56` means that the word "cast" is represented as a one-hot encoded (OHE) vector of dimension 1 × 74073 in which the entry at index 56 is 1 and every other entry is 0.


In [37]:
# 80/20 train/test split: draw the training rows with a fixed seed, then use
# every row that was not drawn as the test set. Both frames get a fresh
# 0..n-1 index.
train_rows = df.sample(frac=0.8, random_state=200)
test_dataset = df.drop(train_rows.index).reset_index(drop=True)
train_dataset = train_rows.reset_index(drop=True)

print("Full Dataset: {}".format(df.shape))
print("Train Dataset: {}".format(train_dataset.shape))
print("Test Dataset: {}".format(test_dataset.shape))

Full Dataset: (25000, 2)
Train Dataset: (20000, 2)
Test Dataset: (5000, 2)


In [73]:
class CustomDataset(Dataset):
    """Torch dataset that turns review text into fixed-length sequences of word ids.

    Each item is a dict with two entries:
      * 'review': LongTensor of shape (max_len,), left-padded with 0 or truncated.
      * 'label' : scalar FloatTensor holding the row's 'label' value.

    Args:
        df: DataFrame with a 'review' column (text) and a 'label' column.
        vocab_to_int: mapping from word to integer id.
        max_len: length every encoded review is padded or truncated to.
        oov_index: id assigned to words that are missing from ``vocab_to_int``.
            Defaults to 0, i.e. unknown words are treated like padding.
    """

    # Matches <br>, <br/>, <br /> and </br> so the tag is dropped as a whole
    # before punctuation stripping would otherwise leave a stray "br" token.
    _BR_TAG = re.compile(r"<\s*/?\s*br\s*/?\s*>", flags=re.IGNORECASE)

    def __init__(self, df, vocab_to_int, max_len=200, oov_index=0):
        self.df = df
        self.vocab_to_int = vocab_to_int
        self.max_len = max_len
        self.oov_index = oov_index

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # .iloc is positional, so this works even when the frame's index is
        # not a clean 0..n-1 range (e.g. reset_index was forgotten after a split).
        comment_text = str(self.df['review'].iloc[index])

        # --- text clean-up ---
        comment_text = self._BR_TAG.sub(" ", comment_text)  # drop HTML line-break tags
        comment_text = "".join(
            char for char in comment_text
            if char not in string.punctuation and not char.isdigit()
        )  # remove punctuation and digits
        comment_text = comment_text.lower()

        # (optional, not done here: stop-word removal and stemming)

        # --- text -> ids, e.g. [i love dogs] => [1, 76, 99] ---
        # Stripping digits/punctuation can create tokens that never appeared in
        # the vocabulary; map those to oov_index instead of raising KeyError.
        comment_num = [
            self.vocab_to_int.get(word, self.oov_index)
            for word in comment_text.split()
        ]

        # --- force a fixed length so reviews can be stacked into a batch ---
        # Too long: keep only the first max_len ids.
        comment_num = comment_num[:self.max_len]
        # Too short: left-pad with zeros,
        # e.g. ['best', 'movie', 'ever'] => [117, 18, 128] => [0, 0, ..., 0, 117, 18, 128]
        padding = [0] * (self.max_len - len(comment_num))
        comment_num = padding + comment_num

        return {
            'review': torch.tensor(comment_num, dtype=torch.long),
            'label': torch.tensor(self.df['label'].iloc[index], dtype=torch.float),
        }


In [74]:
# training: reshuffle the examples every epoch
training_set = CustomDataset(train_dataset, vocab_to_int)
training_loader = DataLoader(training_set, batch_size=50, shuffle=True, num_workers=0)

# testing: keep a fixed order. Shuffling does not help evaluation, and it made
# the sample review printed during inference change from run to run.
testing_set = CustomDataset(test_dataset, vocab_to_int)
testing_loader = DataLoader(testing_set, batch_size=50, shuffle=False, num_workers=0)

# **Modelling**


In [84]:
class Sentiment_Stacked_LSTM_RNN(nn.Module):
  """Stacked recurrent binary sentiment classifier.

  Note: despite the class name, the recurrent layer is ``nn.RNN`` (a vanilla
  RNN), not an LSTM.

  Pipeline: word ids -> learned dense embeddings -> stacked RNN -> linear layer
  on the final time step -> sigmoid probability.

  Args:
    vocab_size: number of rows in the embedding table (word ids must be < vocab_size).
    device: device identifier, stored on the instance as ``self.device``.
    embedding_dim: size of each word embedding.
    hidden_dim: size of the RNN hidden state.
    n_layers: number of stacked RNN layers.
    dropout: dropout probability passed to ``nn.RNN`` (applied between layers).
  """

  def __init__(self, vocab_size, device, embedding_dim=400, hidden_dim=256, n_layers=2, dropout=0.5):
    super().__init__()
    self.device = device
    self.hidden_dim = hidden_dim
    self.n_layers = n_layers

    # Learned dense lookup table: word id -> embedding_dim-sized vector.
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.rnn = nn.RNN(
        input_size=embedding_dim,
        hidden_size=hidden_dim,
        num_layers=n_layers,
        dropout=dropout,
        batch_first=True,
        bidirectional=False,
    )
    self.fc = nn.Linear(hidden_dim, 1)
    self.sig = nn.Sigmoid()

  def forward(self, x):
    """Return one probability per example.

    Args:
      x: LongTensor of word ids with shape (batch, seq_len).

    Returns:
      Tensor of shape (batch,) with values in (0, 1).
    """
    embeds = self.embedding(x)                  # (batch, seq_len, embedding_dim)
    hidden = self.init_hidden(x.size(0))        # fresh zero state for every batch

    out, _ = self.rnn(embeds, hidden)           # (batch, seq_len, hidden_dim)

    # Only the last time step is used for the prediction, so apply the linear
    # layer and sigmoid to that step alone rather than to every step.
    last_step = out[:, -1, :]                   # (batch, hidden_dim)
    return self.sig(self.fc(last_step)).squeeze(-1)

  def init_hidden(self, batch_size=50):
    """Create a zero initial hidden state of shape (n_layers, batch_size, hidden_dim).

    The tensor takes its dtype and device from the model's own parameters, so
    it does not depend on any module-level ``device`` variable.
    """
    reference = next(self.parameters())
    return torch.zeros(
        self.n_layers, batch_size, self.hidden_dim,
        dtype=reference.dtype, device=reference.device,
    )


# **Training**

In [85]:
# The embedding table needs one row per vocabulary word plus one extra row for
# id 0, which the dataset uses as padding.
model = Sentiment_Stacked_LSTM_RNN(
    vocab_size=1 + len(vocab_to_int),
    device=device,
)
model = model.to(device)

# Adam over all model parameters.
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)


In [88]:
# Train the classifier with binary cross-entropy.
N_EPOCHS = 20      # single source of truth for the loop and the progress message
LOG_EVERY = 100    # print the loss every LOG_EVERY batches
MAX_GRAD_NORM = 5  # upper bound for the total gradient norm

criterion = nn.BCELoss()  # built once instead of once per batch

model.train()
for epoch in range(N_EPOCHS):

  for step, data in enumerate(training_loader):

    # zero accumulated gradients
    optimizer.zero_grad()

    # predictions: flattened explicitly so a batch holding a single example
    # stays 1-D (a bare .squeeze() would collapse it to a 0-d tensor)
    output = model(data['review'].to(device, dtype=torch.long)).reshape(-1)

    # actuals: BCELoss expects float targets
    target = data['label'].to(device, dtype=torch.float).reshape(-1)

    # calculate the loss and perform backprop
    loss = criterion(output, target)
    loss.backward()

    # clip_grad_norm_ rescales the gradients so that their total norm is at
    # most MAX_GRAD_NORM, which guards against exploding gradients in RNNs.
    nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
    optimizer.step()

    # loss stats
    if step % LOG_EVERY == 0:
      print(f"Epoch: {epoch+1}/{N_EPOCHS}...Loss: {loss.item()}...")

Epoch: 1/4...Loss: 0.47167059779167175...
Epoch: 1/4...Loss: 0.6024243235588074...
Epoch: 1/4...Loss: 0.46080267429351807...
Epoch: 1/4...Loss: 0.4530892074108124...
Epoch: 2/4...Loss: 0.4285443127155304...
Epoch: 2/4...Loss: 0.5140004754066467...
Epoch: 2/4...Loss: 0.3275683522224426...
Epoch: 2/4...Loss: 0.4391115605831146...
Epoch: 3/4...Loss: 0.45185285806655884...
Epoch: 3/4...Loss: 0.5645403265953064...
Epoch: 3/4...Loss: 0.32425788044929504...
Epoch: 3/4...Loss: 0.37441810965538025...
Epoch: 4/4...Loss: 0.24060837924480438...
Epoch: 4/4...Loss: 0.44985976815223694...
Epoch: 4/4...Loss: 0.47311240434646606...
Epoch: 4/4...Loss: 0.35741063952445984...
Epoch: 5/4...Loss: 0.38572928309440613...
Epoch: 5/4...Loss: 0.3600328862667084...
Epoch: 5/4...Loss: 0.4831101894378662...
Epoch: 5/4...Loss: 0.2978978157043457...
Epoch: 6/4...Loss: 0.4402378797531128...
Epoch: 6/4...Loss: 0.2810681462287903...
Epoch: 6/4...Loss: 0.36748987436294556...
Epoch: 6/4...Loss: 0.2686564326286316...
Epoch

# **Inference**



In [109]:
# Reverse lookup (id -> word), used to turn an encoded review back into text.
int_to_vocab = {value: key for key, value in vocab_to_int.items()}
int_to_vocab[0] = '<PAD>'  # id 0 is the padding value, not a real word

# Show only the first few ids; printing the whole mapping would flood the
# output with one entry per vocabulary word.
print({idx: int_to_vocab[idx] for idx in range(min(10, len(int_to_vocab)))})



In [113]:
# Evaluate on the test set: collect the per-batch BCE loss and count correct predictions.
test_losses = []
num_correct = 0

model.eval()
with torch.no_grad():  # no gradients are needed for evaluation; saves memory and time
    for batch_idx, data in enumerate(testing_loader):

        # predictions: flattened explicitly so a batch of one example stays 1-D
        output = model(data['review'].to(device, dtype=torch.long)).reshape(-1)
        pred = torch.round(output)  # convert output probabilities to predicted class (0 or 1)

        # actuals
        target = data['label'].to(device, dtype=torch.long).reshape(-1)

        # Show one decoded example, from the first batch only. Keying this on
        # the batch index (rather than on num_correct == 0) guarantees it is
        # printed exactly once.
        if batch_idx == 0:
            review_ip = np.squeeze(data['review'][0].cpu().numpy())
            print(f"Input Sentence : { ' '.join([ int_to_vocab[i] for i in review_ip ]) }")
            print(f"Predicted : {pred[0]}")
            print(f"Actual : {target[0]}")

        # calculate loss
        test_loss = nn.BCELoss()(output, target.float())
        test_losses.append(test_loss.item())

        # compare predictions to true label
        correct_tensor = pred.eq(target.view_as(pred))
        num_correct += correct_tensor.sum().item()

Input Sentence : <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> for those of you who think anime is just about giant reptiles raping schoolgirls think again there is a totally different side to the japanese animation yakitate japan is one of those shows it is a sweet natured tale of a young boy with the gift to make delicious bread his universe is all about creating a japanese bread that can match with the famous european breads the show is as wacky as they come and i m sure that non japanese viewers will miss a lot of the jokes but it is still very nice to watch because of the complete innocent vibe of the show br br in the world of yakitate it is not uncommon for people to look like they ve just had an orgasm after eating bread the bread is hallucinating and can give the consumer a wide array of super powers from time traveling to swimming like a fish that weird aspect makes it into

In [114]:
# Report the mean of the per-batch losses collected during evaluation.
print("Test loss: {:.3f}".format(np.asarray(test_losses).mean()))

Test loss: 1.050


In [115]:
# Accuracy = correctly classified reviews / total number of test reviews.
n_test_examples = len(testing_loader.dataset)
test_acc = num_correct / n_test_examples
print("Test accuracy: {:.3f}".format(test_acc))

Test accuracy: 0.677
