# IMDB Training Steps

## Dataset Splitting

In [3]:
data_folder = '/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv'
df = pd.read_csv(data_folder)

from sklearn.model_selection import train_test_split

# Create a mapping dictionary
label_mapping = {'positive': 1, 'negative': 0}

# Convert labels using the mapping dictionary
df['sentiment'] = df['sentiment'].map(label_mapping)

# Split the data into train and test sets
train_df, test_df = train_test_split(df, test_size=0.5, random_state=1234)


# Example usage
print("Train set size:", len(train_df))
print("Test set size:", len(test_df))



Train set size: 25000
Test set size: 25000


In [4]:
train_df.head()

Unnamed: 0,review,sentiment
23420,...And that's why hard to rate. <br /><br />Fr...,0
43821,Some have praised _Atlantis:_The_Lost_Empire_ ...,0
21387,This film says everything there is to say abou...,1
17127,"Last time I checked, the Nazis didn't win the ...",0
3642,I wish Depardieu had been able to finish his b...,0


## Data preprocessing 

In [5]:
from nltk.corpus import stopwords
import nltk 
stop_words = set(stopwords.words('english'))

In [17]:
import re
import string
from string import digits
from collections import Counter
from torchtext.data.utils import get_tokenizer

# Define the tokenizer
tokenizer = get_tokenizer('basic_english')


def stringprocess(text):
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = re.sub(r"\#", "", text)
    text = re.sub(r"http\S+","URL", text)
    text = re.sub(r"@", "", text)
    text = re.sub(r"[^A-Za-z0-9()!?\'\`\"]", " ", text)
    text = re.sub("\s{2,}", " ", text)
    text = text.strip(' ')
    text = text.lower()
    
    return text

def tokenprocess(text):
    text_tokens = tokenizer(text)
    # Filter tokens based on their frequency
    filtered_tokens = [token for token in text_tokens if token not in stop_words]
    return filtered_tokens


## Define vocabulary from text

In [18]:
import matplotlib.pyplot as plt



X = df["review"]

X = X.apply(stringprocess)
word_tokens = list(X.apply(tokenprocess))

word_tokens_flat = [item for sublist in word_tokens for item in sublist]

# Collect unique tokens from the dataset
vocab = set()
for data_point in word_tokens:
    vocab.update(data_point)

# Step 1: Determine word frequencies
word_frequency = {}
for word in word_tokens_flat:
    if word in word_frequency:
        word_frequency[word] += 1
    else:
        word_frequency[word] = 1

# Step 2: Define threshold frequency
threshold = 4

# Step 3: Create filtered list
vocab = [word for word in vocab if word_frequency[word] >= threshold]

# Convert the set of unique tokens to a list
vocab = list(vocab)
vocab = ['<pad>'] + vocab 

print(len(vocab))

# Example usage: Print the vocabulary
print(vocab[:50])

# Count the number of tokens per data point
token_counts = []
for data_point in word_tokens:
    token_count = len(data_point)
    token_counts.append(token_count)

44127
['<pad>', 'dubbed', 'webcam', 'edgerton', 'incredible', 'unbelievably', 'virtual', 'coaches', 'inauspicious', 'mothering', 'hg', 'bourdelle', 'wackos', 'stamina', 'mood', 'career', 'eyecandy', 'grins', 'goffin', 'prejudiced', 'absences', 'knb', 'irreconcilable', 'rescuer', 'hushed', 'maltese', 'modes', 'wield', 'hoey', 'gulch', 'bulge', 'superdome', 'condensing', 'ij', 'fruity', 'indecisive', 'oiran', 'basing', 'nellie', 'propositions', 'gooey', 'masterful', 'cheesecake', 'neat', 'roxane', 'wobble', 'rewatched', 'loveable', 'tanisha', 'willaim']


In [19]:
len(vocab)

44127

In [20]:
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, df, tokenizer, vocab, max_length=500):
        self.data = df['review']
        self.targets = df['sentiment']
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.vocab_dict = {token: index for index, token in enumerate(vocab)} 

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # Get the data and target for the given index
        data_point = self.data.iloc[index]
        data_point = stringprocess(data_point)
        word_tokens = tokenprocess(data_point)
        target = self.targets.iloc[index]

        # Truncate the data point to the specified max length
        truncated_data = word_tokens[:self.max_length]
        data_ids = [self.vocab_dict[word] for word in truncated_data if self.vocab_dict.get(word) is not None] 

        return torch.tensor(data_ids), target

## Using a pretrained embedding 

Description of how GloVe works:

GloVe (for Global Vector) takes a word and turns it into a vector. The main
idea is that the distance between vectors that have close semantical meaning
should be small. Now two words can be considered to be close in some aspect
but very distant in some other. Consider for example the words 'men and
'woman'. These two words can be considered close because they both describe a
human being but can also be considered far from one another because they 
describe people of opposite sex. Therefore it seems that the right measure 
of semantical closedness should not be one-dimensional but instead multi-
dimensional. GloVe takes that into account and consider the right notion of distance
to be the difference between the two vectors: V(i) - V(j).

Let's define a co-occurence matrix whose entry X_{ij} gives the number of times
the j-word appears in the context window of i. Then Sum_j X_{ij} is the number
times any word apears in the context window of i. With these we can define a 
probability P(j|i) = X_{ij}/Sum_{j} X_{ij} which is the probability of finding 
the word j in the context window of i.

Now an observation is that the P(k|i)/P(k|j) is large if i is closely related to k
AND j is not related to k. For example, let's consider the words i = ice, 
j = steam and k = solid. Solid is related to ice but not to steam therefore in that
case P(k|i)/P(k|j) is large. Another example is i = ice, j = steam and k = water. In
that case water is both related to ice and steam and P(k|i)/P(k|j) will be of
order one.

The goal of the training is to obtain a set of vectors that satisfy the following
property:
    V(i).transpose.V(j) = logP(i|j)
Why? Because then, we can compute the projection of the distance in the direction
of the word k:
    V(k).transpose (V(i) - V(j)) = logP(k|i)/P(k|j)
When k is related to i but not to j, the argument of the log is large, when it's the 
opposite the argument is close to zero. Therefore in both those cases, the absolute
value of the log is large. Now if k is related to both i and j, the argument of the log
is close to one and the distance in that direction if small (think about the water
example). Note that if k = fashion, the distance in that direction will be small too
since fashion has nothing to do with both ice and steam.

In [21]:
from torchtext.vocab import GloVe
# # Load GloVe embeddings
# # Load a subset of GloVe embeddings
glove = GloVe(name='6B', dim=300)

# # Create a matrix to store GloVe embeddings
embedding_matrix = np.zeros((len(vocab), 300))

# # Fill the embedding matrix
for i, token in enumerate(vocab):
    embedding_matrix[i] = glove[token]

np.save('embeddings.npy', embedding_matrix)

In [22]:
embedding_matrix = np.load('/kaggle/working/embeddings.npy')

In [23]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import time

def collate_fn(batch):
    # Sort the batch in descending order of input sequence lengths
    batch = sorted(batch, key=lambda x: len(x[0]), reverse=True)
    
    # Separate inputs and labels
    inputs, labels = zip(*batch)
    
    # Get the lengths of each input sequence
    input_lengths = [len(x) for x in inputs]
    
    # Pad the input sequences to the length of the longest sequence
    padded_inputs = pad_sequence(inputs, batch_first=True)

    return padded_inputs, torch.tensor(labels, dtype=torch.float32), input_lengths
    


train_dataset = CustomDataset(train_df, tokenizer, vocab)
test_dataset = CustomDataset(test_df, tokenizer, vocab)


# Create a DataLoader for batching and shuffling
batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=False)

In [24]:
 vocab.index('<pad>')

0

In [25]:
batch = next(iter(train_dataloader))
sentence, label, seq_lengths = batch
print(label)
# sentence = torch.tensor(sentence)
print(sentence.shape)

tensor([1., 1., 0., 1., 0., 1., 1., 0., 1., 0., 1., 0., 1., 0., 0., 1., 1., 1.,
        1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1.])
torch.Size([32, 320])


## Using LSTM

Since the IMDb dataset is not very big, we can use text representations that were pretrained on large-scale corpora to reduce overfitting. We will represent each otken using the pretrained GloVe model, and feed these token representations into a multilayer bidirectional LSTM to obtain the text sequence representation, which will be transformed into sentiment analysis outputs. 

In text classification tasks, a varying-length text sequence will be transformed into fixed-length categories. In the following BiLSTM class, while each token of a text sequence gets its individual pretrained GloVe representation via the embedding layer (`self.embedding`, the entire sequence is encoded by a directional LSTM (`self.encoder`). More concretely, the hidden states (at the last layer) of the bidirectional LSTM at both the initial and final time steps are concatenated as the representation of the text sequence. This single text representation is then transformed into output categories by a fully connected layer (`self.decoder`) with two outputs ("positive" and "negative").

In [49]:
def masked_softmax(attn_odds, masks) :
    attentions = torch.softmax(F.relu(attn_odds.squeeze()), dim=-1)
    # create mask based on the sentence lengths
   
    # apply mask and renormalize attention scores (weights)
    masked = attn_odds * masks
    _sums = masked.sum(-1).unsqueeze(-1)  # sums per row

    attn_odds = masked.div(_sums)
    return attn_odds

In [28]:
import torch.nn as nn

In [70]:
class TanhAttention(nn.Module):
    def __init__(self, hidden_size) :
        super().__init__()
        self.attn1 = nn.Linear(hidden_size, hidden_size // 2)
        self.attn2 = nn.Linear(hidden_size // 2, 1, bias=False)
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    def forward(self, hidden, lengths):
        #input_seq = (B, L), hidden : (B, L, H), masks : (B, L)
        max_len = hidden.shape[1]
        attn1 = nn.Tanh()(self.attn1(hidden))
        attn2 = self.attn2(attn1).squeeze(-1)
        masks = torch.ones(attn2.size(), requires_grad=False).to(self.device)
        for i, l in enumerate(lengths):  # skip the first sentence
            if l < max_len:
                masks[i, l:] = 0
        
                
        attn = masked_softmax(attn2, masks)
        # apply attention weights
        weighted = torch.mul(hidden, attn.unsqueeze(-1).expand_as(hidden))

        # get the final fixed vector representations of the sentences
        representations = weighted.sum(1).squeeze()

        return representations, attn

In [71]:
import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
class AttentionLSTM(nn.Module):
    def __init__(
        self,
        vocab_size,
        emb_dim,
        hidden_size,
        num_classes,
        dropout = 0.4,
        lstm_layer = 2

    ):
        super(AttentionLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.dropout = nn.Dropout(p=dropout)

        self.lstm = nn.LSTM(input_size = emb_dim, hidden_size = hidden_size, bidirectional = True)
        self.attention = TanhAttention(hidden_size = hidden_size*2)
        self.fc1 = nn.Sequential(nn.Linear(hidden_size*lstm_layer, hidden_size*lstm_layer),
                                 nn.BatchNorm1d(hidden_size*lstm_layer),
                                 nn.ReLU())
        self.fc2 = nn.Linear(hidden_size*lstm_layer, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, x_len):
        x = self.embedding(x)
        x = self.dropout(x)
        x = nn.utils.rnn.pack_padded_sequence(x, x_len, batch_first=True, enforce_sorted=False)
        out1, (h_n, c_n) = self.lstm(x)
        x, lengths = nn.utils.rnn.pad_packed_sequence(out1, batch_first=True)
        x, _ = self.attention(x, lengths) # skip connect


        y = self.fc1(self.dropout(x))
        y = self.fc2(self.dropout(y))
        y = self.sigmoid(y.squeeze())
        return y

    def atten_forward(self, x, x_len):
        x = self.embedding(x)
        x = self.dropout(x)
        x = nn.utils.rnn.pack_padded_sequence(x, x_len, batch_first=True, enforce_sorted=False)
        out1, (h_n, c_n) = self.lstm(x)
        x, lengths = nn.utils.rnn.pad_packed_sequence(out1, batch_first=True)
        x, _ = self.attention(x, lengths) # skip connect
        return x

In [73]:
embed_size, num_hiddens, num_layers, device = 300, 128, 1, torch.device("cuda" if torch.cuda.is_available() else "cpu")

net = AttentionLSTM(
        vocab_size = len(vocab), 
        emb_dim = embed_size,
        hidden_size = num_hiddens,
        num_classes = 1,
        dropout = 0.4,
)

net.to(device)

def init_weights(module):
    if type(module) == nn.Linear:
        nn.init.xavier_uniform_(module.weight)
    if type(module) == nn.LSTM:
        for param in module._flat_weights_names:
            if "weight" in param:
                nn.init.xavier_uniform_(module._parameters[param])
net.apply(init_weights)

AttentionLSTM(
  (embedding): Embedding(44127, 300)
  (dropout): Dropout(p=0.4, inplace=False)
  (lstm): LSTM(300, 128, bidirectional=True)
  (attention): TanhAttention(
    (attn1): Linear(in_features=256, out_features=128, bias=True)
    (attn2): Linear(in_features=128, out_features=1, bias=False)
  )
  (fc1): Sequential(
    (0): Linear(in_features=256, out_features=256, bias=True)
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
  )
  (fc2): Linear(in_features=256, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

Load GloVe embeddings 

In [74]:
net.embedding.weight.data.copy_(torch.tensor(embedding_matrix).to(device))
net.embedding.weight.requires_grad = False

lr, num_epochs = 0.01, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
loss = nn.BCELoss()

In [76]:
import numpy as np
from torch.nn import functional as F

def compute_acc(preds, labels):
    correct = sum((preds>0.5) == labels)
    acc = float(correct) / float(len(labels.data)) * 100.0
    return acc

In [77]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [58]:
next(net.parameters()).is_cuda

True

## Train and Eval

In [78]:
def train(model, optimizer, num_epochs, train_dataloader, val_dataloader, device, loss, show_every, Bert=False):
    
    train_losses = []
    train_accuracies = []
    val_losses = []
    val_accuracies = []


    # For each epoch...
    for epoch_i in range(0, num_epochs):

        store_train_loss = []
        store_train_acc = []
        store_val_loss = []
        store_val_acc = []

         # ========================================
        #               Training
        # ========================================

        # Perform one full pass over the training set.
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, num_epochs))
        print('Training...')
        # Measure how long the training epoch takes.
        t0 = time.time()
        # Reset the total loss for this epoch.
        total_loss = 0

        model.train()
    

        for i, batch in enumerate(train_dataloader):
            
            # Add batch to GPU
            temp_batch = []
            for t in batch:
                if isinstance(t, torch.Tensor):
                    t = t.to(device)
                temp_batch.append(t)
            batch = temp_batch
            if Bert: # if we're using the Bert model, see later 
                inputs_ids = batch[0]
                attention_masks = batch[1]
                labels = batch[2].squeeze()
            else:
                inputs_ids = batch[0]
                labels = batch[1].squeeze()
                seq_lengths = batch[2]
            
            optimizer.zero_grad()
            
            if Bert:
                # Perform a forward pass (evaluate the model on this training batch).
                outputs = model(inputs_ids, 
                            attention_mask=attention_masks)[0].squeeze()
            else:
                outputs = model(inputs_ids, seq_lengths)
                #print(outputs)
            outputs = outputs.squeeze()
            train_loss = loss(outputs, labels)
            train_acc = compute_acc(outputs, labels)
            
            store_train_loss.append(train_loss.item())
            store_train_acc.append(train_acc)
        
            train_loss.backward()
            optimizer.step()
        
            # Progress update every x batches.
            if i % show_every == 0 and not i == 0:
                # Calculate elapsed time in minutes.
                elapsed = format_time(time.time() - t0)
                # Report progress.
                print('  Batch {} / {}.'.format(i, len(train_dataloader)))
                print('Training loss: %.3f  Training acc: %.3f'%(np.mean(store_train_loss[-show_every:]), np.mean(store_train_acc[-show_every:])) ) 
                
        # compute epoch loss and accuracy 
        train_losses.append(np.mean(store_train_loss))
        train_accuracies.append(np.mean(store_train_acc))

        print("  Training epoch took: {:}".format(format_time(time.time() - t0)))



        # ========================================
        #               Validation
        # ========================================
        # After the completion of each training epoch, measure our performance on
        # our validation set.

        print("")
        print("Running Validation...")

        t0 = time.time()


        # Put the model in evaluation mode--the dropout layers behave differently
        # during evaluation.
        model.eval()

        # Evaluate data for one epoch
        for batch in val_dataloader:

            # Add batch to GPU
            temp_batch = []
            for t in batch:
                if isinstance(t, torch.Tensor):
                    t = t.to(device)
                temp_batch.append(t)
            batch = temp_batch
                
            if Bert: # if we're using the Bert model, see later 
                inputs_ids = batch[0]
                attention_masks = batch[1]
                labels = batch[2].squeeze()
            else:
                inputs_ids = batch[0]
                labels = batch[1].squeeze()
                seq_lengths = batch[2]
            with torch.no_grad():

                if Bert:
                    # Perform a forward pass (evaluate the model on this training batch).
                    outputs = model(inputs_ids, 
                                attention_mask=attention_masks)[0].squeeze()
                else:
                    outputs = model(inputs_ids, seq_lengths)
            
                
            outputs = outputs.squeeze()    
            val_loss = loss(outputs, labels)
            val_acc = compute_acc(outputs, labels)

            store_val_loss.append(val_loss.item())
            store_val_acc.append(val_acc)
            

        # compute epoch loss and accuracy 
        mean_val_loss = np.mean(store_val_loss)
        val_losses.append(mean_val_loss)
        val_accuracies.append(np.mean(store_val_acc))

        # Report the final accuracy for this validation run.
        # Print loss and acc at the end of the epoch
        print("Epoch {}: Train Loss: {:.4f}, Validation Loss: {:.4f}, Train Accuracy: {:.2f}%, Validation Accuracy: {:.2f}%".format
        (epoch_i+1, train_losses[-1], val_losses[-1], train_accuracies[-1], val_accuracies[-1]))
        
    return train_losses, val_losses, train_accuracies, val_accuracies



In [79]:
train_losses, test_losses, train_accuracies, test_accuracies = train(net, optimizer, 8, train_dataloader, 
                                          test_dataloader, device, loss, show_every=200, Bert=False)



Training...
  Batch 200 / 782.
Training loss: 0.591  Training acc: 70.594
  Batch 400 / 782.
Training loss: 0.418  Training acc: 81.922
  Batch 600 / 782.
Training loss: 0.473  Training acc: 77.797
  Training epoch took: 0:00:43

Running Validation...
Epoch 1: Train Loss: 0.4769, Validation Loss: 0.3589, Train Accuracy: 77.96%, Validation Accuracy: 84.84%

Training...
  Batch 200 / 782.
Training loss: 0.395  Training acc: 82.781
  Batch 400 / 782.
Training loss: 0.384  Training acc: 84.094
  Batch 600 / 782.
Training loss: 0.402  Training acc: 82.984
  Training epoch took: 0:00:43

Running Validation...
Epoch 2: Train Loss: 0.3861, Validation Loss: 0.3268, Train Accuracy: 83.60%, Validation Accuracy: 86.19%

Training...
  Batch 200 / 782.
Training loss: 0.357  Training acc: 84.891
  Batch 400 / 782.
Training loss: 0.366  Training acc: 84.672
  Batch 600 / 782.
Training loss: 0.352  Training acc: 85.047
  Training epoch took: 0:00:42

Running Validation...
Epoch 3: Train Loss: 0.3558, 

In [80]:
torch.save(net.state_dict(), 'imdb_bilstm_tanh_attention_glove_300d.pt')

In [None]:
import matplotlib.pyplot as plt

def plot_loss_acc(train_losses, val_losses, train_accuracies, val_accuracies):
    fig, axs = plt.subplots(1, 2, figsize=(10, 5))
    axs[0].plot(train_losses, label='Train Loss')
    axs[0].plot(val_losses, label='Validation Loss')
    axs[0].set_title("Losses over Epochs")
    axs[0].set_xlabel("Epoch")
    axs[0].set_ylabel("Loss")
    axs[0].legend()
    
    axs[1].plot(train_accuracies, label='Train Accuracy')
    axs[1].plot(val_accuracies, label='Validation Accuracy')
    axs[1].set_title("Accuracies over Epochs")
    axs[1].set_xlabel("Epoch")
    axs[1].set_ylabel("Accuracy")
    axs[1].set_ylim((50,100))
    axs[1].legend()
    
    plt.tight_layout()
#     plt.savefig(path)

In [None]:
plot_loss_acc(train_losses, test_losses, train_accuracies, test_accuracies)

In [None]:
# def predict_sentiment(net, vocab, sequence):
#     sequence = tokenizer(sequence)
#     vocab_dict = {token: index for index, token in enumerate(vocab)}
#     sequence = [vocab_dict[word] for word in sequence] 
    
#     sequence = torch.tensor([sequence], device=device)
#     output = net(sequence)
#     return 'positive' if output > 0.5 else 'negative'

In [None]:
# predict_sentiment(net, vocab, 'I had a great time')