# IMDB sentiment analysis with RNNs

Kaggle: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from spellchecker import SpellChecker
from tqdm import tqdm
# allows to have a progress bar in pandas, useful for long processing operations
tqdm.pandas()
from collections import Counter
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader

Read the dataset and observe the first 5 rows.

In [None]:
# Load the Kaggle CSV; the path is relative to the notebook's working directory.
data = pd.read_csv('IMDB Dataset.csv')
# Show the first rows as the cell output.
data.head()

Lucky us, the dataset is well-balanced.

In [None]:
# Number of reviews per sentiment class.
data.sentiment.value_counts()

Transform the labels to 0 and 1.

In [None]:
def transform_label(label):
    """Encode a sentiment string as an integer class id.

    Returns 1 when `label` equals 'positive' and 0 for any other value.
    """
    if label == 'positive':
        return 1
    return 0


# Add an integer `label` column computed from `sentiment` with transform_label.
data['label'] = data['sentiment'].progress_apply(transform_label)

## Preprocessing

- In classic NLP, the text is often preprocessed to remove tokens that might confuse the classifier
- Below you can find some examples of possible preprocessing techniques
- Feel free to modify them to improve the results of your classifier

In [6]:
# Fetch the NLTK resources used by the preprocessing functions below.
# 'punkt_tab' is requested in addition to 'punkt' because recent NLTK releases
# load the tokenizer models for `word_tokenize` from that package.
for resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab', 'omw-1.4'):
    nltk.download(resource)

# Read the stop-word list through `nltk.corpus` instead of the imported
# `stopwords` name: the assignment below rebinds `stopwords` to a set, so
# calling `stopwords.words(...)` failed with AttributeError whenever this cell
# was executed a second time. Going through the module keeps the cell re-runnable.
stopwords = set(nltk.corpus.stopwords.words('english'))

def rm_link(text):
    """Delete links: every run of non-whitespace characters that starts with 'http'."""
    url_pattern = r'http\S+'
    return re.sub(url_pattern, '', text)


# Replace symbol characters with a space so the words around them stay separated,
# e.g. "good/bad" becomes "good bad".
# The characters ! , - . ? are deliberately NOT in the class below and are left as they are.
def rm_punct2(text):
    """Return `text` with each listed symbol character replaced by a single space."""
    symbols = r'[\"\#\$\%\&\'\(\)\*\+\/\:\;\<\=\>\@\[\\\]\^\_\`\{\|\}\~]'
    return re.sub(symbols, ' ', text)


def rm_html(text):
    """Replace every HTML tag in `text` with a single space.

    A tag is anything matching ``<.*?>``: the shortest run from a '<' to the
    next '>' that does not cross a newline. This already covers ``<br />``, so
    no separate pass for line breaks is needed.

    Tags are replaced by a space rather than removed outright so that the words
    on either side of a tag are not glued together
    (e.g. ``great<br /><br />acting`` must not become ``greatacting``).
    Any resulting runs of spaces are left for a later whitespace-collapsing step.
    """
    return re.sub(r'<.*?>', ' ', text)


def space_bt_punct(text):
    """Surround each of . , ! ? - with spaces, then squeeze whitespace runs.

    Any run of two or more whitespace characters left after the insertion is
    collapsed to one space; single whitespace characters are not touched.
    """
    spaced = re.sub(r'([.,!?-])', r' \1 ', text)
    return re.sub(r'\s{2,}', ' ', spaced)


def rm_number(text):
    """Delete every run of digits from `text` (surrounding spaces are kept)."""
    digit_run = r'\d+'
    return re.sub(digit_run, '', text)


def rm_whitespaces(text):
    """Collapse each run of whitespace (spaces, tabs, newlines) into one space."""
    whitespace_run = r'\s+'
    return re.sub(whitespace_run, ' ', text)


def rm_nonascii(text):
    """Drop every character outside the 7-bit ASCII range (code points 0x00-0x7f)."""
    non_ascii = r'[^\x00-\x7f]'
    return re.sub(non_ascii, '', text)


def rm_emoji(text):
    """Remove characters that fall inside the code-point ranges listed below."""
    ranges = (
        '\U0001F600-\U0001F64F',  # emoticons
        '\U0001F300-\U0001F5FF',  # symbols & pictographs
        '\U0001F680-\U0001F6FF',  # transport & map symbols
        '\U0001F1E0-\U0001F1FF',  # flags (iOS)
        '\U00002702-\U000027B0',
        # NOTE(review): this last range runs from U+24C2 up to U+1F251, so it
        # also covers e.g. CJK ideographs, not only emoji. Kept as-is here.
        '\U000024C2-\U0001F251',
    )
    emoji_re = re.compile('[' + ''.join(ranges) + ']+', flags=re.UNICODE)
    return emoji_re.sub('', text)


# SpellChecker instance shared by all calls; created on first use because
# building it for every review would repeat the same set-up work needlessly.
_spell_checker = None


def spell_correction(text, enabled=False):
    """Optionally replace misspelled words in `text` using pyspellchecker.

    Previously the function returned `text` unconditionally, leaving the
    correction code below the `return` unreachable. The early exit is now an
    explicit switch whose default keeps that behaviour.

    Args:
        text: whitespace-separated string to correct.
        enabled: when False (default) `text` is returned unchanged, because
            correcting every review is slow. Pass True to run the correction.

    Returns:
        `text` itself when disabled; otherwise the words of `text` joined by
        single spaces, with each word the checker flags as unknown replaced by
        its suggested correction (or kept when there is no suggestion).
    """
    global _spell_checker

    if not enabled:
        return text

    # https://pypi.org/project/pyspellchecker/
    if _spell_checker is None:
        _spell_checker = SpellChecker()

    words = text.split()
    misspelled_words = _spell_checker.unknown(words)

    corrected_text = []
    for word in words:
        candidate = _spell_checker.correction(word) if word in misspelled_words else None
        # fall back to the original word when it is known or has no suggestion
        corrected_text.append(candidate if candidate is not None else word)
    return ' '.join(corrected_text)

def clean_pipeline(text):
    """Lower-case `text` and pass it through the cleaning steps, in order.

    Each step receives the output of the previous one; the order matters
    (for instance punctuation is spaced out before symbols are replaced).
    """
    steps = (
        rm_link,
        rm_html,
        space_bt_punct,
        rm_punct2,
        rm_number,
        rm_whitespaces,
        rm_nonascii,
        rm_emoji,
        spell_correction,
    )
    cleaned = text.lower()
    for step in steps:
        cleaned = step(cleaned)
    return cleaned

Let's clean the reviews first:

In [None]:
# Run clean_pipeline on every review.
# NOTE: the `review` column is overwritten, so the raw text is not kept in `data`.
data['review'] = data['review'].progress_apply(clean_pipeline)

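We now tokenize the reviews, remove stopwords (e.g. the, a, an) and lemmatize the remaining words (e.g. movies -> movie). Note that the lemmatizer is called without part-of-speech tags, so every token is treated as a noun: verb or adjective forms such as "running" or "better" are left unchanged.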

In [8]:
# preprocessing
def tokenize(text):
    """Split `text` into a list of tokens using NLTK's `word_tokenize`."""
    tokens = word_tokenize(text)
    return tokens


def rm_stopwords(text):
    """Return the tokens of `text` (an iterable of tokens) that are not stop words.

    Membership is tested against the module-level `stopwords` collection.
    """
    kept = []
    for token in text:
        if token in stopwords:
            continue
        kept.append(token)
    return kept


def lemmatize(text):
    """Lemmatize each token in `text` and drop lemmas that are stop words.

    The lemmatizer is called with the token only (no part-of-speech argument).
    """
    wnl = WordNetLemmatizer()
    lemmas = list(map(wnl.lemmatize, text))
    # a lemma can itself be a stop word even if the original token was not
    return rm_stopwords(lemmas)


def preprocess_pipeline(text):
    """Tokenize `text`, remove stop words, lemmatize, and re-join with spaces."""
    return ' '.join(lemmatize(rm_stopwords(tokenize(text))))

In [None]:
# Run preprocess_pipeline on every (already cleaned) review; the column is overwritten again.
data['review'] = data['review'].progress_apply(preprocess_pipeline)

Let's check the result.

In [None]:
# Preview the processed reviews next to their labels.
data.head()

## Embedding

- ANNs cannot process text input
- Input tokens must be mapped to integers using a vocabulary
- In this example, we build a vocabulary manually, but you can also replace this code with an [embedding layer](https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html)

In [11]:
# number of most frequent words kept in the vocabulary; every other word maps to <UNK>
vocab_size = 2000

# get all processed reviews
reviews = data.review.values
# flat list of every token, built review by review so that the whole corpus
# never has to be joined into one giant intermediate string
words = [word for review in reviews for word in review.split()]
# word frequencies
counter = Counter(words)
# keep the `vocab_size` most frequent words, most frequent first
vocab = [word for word, _ in counter.most_common(vocab_size)]
# ids 0 and 1 are reserved for the special tokens, so real words start at 2
int2word = dict(enumerate(vocab, 2))
int2word[0] = '<PAD>'
int2word[1] = '<UNK>'
# inverse mapping (`idx` rather than `id`, which would shadow the builtin)
word2int = {word: idx for idx, word in int2word.items()}

In [None]:
# Encode every review as a list of vocabulary ids; out-of-vocabulary words get the <UNK> id.
reviews_enc = [
    [word2int.get(word, word2int['<UNK>']) for word in review.split()]
    for review in tqdm(reviews, desc='encoding')
]

Because the reviews are processed in batches, they must all have the same length. We pad the shorter reviews with the `<PAD>` token (and truncate the longer ones).
**Because we use RNNs, we need to left pad and not right pad the sequence.**

In [13]:
# left padding sequences
def pad_features(reviews, pad_id, seq_length=128):
    """Build a fixed-width integer matrix from variable-length encoded reviews.

    Args:
        reviews: sequence of token-id sequences.
        pad_id: id written into the unused leading positions of short rows.
        seq_length: number of columns of the result.

    Returns:
        Integer array of shape (len(reviews), seq_length). Rows shorter than
        `seq_length` are right-aligned (padding on the left); longer rows keep
        only their first `seq_length` ids.
    """
    padded = np.full((len(reviews), seq_length), pad_id, dtype=int)

    for idx, tokens in enumerate(reviews):
        kept = np.array(tokens)[:seq_length]  # trims rows that are too long
        padded[idx, seq_length - len(kept):] = kept

    return padded


# Fixed length (in tokens) that every encoded review is padded or trimmed to.
seq_length = 128
features = pad_features(reviews_enc, pad_id=word2int['<PAD>'], seq_length=seq_length)

## Split the data

In [14]:
labels = data.label.to_numpy()

# fixed seed so that a fresh "Restart & Run All" reproduces the same partition;
# without it every run trains and evaluates on different samples
seed = 42

# train test split
train_size = .75  # we will use 75% of whole data as train set
val_size = .5  # the held-out 25% is split 50/50 into validation and test set

# stratify will make sure that train and test set have same distribution of labels
train_x, test_x, train_y, test_y = train_test_split(
    features, labels, test_size=1 - train_size, stratify=labels, random_state=seed
)

# split test set into validation and test set
val_x, test_x, val_y, test_y = train_test_split(
    test_x, test_y, test_size=val_size, stratify=test_y, random_state=seed
)

Define the datasets and dataloaders.

In [None]:
# number of reviews per batch
batch_size = 128

# wrap each (features, labels) pair of numpy arrays in a TensorDataset
train_dataset, valid_dataset, test_dataset = (
    TensorDataset(torch.from_numpy(x), torch.from_numpy(y))
    for x, y in ((train_x, train_y), (val_x, val_y), (test_x, test_y))
)

# batch iterators; the train and validation loaders reshuffle, the test loader keeps a fixed order
# NOTE(review): shuffling is not needed to evaluate on the validation set - kept to preserve behaviour
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

Define the model.

In [16]:
# TODO

Instantiate the model.

In [17]:
# TODO

In [None]:
# TODO

Define the loss function and optimizer.

In [19]:
# TODO

Define the training loop.

In [None]:
# TODO

Evaluate the model on the test set.

In [None]:
# TODO