### Data

The sentiment analysis model within this notebook uses publicly available datasets:

1. Large Movie Review Dataset: https://ai.stanford.edu/~amaas/data/sentiment/
2. Sentiment Polarity Dataset (v1 and v2): https://www.cs.cornell.edu/people/pabo/movie-review-data/
3. Rotten Tomatoes web-scraped critic reviews: https://www.kaggle.com/datasets/stefanoleone992/rotten-tomatoes-movies-and-critic-reviews-dataset/

All data is expected in a local "data" directory (not committed to this repo).
The code below assumes that each dataset keeps its original directory structure under that root "data" folder.

In [8]:
import os
from dataset_utils import *

# Loader functions and the *_PATH constants come from dataset_utils.py;
# see that module for the implementation details of dataset loading.
train_path, test_path = (
    os.path.join(IMDB_DATA_PATH, split_dir) for split_dir in ("train", "test")
)

# Main training set - Large Movie Review Dataset (IMDB)
imdb_train_split, imdb_test_split = load_imdb_dataset(train_path=train_path, test_path=test_path)
imdb_train_texts, imdb_train_labels = imdb_train_split
imdb_test_texts, imdb_test_labels = imdb_test_split

# Review Polarity Datasets (v1 and v2 are loaded together, then unpacked separately)
polarity_v1, polarity_v2 = load_polarity(v1_path=POLARITY_v1_DATA_PATH, v2_path=POLARITY_v2_DATA_PATH)
v1_texts, v1_labels = polarity_v1
v2_texts, v2_labels = polarity_v2

# Rotten Tomatoes critic dataset: a training portion plus two held-out test sets
(
    rotten_train_texts,
    rotten_train_labels,
    short_rotten_test,
    random_rotten_test,
) = load_rotten_split(ROTTEN_PATH)

In [9]:
import matplotlib.pyplot as plt
import numpy as np

def plot_len_dist(dataset_name, texts):
    """Plot a histogram of sample lengths with the median marked.

    The length of each sample is ``len(text)`` (number of characters when the
    samples are strings). The histogram uses 50 bins, and a dashed red
    vertical line plus a text annotation mark the median length.

    Args:
        dataset_name: Name of the dataset, shown in the figure title.
        texts: Iterable of samples that support ``len()``.
    """
    data = [len(text) for text in texts]
    median = np.median(data)
    plt.hist(data, 50)
    # The line is drawn at the median, so it is labelled 'Median'
    # (it was previously mislabelled 'Mean', which would mislead any legend).
    plt.axvline(x=median, color='red', linestyle='dashed', linewidth=2, label='Median')
    plt.text(median + 0.5, 50, f'Median: {median:.2f}', color='red', fontsize=10)
    plt.title(f"Length of sample over number of samples: {dataset_name}")
    plt.xlabel("Length of a sample")
    plt.ylabel("Number of samples")
    plt.show()
    
def plot_wordcount_dist(dataset_name, texts):
    """Plot a histogram of per-sample word counts with the median marked.

    Words are counted as ``len(text.split(" "))``, i.e. the number of
    single-space-separated tokens. The histogram uses 50 bins, and a dashed
    red vertical line plus a text annotation mark the median word count.

    Args:
        dataset_name: Name of the dataset, shown in the figure title.
        texts: Iterable of strings.
    """
    data = [len(text.split(" ")) for text in texts]
    median = np.median(data)
    plt.hist(data, 50)
    # The line is drawn at the median, so it is labelled 'Median'
    # (it was previously mislabelled 'Mean', which would mislead any legend).
    plt.axvline(x=median, color='red', linestyle='dashed', linewidth=2, label='Median')
    plt.text(median + 0.5, 50, f'Median: {median:.2f}', color='red', fontsize=10)
    plt.title(f"Number of words: {dataset_name}")
    plt.xlabel("Number of words in a sample")
    plt.ylabel("Number of samples")
    plt.show()

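# NOTE: disabled on purpose. `rotten_test_texts` is not defined in the cells above
# (load_rotten_split yields `short_rotten_test` and `random_rotten_test`), so fix that entry before re-enabling.
# for title, texts in [('IMDB_train', imdb_train_texts), ('IMDB_test', imdb_test_texts), ('polarity_v1', v1_texts), ('polarity_v2',v2_texts),
#                      ('rotten_train', rotten_train_texts), ('rotten_test', rotten_test_texts)]:
#     plot_len_dist(title, texts)
#     plot_wordcount_dist(title, texts)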
    

In [10]:
# (texts, labels, display name) for every labelled dataset to be summarised.
datasets_to_summarise = [
    (imdb_train_texts, imdb_train_labels, 'imdb_train'),
    (imdb_test_texts, imdb_test_labels, 'imdb_test'),
    (v1_texts, v1_labels, 'polarity_v1'),
    (v2_texts, v2_labels, 'polarity_v2'),
    (rotten_train_texts, rotten_train_labels, 'rotten tomatoes train'),
]

# display_dataset_info comes from dataset_utils; it is called once per dataset.
for texts, labels, name in datasets_to_summarise:
    display_dataset_info(texts, labels, name)

Dataset: imdb_train.
Total number of samples: 25000
Positive reviews total: 12500
Negative reviews total: 12500
Dataset: imdb_test.
Total number of samples: 25000
Positive reviews total: 12500
Negative reviews total: 12500
Dataset: polarity_v1.
Total number of samples: 10662
Positive reviews total: 5331
Negative reviews total: 5331
Dataset: polarity_v2.
Total number of samples: 2000
Positive reviews total: 1000
Negative reviews total: 1000
Dataset: rotten tomatoes train.
Total number of samples: 50000
Positive reviews total: 25000
Negative reviews total: 25000


In [11]:
from sklearn.model_selection import train_test_split

# Hold out 33% of polarity v1 for testing; the fixed seed keeps the split reproducible.
v1_split = train_test_split(v1_texts, v1_labels, test_size=0.33, random_state=42)
train_texts_v1, test_texts_v1, train_labels_v1, test_labels_v1 = v1_split

# Combined training corpus: IMDB train + Rotten Tomatoes train + polarity v1 train portion.
# Texts and labels are concatenated in the same order so they stay aligned.
train_texts = imdb_train_texts + rotten_train_texts + train_texts_v1
train_labels = imdb_train_labels + rotten_train_labels + train_labels_v1

In [12]:
def _pairs_by_word_count(texts, labels, keep):
    """Return ``(label, text)`` pairs whose space-separated word count satisfies ``keep``."""
    pairs = []
    for position, sample in enumerate(texts):
        if keep(len(sample.split(" "))):
            pairs.append((labels[position], sample))
    return pairs


# "Short" samples: fewer than 25 space-separated words.
short_test_v1 = _pairs_by_word_count(test_texts_v1, test_labels_v1, lambda n_words: n_words < 25)
short_test = short_test_v1 + short_rotten_test

# "Long" samples: more than 300 space-separated words.
long_test_v2 = _pairs_by_word_count(v2_texts, v2_labels, lambda n_words: n_words > 300)
long_test_imdb = _pairs_by_word_count(imdb_test_texts, imdb_test_labels, lambda n_words: n_words > 300)

long_test = long_test_v2 + long_test_imdb

In [13]:
import torch

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif


# Number of TF-IDF features kept after univariate feature selection.
MAX_FEATURES = 5000

# Text -> TF-IDF over unigrams and bigrams (terms seen in fewer than 8 documents
# are dropped) -> keep the MAX_FEATURES highest-scoring features.
# f_classif is SelectKBest's default score function; it is passed explicitly for clarity.
transformer_pipeline = Pipeline([
    ('vect', TfidfVectorizer(min_df=8, ngram_range=(1,2))),
    ('select', SelectKBest(score_func=f_classif, k=MAX_FEATURES))
])

transformed_train = transformer_pipeline.fit_transform(train_texts, train_labels)

# Cast to float32 while the matrix is still sparse, then densify. The values are
# the same as densifying first and casting afterwards, but the full-size float64
# dense array is never allocated.
X_train = torch.from_numpy(transformed_train.astype("float32").toarray())
# Labels as a float column vector, one row per sample.
y_train = torch.tensor(train_labels, dtype=torch.float32).unsqueeze(1)

# Fitted steps, exposed under their own names.
vectorizer = transformer_pipeline.named_steps['vect']
selector   = transformer_pipeline.named_steps['select']

In [None]:
from torch import nn

# Size of the second dimension of X_train; SentimentCLF reads this notebook-level
# variable to size its first Linear layer.
input_dim = X_train.shape[1]

class SentimentCLF(nn.Module):
    """Feed-forward binary classifier that outputs one raw logit per sample.

    Dropout is applied to the input features, which then pass through an
    input layer, ``n_hidden`` hidden layers (each ``n_units`` wide and
    followed by ReLU) and a final single-unit linear layer. No sigmoid is
    applied to the output.

    Args:
        n_hidden: Number of extra ``Linear`` + ``ReLU`` pairs after the input layer.
        n_units: Output width of the input layer and of every hidden layer.
        dropout_p: Dropout probability applied to the input features.
        in_features: Size of the input feature vector. When ``None`` (the
            default) the notebook-level ``input_dim`` variable is used, which
            keeps existing three-argument calls working unchanged.
    """

    def __init__(self, n_hidden, n_units, dropout_p, in_features=None):
        super().__init__()

        # Fall back to the notebook global only when no explicit size is given,
        # so the module can also be built without that hidden state.
        if in_features is None:
            in_features = input_dim

        # Layers are created in the same order as before (input, hidden..., output),
        # so parameter names inside `linear_stack` are unchanged.
        layers = [nn.Linear(in_features, n_units), nn.ReLU()]
        for _ in range(n_hidden):
            layers += [nn.Linear(n_units, n_units), nn.ReLU()]
        layers.append(nn.Linear(n_units, 1))

        self.linear_stack = nn.Sequential(*layers)
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, x):
        """Apply input dropout, then the linear stack; return the raw logits."""
        return self.linear_stack(self.dropout(x))

In [None]:
from skorch import NeuralNetClassifier

# skorch wrapper around SentimentCLF, trained with Adam on BCEWithLogitsLoss.
# `module__*` arguments are forwarded to SentimentCLF's constructor.
net = NeuralNetClassifier(
    module=SentimentCLF,
    module__n_units=500,
    module__dropout_p=0.6,
    module__n_hidden=2,
    lr = 0.0001,
    criterion=nn.BCEWithLogitsLoss,
    # Use the GPU when one is available, otherwise fall back to CPU so the
    # notebook still runs on machines without CUDA.
    device='cuda' if torch.cuda.is_available() else 'cpu',
    max_epochs=15,
    optimizer=torch.optim.Adam,
    batch_size=64,
    iterator_train__shuffle=True
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Turn off skorch's internal validation split and its logging on `net`.
# NOTE(review): set_params changes `net` in place, so these two settings also
# apply to any later net.fit call - confirm that this is intended.
net.set_params(verbose=0, train_split=False)

# Candidate hyperparameters; only layer width and depth actually vary.
param_grid = dict(
    module__dropout_p=[0.6],
    module__n_units=[100, 500, 1000],
    module__n_hidden=[1, 2, 3],
    lr=[0.0001],
)

# 3-fold cross-validated search scored by accuracy; refit=False means no final
# model is trained on the full data by the search itself.
gs = GridSearchCV(
    estimator=net,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    refit=False,
    verbose=3,
)
#gs.fit(X_train, y_train)
#print(gs.best_score_, gs.best_params_)

In [None]:
#net.set_params(**gs.best_params_)

In [None]:
# Train the classifier on the full training tensors with whatever parameters are
# currently set on `net`.
net.fit(X_train, y_train)