### Data

The sentiment analysis model within this notebook uses publicly available datasets:

1. Large Movie Review Dataset: https://ai.stanford.edu/~amaas/data/sentiment/
2. Sentiment Polarity Dataset (v1 and v2): https://www.cs.cornell.edu/people/pabo/movie-review-data/
3. Rotten Tomatoes web scraped critic reviews: https://www.kaggle.com/datasets/stefanoleone992/rotten-tomatoes-movies-and-critic-reviews-dataset/

All data lives in the "data" directory, which is not committed to this repo.
The code below assumes each dataset keeps its original directory structure under the root "data" folder.

In [None]:
import os
from dataset_utils import *

In [None]:
# Large Movie Review Dataset (IMDB): the main training corpus, read from its
# "train" and "test" sub-folders.
train_path = os.path.join(IMDB_DATA_PATH, "train")
test_path = os.path.join(IMDB_DATA_PATH, "test")
(imdb_train_texts, imdb_train_labels), (imdb_test_texts, imdb_test_labels) = load_imdb_dataset(
    train_path=train_path,
    test_path=test_path,
)

# Review Polarity datasets (v1 and v2): used as additional test data.
(v1_texts, v1_labels), (v2_texts, v2_labels) = load_polarity(
    v1_path=POLARITY_v1_DATA_PATH,
    v2_path=POLARITY_v2_DATA_PATH,
)

# Rotten Tomatoes critic reviews, returned as a train split and a test split.
(rotten_train_texts, rotten_train_labels), (rotten_test_texts, rotten_test_labels) = load_rotten(ROTTEN_PATH)

In [None]:
import matplotlib.pyplot as plt

def plot_len_dist(dataset_name, texts):
    """Show a 50-bin histogram of sample lengths for one dataset.

    Args:
        dataset_name: Label appended to the plot title.
        texts: Iterable of samples; ``len()`` of each sample is what gets
            plotted (the character count when the samples are strings).

    Returns:
        None. The figure is rendered via ``plt.show()``.
    """
    sample_lengths = list(map(len, texts))

    plt.hist(sample_lengths, bins=50)
    plt.title(f"Length of sample over number of samples: {dataset_name}")
    plt.xlabel("Length of a sample")
    plt.ylabel("Number of samples")
    plt.show()

# One length histogram per corpus/split, in the order the datasets were loaded.
for title, texts in [
    ('IMDB_train', imdb_train_texts),
    ('IMDB_test', imdb_test_texts),
    ('polarity_v1', v1_texts),
    ('polarity_v2', v2_texts),
    ('rotten_train', rotten_train_texts),
    ('rotten_test', rotten_test_texts),
]:
    plot_len_dist(title, texts)

In [None]:
# Print the summary produced by display_dataset_info for every corpus/split.
for texts, labels, name in [
    (imdb_train_texts, imdb_train_labels, 'imdb_train'),
    (imdb_test_texts, imdb_test_labels, 'imdb_test'),
    (v1_texts, v1_labels, 'polarity_v1'),
    (v2_texts, v2_labels, 'polarity_v2'),
    (rotten_train_texts, rotten_train_labels, 'rotten tomatoes train'),
    (rotten_test_texts, rotten_test_labels, 'rotten tomatoes test'),
]:
    display_dataset_info(texts, labels, name)

In [None]:
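# NOTE: disabled experiment - merges the Rotten Tomatoes splits into the IMDB train/test sets.
# Re-enabling it also needs `random` in scope (it is not imported explicitly in the visible cells).
# from itertools import chain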

# train_texts_rot = list(chain(imdb_train_texts, rotten_train_texts))
# train_labels_rot = list(chain(imdb_train_labels, rotten_train_labels))

# test_texts_rot = list(chain(imdb_test_texts, rotten_test_texts))
# test_labels_rot = list(chain(imdb_test_labels, rotten_test_labels))


# random.seed(1)
# random.shuffle(train_texts_rot)
# random.shuffle(train_labels_rot)

# random.seed(1)
# random.shuffle(test_texts_rot)
# random.shuffle(test_labels_rot)

In [None]:
import torch

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif


# Number of features kept by the selection step.
MAX_FEATURES = 5000


def _to_float_tensor(sparse_matrix):
    """Convert a SciPy sparse matrix into a dense float32 torch tensor.

    The cast to float32 is done while the data is still sparse, so the dense
    float64 intermediate is never allocated. The resulting values are the same
    as densifying first and casting afterwards.
    """
    return torch.from_numpy(sparse_matrix.astype("float32").toarray())


# TF-IDF over unigrams and bigrams (min_df=8), followed by SelectKBest with its
# default score function, keeping MAX_FEATURES features.
transformer_pipeline = Pipeline([
    ('vect', TfidfVectorizer(min_df=8, ngram_range=(1, 2))),
    ('select', SelectKBest(k=MAX_FEATURES)),
])

# The pipeline is fitted on the IMDB training split only; every other split is
# transformed with that fitted vocabulary and feature selection.
transformed_train = transformer_pipeline.fit_transform(imdb_train_texts, imdb_train_labels)
transformed_test = transformer_pipeline.transform(imdb_test_texts)

X_train = _to_float_tensor(transformed_train)
X_test = _to_float_tensor(transformed_test)

# Polarity corpora are evaluation-only inputs.
X_test_v1 = _to_float_tensor(transformer_pipeline.transform(v1_texts))
X_test_v2 = _to_float_tensor(transformer_pipeline.transform(v2_texts))

In [None]:
# IMDB targets: float32 with an extra trailing axis (a column vector for a flat label list).
y_train = torch.tensor(imdb_train_labels, dtype=torch.float32)[:, None]
y_test = torch.tensor(imdb_test_labels, dtype=torch.float32)[:, None]

# Polarity targets: float32, kept without the extra axis.
y_test_v1 = torch.tensor(v1_labels, dtype=torch.float32)
y_test_v2 = torch.tensor(v2_labels, dtype=torch.float32)

In [None]:
# Display the fitted pipeline's steps by name ('vect' and 'select').
transformer_pipeline.named_steps

In [None]:
# Direct handles to the two fitted pipeline steps (not used again in the visible cells).
vectorizer = transformer_pipeline.named_steps['vect']
selector   = transformer_pipeline.named_steps['select']

In [None]:
from torch import nn

# Number of columns in X_train; SentimentCLF.__init__ reads this module-level name.
input_dim = X_train.shape[1]

class SentimentCLF(nn.Module):
    """Feed-forward sentiment classifier that emits a single raw logit per sample.

    Architecture: dropout on the input features, then ``Linear -> ReLU``,
    followed by ``n_hidden`` further ``Linear -> ReLU`` pairs of width
    ``n_units``, and a final ``Linear`` with one output unit. No sigmoid is
    applied here.

    Args:
        n_hidden: Number of additional hidden ``Linear -> ReLU`` pairs after
            the input layer.
        n_units: Width of every hidden layer.
        dropout_p: Dropout probability applied to the input features.
        n_inputs: Size of the input feature vector. When omitted, the
            notebook-level ``input_dim`` global is used, which keeps existing
            callers working unchanged.
    """

    def __init__(self, n_hidden, n_units, dropout_p, n_inputs=None):
        super().__init__()

        # Fall back to the notebook global only when no explicit size is given,
        # so the class can also be built without that hidden state.
        if n_inputs is None:
            n_inputs = input_dim

        # Collect the layers first and build the Sequential once; layer order
        # (and therefore the indices inside linear_stack) is unchanged.
        layers = [nn.Linear(n_inputs, n_units), nn.ReLU()]
        for _ in range(n_hidden):
            layers += [nn.Linear(n_units, n_units), nn.ReLU()]
        layers.append(nn.Linear(n_units, 1))

        self.linear_stack = nn.Sequential(*layers)
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, x):
        """Apply input dropout, then the linear stack; returns one logit per input row."""
        return self.linear_stack(self.dropout(x))

In [None]:
from skorch import NeuralNetClassifier

# skorch wrapper around SentimentCLF, trained with Adam on BCEWithLogitsLoss.
# The module__* arguments are forwarded to SentimentCLF's constructor.
net = NeuralNetClassifier(
    module=SentimentCLF,
    module__n_units=500,
    module__dropout_p=0.6,
    module__n_hidden=2,
    lr=0.0001,
    criterion=nn.BCEWithLogitsLoss,
    # Use the GPU when one is available, otherwise fall back to CPU so the
    # notebook still runs on machines without CUDA.
    device='cuda' if torch.cuda.is_available() else 'cpu',
    max_epochs=15,
    optimizer=torch.optim.Adam,
    batch_size=64,
)

In [None]:
import pickle
# Load a previously pickled model from disk; `model` is the object scored in the
# final evaluation cell.
# NOTE(review): pickle.load can execute arbitrary code - only load files you trust.
# NOTE(review): this reads 'net1.pkl', while the training cells further down save to
# 'net2.pkl' - confirm which model is meant to be evaluated.
with open('net1.pkl', 'rb') as f:
    model = pickle.load(f)


In [None]:
# Display the weights of the first entry in the loaded model's linear_stack
# (an nn.Linear, assuming the pickled module is a SentimentCLF).
model.module_.linear_stack[0].weight

In [None]:
from sklearn.model_selection import GridSearchCV

# Turn off skorch's own validation split and its per-epoch logging; the
# cross-validation folds come from GridSearchCV instead.
net.set_params(train_split=False, verbose=0)

# Only layer width and depth are varied; dropout and learning rate stay fixed.
param_grid = dict(
    module__dropout_p=[0.6],
    module__n_units=[100, 500, 1000],
    module__n_hidden=[1, 2, 3],
    lr=[0.0001],
)

# refit=False: the search only scores the candidates, it does not retrain a final model.
gs = GridSearchCV(
    estimator=net,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    refit=False,
    verbose=3,
)
gs.fit(X_train, y_train)
print(gs.best_score_, gs.best_params_)

In [None]:
# The grid search ran with refit=False, so apply its best parameters to `net`
# and train it on the full IMDB training set here.
net.set_params(**gs.best_params_)
net.fit(X_train, y_train)

In [None]:
import pickle
# Persist the trained `net` to 'net2.pkl'.
# NOTE(review): the earlier note here read "min_df = 5, ngram_range = (1,2)", but the
# TfidfVectorizer in this notebook is built with min_df=8 - confirm which setting
# produced this file.
with open('net2.pkl', 'wb') as f:
    pickle.dump(net, f)

In [None]:
from sklearn.metrics import accuracy_score

# Score the loaded `model` on the IMDB test split and on both polarity corpora,
# labelling each result so the three numbers can be told apart.
for name, X, y in [
    ('imdb_test', X_test, y_test),
    ('polarity_v1', X_test_v1, y_test_v1),
    ('polarity_v2', X_test_v2, y_test_v2),
]:
    pred = model.predict(X)
    # accuracy_score's signature is (y_true, y_pred): ground truth goes first.
    print(name, accuracy_score(y, pred))

## Next steps

1. Mix more data into the training set, possibly gathered by web scraping.
2. Grid search over more vectorizer options and inspect how they affect performance.
3. Package with argparse or sm