In [None]:
import os, re, random

"""
The sentiment analysis model within this notebook uses publicly available datasets:

1. Large Movie Review Dataset: https://ai.stanford.edu/~amaas/data/sentiment/
2. Sentiment Polarity Dataset (v1 and v2): https://www.cs.cornell.edu/people/pabo/movie-review-data/

All data is contained in the "data" directory (not committed in this repo).
The code below assumes the same directory structure as the original datasets under the root "data" folder.
"""

# Locations of the three corpora, relative to the notebook's working directory.
# IMDB root; its "train" and "test" sub-folders are joined onto it further down.
IMDB_DATA_PATH        = "data/aclImdb/"
# Sentiment Polarity v1; load_polarity() reads every file found in this folder.
POLARITY_v1_DATA_PATH = "data/rt-polaritydata/rt-polaritydata/"
# Sentiment Polarity v2; read through load_inidv_dataset(), i.e. via "pos"/"neg" sub-folders.
POLARITY_v2_DATA_PATH = "data/review_polarity/txt_sentoken"

def load_inidv_dataset(set_path: str) -> tuple:
    """Read every review stored under ``set_path/pos`` and ``set_path/neg``.

    Args:
        set_path: Directory containing a ``pos`` and a ``neg`` sub-directory,
            each holding one text file per review.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists. All positive
        reviews (label 1) come first, followed by all negative ones (label 0);
        within a class the files are ordered by file name.

    Raises:
        FileNotFoundError: If ``set_path/pos`` or ``set_path/neg`` does not exist.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    texts = []
    labels = []

    for label_name, label_id in (('pos', 1), ('neg', 0)):
        cat_path = os.path.join(set_path, label_name)
        # os.listdir() yields entries in arbitrary, file-system dependent order.
        # Sorting makes the result (and every seeded shuffle applied to it
        # later) identical across machines.
        for file_name in sorted(os.listdir(cat_path)):
            file_path = os.path.join(cat_path, file_name)
            # Explicit encoding: do not depend on the platform's locale default.
            with open(file_path, 'r', encoding='utf-8') as file:
                texts.append(file.read())
            labels.append(label_id)

    return (texts, labels)


def load_dataset(train_path: str, test_path: str, seed=1) -> tuple:
    """Load, shuffle and clean the train and test splits of a review corpus.

    Args:
        train_path: Directory of the training split (with ``pos``/``neg`` sub-folders).
        test_path: Directory of the test split (same layout).
        seed: Seed of the shuffle; the same seed always yields the same order
            for the same input.

    Returns:
        ``((train_texts, train_labels), (test_texts, test_labels))`` where every
        element is a list, texts and labels are aligned index by index, and
        HTML tags have been stripped from the texts.
    """
    # A private generator keeps the shuffle reproducible without re-seeding
    # (and thereby clobbering) the module-level ``random`` state.
    rng = random.Random(seed)

    splits = []
    for split_path in (train_path, test_path):
        texts, labels = load_inidv_dataset(set_path=split_path)

        # Shuffle (text, label) pairs as one unit so alignment holds by
        # construction instead of relying on two identically seeded shuffles.
        pairs = list(zip(texts, labels))
        rng.shuffle(pairs)

        # Remove html tags from the texts. They are replaced by a space rather
        # than by nothing so that words separated only by a tag
        # ("movie<br /><br />I") are not fused into a single token.
        cleaned_texts = [re.sub('<.*?>', ' ', text) for text, _ in pairs]
        shuffled_labels = [label for _, label in pairs]
        splits.append((cleaned_texts, shuffled_labels))

    return tuple(splits)


def load_polarity(v1_path, v2_path):
    """Load both Sentiment Polarity corpora.

    Args:
        v1_path: Directory whose files each hold one snippet per line. A file is
            labelled negative (0) when its name contains ``'neg'`` and positive
            (1) otherwise.
        v2_path: Directory with ``pos`` and ``neg`` sub-folders holding one
            review per file.

    Returns:
        ``((v1_texts, v1_labels), (v2_texts, v2_labels))``; every element is a
        list and texts/labels are aligned index by index.
    """
    v1_texts = []
    v1_labels = []

    # Sorted for a deterministic snippet order regardless of the file system.
    for file_name in sorted(os.listdir(v1_path)):
        file_path = os.path.join(v1_path, file_name)
        label = (0 if 'neg' in file_name else 1)

        # Undecodable bytes are dropped instead of aborting the load.
        with open(file_path, 'r', errors='ignore') as file:
            for snippet in file.read().splitlines():
                v1_texts.append(snippet)
                v1_labels.append(label)

    # Honour the caller-supplied location; the module-level constant was being
    # used here, which silently ignored the ``v2_path`` argument.
    (v2_texts, v2_labels) = load_inidv_dataset(set_path=v2_path)

    return ((v1_texts, v1_labels), (v2_texts, v2_labels))

def get_smaller_dataset(size: int, texts: list[str], labels: list[int], seed=10) -> tuple:
    """Draw a reproducible random subset of aligned texts and labels.

    Args:
        size: Number of examples to draw (without replacement).
        texts: Source texts.
        labels: Source labels, aligned with ``texts`` index by index.
        seed: Seed of the sampler; the same seed selects the same examples.

    Returns:
        ``(smaller_texts, smaller_labels)``: two lists of length ``size`` that
        are still aligned index by index.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length, or if ``size``
            is negative or larger than the number of examples.
    """
    if len(texts) != len(labels):
        raise ValueError(
            f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
        )

    # Sample the indices once and apply them to both lists: the pairs stay
    # aligned by construction, and a private generator leaves the module-level
    # ``random`` state untouched.
    rng = random.Random(seed)
    chosen = rng.sample(range(len(texts)), size)

    smaller_texts = [texts[i] for i in chosen]
    smaller_labels = [labels[i] for i in chosen]

    return (smaller_texts, smaller_labels)


# Folders of the IMDB training and test splits.
train_path, test_path = (
    os.path.join(IMDB_DATA_PATH, split_name) for split_name in ("train", "test")
)

In [None]:
# Large Movie Review Dataset (IMDB): the main corpus, providing the training
# split and the primary test split.
(train_texts, train_labels), (test_texts, test_labels) = load_dataset(
    train_path=train_path,
    test_path=test_path,
)

# Review Polarity datasets (v1 and v2): used as additional test data.
((v1_texts, v1_labels), (v2_texts, v2_labels)) = load_polarity(
    v1_path=POLARITY_v1_DATA_PATH,
    v2_path=POLARITY_v2_DATA_PATH,
)

In [None]:
import numpy as np

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_extraction.text import TfidfVectorizer

# Upper bound on the number of n-gram features kept by SelectKBest in get_ngram_dataset().
MAX_FEATURES = 5000

# Re-seed the global RNG without a fixed value, then draw a seed in [1, 1000];
# the drawn value therefore differs from run to run.
# NOTE(review): in the visible cells `seed` is only referenced by the
# commented-out subsampling call below - confirm whether it is still needed.
random.seed()
seed = random.randint(1, 1000)

#(sm_texts, sm_labels) = get_smaller_dataset(12000, train_texts, train_labels, seed=seed)

#encoding largest dataset, training vectorizer and selector --> to be used for encoding other datasets
def get_ngram_dataset(train_texts, train_labels, test_texts, test_labels, min_df=5):
    """Encode train/test texts as dense, feature-selected TF-IDF n-gram matrices.

    The vectorizer (uni- and bi-grams) and the SelectKBest selector (f_classif
    scores, at most MAX_FEATURES features) are fitted on the training data only
    and then applied unchanged to the test data.

    Args:
        train_texts: Training documents.
        train_labels: Labels aligned with ``train_texts``.
        test_texts: Test documents.
        test_labels: Labels aligned with ``test_texts``.
        min_df: Forwarded to ``TfidfVectorizer(min_df=...)``.

    Returns:
        ``((X_train, y_train), (X_test, y_test), (vectorizer, selector))`` with
        dense feature arrays, label arrays and the two fitted transformers.
    """
    y_train, y_test = np.array(train_labels), np.array(test_labels)

    # Fit the vocabulary / idf weights on the training texts.
    vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=min_df)
    train_tfidf = vectorizer.fit_transform(train_texts)

    # Never ask for more features than the vocabulary actually contains.
    n_keep = min(MAX_FEATURES, train_tfidf.shape[1])
    selector = SelectKBest(score_func=f_classif, k=n_keep)
    X_train = selector.fit_transform(train_tfidf, y_train).toarray()

    # The test split only goes through the already fitted transformers.
    X_test = selector.transform(vectorizer.transform(test_texts)).toarray()

    fitted_transformers = (vectorizer, selector)
    return ((X_train, y_train), (X_test, y_test), fitted_transformers)

# Encode the IMDB splits and keep the fitted vectorizer and selector so that
# other datasets can be encoded into the same feature space.
(X_train, y_train), (X_test, y_test), (vectorizer, selector) = get_ngram_dataset(
    train_texts=train_texts,
    train_labels=train_labels,
    test_texts=test_texts,
    test_labels=test_labels,
)

In [None]:
def vectorize(texts, labels, vectorizer, selector):
    """Encode texts with an already fitted vectorizer/selector pair.

    Args:
        texts: Documents to encode.
        labels: Labels aligned with ``texts``.
        vectorizer: Fitted object exposing ``transform(texts)``.
        selector: Fitted object exposing ``transform(matrix)``; its result must
            offer ``toarray()``.

    Returns:
        ``(X, y)``: the dense selected feature matrix and the labels as a
        NumPy array.
    """
    encoded = vectorizer.transform(texts)
    selected = selector.transform(encoded)

    return (selected.toarray(), np.array(labels))

In [None]:
# Encode both polarity corpora with the vectorizer and selector fitted on IMDB.
(X_v1_test, y_v1_test), (X_v2_test, y_v2_test) = (
    vectorize(corpus_texts, corpus_labels, vectorizer=vectorizer, selector=selector)
    for corpus_texts, corpus_labels in ((v1_texts, v1_labels), (v2_texts, v2_labels))
)

In [None]:
#placeholder benchmark naive bayes model
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

#X_sm_train, X_sm_test, y_sm_train, y_sm_test = train_test_split(X_sm, y_sm, test_size=0.33, random_state=42)

# Baseline: Complement Naive Bayes trained on the encoded IMDB training split.
clf = ComplementNB().fit(X_train, y_train)

# One accuracy per line: IMDB test first, then polarity v1, then polarity v2.
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

for X_extra, y_extra in ((X_v1_test, y_v1_test), (X_v2_test, y_v2_test)):
    print(accuracy_score(clf.predict(X_extra), y_extra))

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

#For now just wrapping already processed vectors
class NgramMovieDataset(Dataset):
    """Map-style dataset over pre-computed feature vectors and their labels.

    Both NumPy arrays are converted to ``float32`` tensors and moved to
    ``device`` once, at construction time; indexing afterwards only slices
    those tensors.

    Args:
        X: NumPy array of features, one row per example.
        y: NumPy array of labels aligned with the rows of ``X``.
        device: Target device for both tensors.
    """

    def __init__(self, X, y, device):
        features, targets = (
            torch.from_numpy(array).to(device, dtype=torch.float32) for array in (X, y)
        )
        self.x = features
        self.y = targets

    def __len__(self):
        # Number of examples = size of the first axis of the feature tensor.
        return self.x.size(0)

    def __getitem__(self, idx):
        sample = (self.x[idx], self.y[idx])
        return sample


# Use the GPU when torch reports one as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

def get_dataloader(X, y, device, batch_size=64, shuffle=True):
    """Wrap feature/label arrays in an NgramMovieDataset and batch it.

    Args:
        X: NumPy feature array, one row per example.
        y: NumPy label array aligned with ``X``.
        device: Device the dataset tensors are moved to.
        batch_size: Number of examples per batch.
        shuffle: Forwarded to ``DataLoader(shuffle=...)``.

    Returns:
        A ``DataLoader`` over the wrapped dataset.
    """
    return DataLoader(
        dataset=NgramMovieDataset(X=X, y=y, device=device),
        batch_size=batch_size,
        shuffle=shuffle,
    )

# train_dataloader = get_dataloader(X=X_train, y=y_train, device=device)
# test_dataloader  = get_dataloader(X=X_test, y=y_test, device=device)

# v1_test_dataloader = get_dataloader(X=X_v1_test, y=y_v1_test, device=device)
# v2_test_dataloader = get_dataloader(X=X_v2_test, y=y_v2_test, device=device)

In [None]:
from torch import nn

# Number of columns of the encoded training matrix, i.e. the width of the
# network's input layer. SentimentCLF below reads this module-level name, so
# this line has to run before the model is instantiated.
input_dim = X_train.shape[1]

class SentimentCLF(nn.Module):
    """Feed-forward binary classifier: dropout -> 3 linear layers -> sigmoid.

    Args:
        n_units: Width of the two hidden layers.
        in_features: Width of the input layer. When omitted, the module-level
            ``input_dim`` is used (the previous, implicit behaviour), so
            existing ``SentimentCLF(n_units)`` calls are unaffected.

    The forward pass returns one value in [0, 1] per example, with shape
    ``(batch, 1)``. Because the sigmoid is applied inside the module, pair it
    with ``nn.BCELoss`` (not ``BCEWithLogitsLoss``).
    """

    def __init__(self, n_units, in_features=None):
        super().__init__()

        # Fall back to the notebook-level feature count only when the caller
        # does not state the input width explicitly.
        if in_features is None:
            in_features = input_dim

        self.linear_stack = nn.Sequential(
            nn.Linear(in_features, n_units),
            nn.ReLU(),
            nn.Linear(n_units, n_units),
            nn.ReLU(),
            nn.Linear(n_units, 1)
        )

        self.dropout = nn.Dropout(p=0.2)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # Dropout is applied to the input features, not to hidden activations.
        dropped = self.dropout(x)
        logits = self.linear_stack(dropped)

        return self.sigmoid(logits)
    

def train(dataloader: DataLoader, model: nn.Module, loss_fn, optimizer):
    """Run one training epoch over ``dataloader``.

    Args:
        dataloader: Yields ``(X, y)`` batches.
        model: Network to optimise; called as ``model(X)``.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: Optimizer bound to ``model``'s parameters.

    Prints the loss of every 100th batch. Returns nothing.
    """
    size = len(dataloader.dataset)

    # Training mode only has to be enabled once per epoch.
    model.train()

    for batch, (X, y) in enumerate(dataloader):
        # Match the target's shape explicitly. A bare ``squeeze()`` also drops
        # the batch axis when a batch holds a single example, and the loss then
        # fails on the resulting shape mismatch.
        pred = model(X).reshape(y.shape)
        loss = loss_fn(pred, y)

        # Clear gradients before the backward pass so that leftovers from
        # earlier code can never leak into this step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss_value, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss_value:>7f}  [{current:>5d}/{size:>5d}]")
                   
                     
def test(dataloader: DataLoader, model: nn.Module, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X).squeeze()
            test_loss += loss_fn(pred, y).item()
            
            out_class = (pred > 0.5).float()      
            correct += (out_class==y).sum().item()
            
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [None]:
# model = SentimentCLF(n_units=512).to(device)
# loss_fn   = nn.BCELoss()
# optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# epochs = 15
# for t in range(epochs):
#     print(f"Epoch {t+1}\n-------------------------------")
#     train(train_dataloader, model, loss_fn, optimizer)
#     #test(test_dataloader, model, loss_fn)
# print("Done!")

# for dataloader in [test_dataloader, v1_test_dataloader, v2_test_dataloader]:
#     test(dataloader=dataloader, model=model, loss_fn=loss_fn)

In [None]:
# Ask PyTorch to release GPU memory that its caching allocator holds but is
# not currently using, before the skorch experiments below.
torch.cuda.empty_cache()

In [None]:
from skorch import NeuralNetClassifier

# skorch wrapper around SentimentCLF so the network can be used like a
# scikit-learn estimator (fit / predict / grid search).
# NOTE(review): NeuralNetClassifier.predict() takes the argmax over the module's
# output columns. SentimentCLF emits a single sigmoid column, so predict() is
# expected to always return class 0 - threshold predict_proba() at 0.5 instead,
# or consider skorch's NeuralNetBinaryClassifier. Confirm against the skorch docs.
net = NeuralNetClassifier(
    module=SentimentCLF,
    module__n_units=512,
    lr = 0.001,
    criterion=nn.BCELoss,
    # Reuse the notebook's detected device instead of hard-coding 'cuda', so
    # the cell also runs on machines without a GPU.
    device=device,
    max_epochs=15,
    optimizer=torch.optim.Adam,
    batch_size=64,
)

In [None]:
# float32 tensors for skorch. The labels get a trailing axis, (n,) -> (n, 1), so
# their shape matches the single-column output of SentimentCLF.
X_train_sk = torch.from_numpy(X_train).float()
y_train_sk = torch.from_numpy(y_train).float().unsqueeze(1)

In [None]:
# Instantiate the wrapped network ahead of fitting; ending the cell with the
# bare `net.module_` makes the notebook display the instantiated module.
net.initialize()
net.module_

In [None]:
# Train on the encoded IMDB training set. skorch's internal train/validation
# split is still at its default here; it is only switched off further down,
# right before the grid search.
net.fit(X_train_sk, y_train_sk)

In [None]:
# Accuracy of the skorch model on the IMDB test split.
X_test_sk = torch.from_numpy(X_test).to(device=device, dtype=torch.float32)
# Kept for any later cell that expects it; the score below uses the NumPy labels.
y_test_sk = torch.from_numpy(y_test).to(device='cpu', dtype=torch.float32)

# The module emits a single probability column. NeuralNetClassifier.predict()
# takes an argmax over the columns, which is always 0 for a one-column output,
# so the class is derived by thresholding the probability at 0.5 instead.
pred = (net.predict_proba(X_test_sk) > 0.5).astype(int).ravel()
accuracy_score(y_test, pred)

In [None]:
# Accuracy of the skorch model on the polarity v1 snippets.
X_test_1_sk = torch.from_numpy(X_v1_test).to(device=device, dtype=torch.float32)

# Threshold the single probability column at 0.5; predict() would take an
# argmax over one column and therefore always answer class 0.
pred_v1 = (net.predict_proba(X_test_1_sk) > 0.5).astype(int).ravel()
accuracy_score(y_v1_test, pred_v1)

In [None]:
from sklearn.model_selection import GridSearchCV

def _thresholded_accuracy(estimator, X, y):
    """Accuracy for a model with one probability column, thresholded at 0.5.

    Used instead of ``scoring='accuracy'``: that scorer calls
    ``estimator.predict()``, which for this skorch classifier takes an argmax
    over a single output column and so always predicts class 0.
    """
    y_true = np.asarray(y).ravel()
    y_hat = (estimator.predict_proba(X) > 0.5).ravel().astype(y_true.dtype)
    return accuracy_score(y_true, y_hat)


# deactivate skorch-internal train-valid split and verbose logging
net.set_params(train_split=False, verbose=0)

# 2 learning rates x 3 epoch budgets x 3 hidden widths, each scored with 3-fold CV.
param_grid = {
    'lr': [0.001, 0.01],
    'max_epochs': [10, 15, 20],
    'module__n_units': [100, 500, 1000]
}
gs = GridSearchCV(
    net,
    param_grid=param_grid,
    refit=False,
    cv=3,
    scoring=_thresholded_accuracy,
    verbose=2,
)

gs.fit(X_train_sk, y_train_sk)
#print("best score: {:.3f}, best params: {}".format(gs.best_score_, gs.best_params_))

In [None]:
# Report the best score found by the grid search together with the parameter
# combination that produced it.
print("best score: {:.3f}, best params: {}".format(gs.best_score_, gs.best_params_))

# Bare expression: the notebook also displays the winning parameters as a dict.
gs.best_params_