In [18]:
import os
from dataset_utils import *

# Build the IMDB train/test folder paths underneath the dataset root.
train_path, test_path = (
    os.path.join(IMDB_DATA_PATH, split_name) for split_name in ("train", "test")
)

"""
For details on implementation of dataset loading and other utils check out dataset_utils.py

CONVENTION: label 0 --> negative review
            label 1 --> positive review
"""

# Main training set - Large Movie Review Dataset (IMDB).
# The loader result is unpacked as a (texts, labels) pair per split.
(
    (imdb_train_texts, imdb_train_labels),
    (imdb_test_texts, imdb_test_labels),
) = load_imdb_dataset(train_path=train_path, test_path=test_path)

# Review Polarity datasets, one (texts, labels) pair per version.
(
    (v1_texts, v1_labels),
    (v2_texts, v2_labels),
) = load_polarity(v1_path=POLARITY_v1_DATA_PATH, v2_path=POLARITY_v2_DATA_PATH)

# Rotten Tomatoes critic dataset: a training portion plus four test subsets.
(
    rotten_train_texts,
    rotten_train_labels,
    short_rotten_test,
    random_rotten_test,
    pos_rotten_test,
    neg_rotten_test,
) = load_rotten_split(ROTTEN_PATH, n_train_samples=12000)

In [19]:
from sklearn.model_selection import train_test_split

# `random` is not imported explicitly in the visible cells (it may only be
# leaking in through the star import), so bring it into scope here.
import random

# Hold out 33% of the polarity v1 data; the rest joins the training pool.
train_texts_v1, test_texts_v1, train_labels_v1, test_labels_v1 = train_test_split(
    v1_texts, v1_labels, test_size=0.33, random_state=42
)

# Training pool = IMDB train + Rotten Tomatoes train + polarity v1 train.
train_texts = imdb_train_texts + rotten_train_texts + train_texts_v1
train_labels = imdb_train_labels + rotten_train_labels + train_labels_v1

# A length mismatch would silently pair reviews with the wrong labels.
assert len(train_texts) == len(train_labels), (
    f"texts/labels length mismatch: {len(train_texts)} vs {len(train_labels)}"
)

# Shuffle texts and labels together as pairs so they cannot drift apart.
# The permutation drawn by random.shuffle depends only on the seed and the
# list length, so the resulting order matches shuffling each list separately
# after reseeding with the same seed.
seed = 10
random.seed(seed)
paired_examples = list(zip(train_texts, train_labels))
random.shuffle(paired_examples)
train_texts = [text for text, _ in paired_examples]
train_labels = [label for _, label in paired_examples]

print(f"Training set size: {len(train_texts)}")

Training set size: 44143


In [20]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn a lower-cased unigram+bigram vocabulary (capped at 50,000 entries)
# from the training texts.
vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    lowercase=True,
    max_features=50000,
)
vectorizer.fit(train_texts)

vocab = vectorizer.vocabulary_

# Re-number the n-grams 0..len(vocab)-1, keeping the relative order of the
# vectorizer's own indices.
# NOTE(review): id 0 is therefore a real n-gram, while vectorize_text and
# pad_sequences also use 0 for unknown/padding positions - confirm that this
# collision is intended.
ngrams_by_index = sorted(vocab, key=vocab.get)
remapped_vocab = {ngram: position for position, ngram in enumerate(ngrams_by_index)}

# Callable that turns raw text into the list of n-grams the vectorizer counts.
analyzer = vectorizer.build_analyzer()

In [21]:
import torch
from torch import nn
import torch.nn.functional as F

# Target length handed to pad_sequences for every encoded review.
sequence_target_length = 350

# Embedding table: one row per entry of the remapped n-gram vocabulary.
vocab_size = len(remapped_vocab)
embedding_dim = 400

# Settings of the single 1-D convolution layer.
kernel_size = 4
out_channels = 250

class SentimentCNN(nn.Module):
    """Embedding -> Conv1d -> max-pool -> global max -> linear logit.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        out_channels: number of convolution filters.
        kernel_size: width of the convolution window.

    Any argument left as ``None`` falls back to the module-level variable of
    the same name, so ``SentimentCNN()`` keeps working exactly as before while
    explicit values (e.g. skorch ``module__embedding_dim=...``) are possible.

    ``forward`` takes an integer tensor of token ids shaped (batch, seq_len)
    and returns raw logits shaped (batch, 1); no sigmoid is applied here.
    """

    def __init__(self, vocab_size=None, embedding_dim=None, out_channels=None, kernel_size=None):
        super().__init__()

        # Resolve omitted hyperparameters from the defining module's globals.
        # globals() is needed because the parameter names shadow those globals.
        module_scope = globals()
        if vocab_size is None:
            vocab_size = module_scope["vocab_size"]
        if embedding_dim is None:
            embedding_dim = module_scope["embedding_dim"]
        if out_channels is None:
            out_channels = module_scope["out_channels"]
        if kernel_size is None:
            kernel_size = module_scope["kernel_size"]

        # Layer names and creation order are kept unchanged so that parameter
        # initialisation and previously saved state dicts stay compatible.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.conv = nn.Conv1d(in_channels=embedding_dim, out_channels=out_channels, kernel_size=kernel_size)
        self.pool = nn.MaxPool1d(kernel_size=2)

        # NOTE(review): `flatten` and `linear_stack` are never called in
        # forward(); they are kept only so existing checkpoints still load.
        self.flatten = nn.Flatten()
        self.linear_stack = nn.Sequential(
            nn.Linear(in_features=out_channels, out_features=100),
            nn.ReLU(),
            nn.Linear(in_features=100, out_features=1)
        )
        self.fc1 = nn.Linear(in_features=out_channels, out_features=1)

    def forward(self, x):
        """Return one logit per sequence for a (batch, seq_len) id tensor."""
        embed = self.embedding(x)            # (batch, seq_len, embedding_dim)
        embed = embed.permute(0, 2, 1)       # Conv1d wants (batch, channels, length)
        conved = self.conv(embed)            # (batch, out_channels, conv_len)
        pooled = self.pool(conved)           # pairwise max along the length axis
        pooled, _ = pooled.max(dim=2)        # global max -> (batch, out_channels)
        return self.fc1(pooled)              # (batch, 1)

In [22]:
def vectorize_text(text, analyzer, vocab):
    """Encode the bigrams of `text` as a 1-D int64 tensor of vocabulary ids.

    Args:
        text: raw string to encode.
        analyzer: callable mapping a string to a list of n-gram strings.
        vocab: mapping from n-gram string to integer id.

    Returns:
        torch.Tensor of dtype int64 with one entry per bigram, in analyzer
        order. Bigrams missing from `vocab` are encoded as 0. A text without
        bigrams yields an empty int64 tensor.

    NOTE(review): only n-grams made of exactly two whitespace-separated
    tokens are kept; unigrams are dropped - confirm that this is intended.
    """
    bigrams = [ngram for ngram in analyzer(text) if len(ngram.split()) == 2]

    # Unknown bigrams fall back to id 0.
    indices = [vocab.get(bigram, 0) for bigram in bigrams]

    # An explicit dtype keeps the empty case integer-typed as well
    # (torch.tensor([]) would otherwise default to float32).
    return torch.tensor(indices, dtype=torch.int64)

In [23]:
# for idx in v:
#     if idx in remapped_vocab.values():
#         print(list(remapped_vocab.keys())[list(remapped_vocab.values()).index(idx)])

In [24]:
# Encode every training review as a 1-D tensor of vocabulary ids.
vectorized_train = [
    vectorize_text(review, analyzer=analyzer, vocab=remapped_vocab)
    for review in train_texts
]

In [25]:
def pad_sequences(tensor_list, target_length, padding_value=0):
    """Stack 1-D id tensors into one (len(tensor_list), target_length) batch.

    Args:
        tensor_list: sequence of 1-D tensors of arbitrary length.
        target_length: number of columns of the result.
        padding_value: value used to fill positions past a tensor's end.

    Returns:
        int64 tensor of shape (len(tensor_list), target_length). Shorter
        tensors are right-padded with `padding_value`; longer ones are cut
        after `target_length` entries. An empty `tensor_list` gives an empty
        (0, target_length) tensor instead of raising.
    """
    # Pre-filling with the padding value means only real tokens get copied.
    batch = torch.full(
        (len(tensor_list), target_length), padding_value, dtype=torch.int64
    )

    for row, tensor in enumerate(tensor_list):
        # Explicit truncation; previously this only happened implicitly
        # through a negative pad width.
        kept = tensor[:target_length].to(dtype=torch.int64)
        batch[row, :kept.shape[0]] = kept

    return batch

In [26]:
# Bring every encoded review to the common length and stack them into one tensor.
padded_train = pad_sequences(
    tensor_list=vectorized_train,
    target_length=sequence_target_length,
)

In [27]:
# Default to the CPU and switch to the GPU only when CUDA is usable.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

print(f"Using: {device}")

Using: cuda


In [28]:
from torch.utils.data import Dataset, DataLoader

class TrainDataset(Dataset):
    """Map-style dataset pairing pre-encoded inputs with float labels.

    Args:
        tensor_list: indexable collection of input samples.
        train_labels: one label per sample; stored as a float32 tensor of
            shape (n_samples, 1).

    Raises:
        ValueError: if the number of samples and labels differ.
    """

    def __init__(self, tensor_list, train_labels):
        self.X = tensor_list
        self.y = torch.tensor(train_labels, dtype=torch.float32).unsqueeze(1)

        # Fail early: a mismatch would otherwise only show up as an IndexError
        # (or silently unused labels) while iterating.
        if len(self.X) != len(self.y):
            raise ValueError(
                f"Got {len(self.X)} samples but {len(self.y)} labels"
            )

    def __getitem__(self, idx):
        """Return the (input, label) pair at position `idx`."""
        return self.X[idx], self.y[idx]

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

#train_dataset = TrainDataset(tensor_list=padded_train, train_labels=train_labels)

#train_dataloader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

In [29]:
# Labels as a float32 column of shape (n_samples, 1), to line up with the
# model's single-logit output.
label_vector = torch.tensor(train_labels, dtype=torch.float32)
y = label_vector[:, None]
del label_vector

In [30]:
from skorch import NeuralNetClassifier
from skorch.callbacks import Checkpoint, LoadInitState, LRScheduler
from torch.optim.lr_scheduler import ReduceLROnPlateau, LinearLR, StepLR, ExponentialLR

import re

model_dirs = sorted(os.listdir('models'))

#top directory regards vectorizer settings
# NOTE(review): the directory is picked by its position in the sorted listing;
# adding or removing entries under models/ changes which one index 4 selects.
vect_dir_idx = 4
topdir = model_dirs[vect_dir_idx]

#experiments within chosen vectorizer settings directory
experiments = sorted(os.listdir(f'models/{topdir}'))

# Parse the complete trailing number of each entry and take the largest one.
# Reading only the last character of the lexicographically last name breaks
# from exp10 onwards ("exp10" sorts before "exp2" and would parse as 0).
# Entries without a trailing number are ignored. With no usable entries we
# start at 1, the same value as for an empty directory.
experiment_numbers = [
    int(suffix.group())
    for suffix in (re.search(r'\d+$', name) for name in experiments)
    if suffix
]
curr_number = max(experiment_numbers, default=1)

next_number = curr_number + 1

dirname_curr=f'models/{topdir}/exp{curr_number}'
f_pickle_curr=f'exp{curr_number}.pkl'

dirname_next=f'models/{topdir}/exp{next_number}'
f_pickle_next=f'exp{next_number}.pkl'

#checkpoints for saving the model during training. cp_current corresponds to the last experiment.
#When we switch the callback to cp_next, this starts a new experiment

cp_current = Checkpoint(monitor='valid_loss_best', dirname=dirname_curr, f_pickle=f_pickle_curr)
cp_next    = Checkpoint(monitor='valid_loss_best', dirname=dirname_next, f_pickle=f_pickle_next)
# Resume from the state stored by cp_current when training begins.
load_state = LoadInitState(cp_current)

# Alternative learning-rate schedules; none is in the active callbacks list below.
lr_scheduler = ('lr_scheduler', LRScheduler(policy=ReduceLROnPlateau, mode='min', factor=0.1, patience=5))
lr_sched_step = ('lr_scheduler', LRScheduler(policy=StepLR, step_size=10, gamma=0.05))
lr_sched_linear = ('lr_scheduler', LRScheduler(policy=LinearLR, start_factor=0.3, end_factor=1, total_iters=7))
#lr_sched_exp = ('lr_scheduler', LRScheduler(policy=ExponentialLR, g))

# The module emits raw logits, hence BCEWithLogitsLoss as the criterion.
net = NeuralNetClassifier(
    module=SentimentCNN,
    lr = 0.001,
    criterion=nn.BCEWithLogitsLoss,   
    device=device,
    max_epochs=15,
    optimizer=torch.optim.Adam,
    #optimizer__momentum=0.99,
    #optimizer__weight_decay=0.001,
    batch_size=64,
    iterator_train__shuffle=True,
    callbacks = [cp_current, load_state],
    #callbacks = [cp_next],
    #callbacks=[cp_current]
)

net.fit(X=padded_train, y=y)

  epoch    train_loss    valid_acc    valid_loss    cp      dur
-------  ------------  -----------  ------------  ----  -------
      1        0.6146       0.6975        0.5867     +  22.2323
      2        0.3999       0.7209        0.5877        21.5989
      3        0.1901       0.7286        0.7116        21.6299
      4        0.0830       0.7379        0.7733        21.6816
      5        0.0786       0.7428        0.9660        21.6358
      6        0.0466       0.7464        0.9318        21.6135
      7        0.0455       0.7440        0.9228        21.6552
      8        0.0452       0.7481        0.9694        21.7613


<class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=SentimentCNN(
    (embedding): Embedding(50000, 400)
    (conv): Conv1d(400, 250, kernel_size=(4,), stride=(1,))
    (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (flatten): Flatten(start_dim=1, end_dim=-1)
    (linear_stack): Sequential(
      (0): Linear(in_features=250, out_features=100, bias=True)
      (1): ReLU()
      (2): Linear(in_features=100, out_features=1, bias=True)
    )
    (fc1): Linear(in_features=250, out_features=1, bias=True)
  ),
)