In [16]:
#!mkdir ./data
#!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz -O ./data/dataset.tar.gz
#%cd data~
#!tar xvfz dataset.tar.gz
#%cd ..


# Train a neural net to predict sentiment using embeddings

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import glob

from gensim.utils import tokenize, deaccent, simple_preprocess
from collections import Counter
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score 

from sklearn import preprocessing
from sklearn.metrics.pairwise import cosine_similarity
import time
import copy
import torch

In [2]:
positive_train = glob.glob("./data/aclImdb/train/pos/*.txt")
negative_train = glob.glob("./data/aclImdb/train/neg/*.txt")

positive_test = glob.glob("./data/aclImdb/test/pos/*.txt")
negative_test = glob.glob("./data/aclImdb/test/neg/*.txt")
len(negative_test), len(positive_test), len(positive_train), len(negative_train)

(12500, 12500, 12500, 12500)

In [3]:
class MyCorpus(object):
    """An interator that yields sentences (lists of str)."""
    
    def __init__(self, files):
        self.files = files
            
    def __iter__(self):
        for file in self.files:
            
            text = open( file ).read().lower()
            
            yield simple_preprocess(text)


In [4]:
if False:
    model = Word2Vec( min_count=5, workers=5, size=200) 
    sentences = MyCorpus(positive_train + negative_train )
    model.build_vocab(sentences)
    sentences = MyCorpus(positive_test + positive_test)
    model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)


In [5]:
#model.save("./data/wv2.model")
model = Word2Vec.load( "./data/wv2.model" )

In [6]:
word2idx= { w:i for i, w in enumerate(model.wv.vocab.keys()) }
idx2word = {i:w for w,i in word2idx.items()}

len(word2idx)

28698

In [7]:
from torch.utils.data import Dataset, DataLoader
class TextDataset(Dataset):
    """Imbdb dataset."""

    def __init__(self, positives, negatives, word2idx, wv_model):

        self.dataset = positives + negatives
        self.word2idx = word2idx
     
        self.labels = [1 for _ in range(len(positives))] + [0 for _ in range(len(negatives))]
        self.w2v = wv_model
    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        text = open(self.dataset[idx]).read()
        s = simple_preprocess(text)
              
        vec = [self.w2v.wv[v] for v in s if v in word2idx]
        
        vec1 = np.mean( vec, axis =0)
        assert( vec1.shape[0] == 200)
        vec2 = np.max( vec, axis =0)
        vec3 = np.mean( vec, axis =0)
        
        retvec = np.concatenate( [vec1, vec2,vec3])
            
        return retvec, torch.tensor( self.labels[idx], dtype=torch.float32)
            

In [8]:

train_dataset = TextDataset(positive_train[0:500],
                            negative_train[0:500],
                            word2idx, wv_model=model)

test_dataset = TextDataset(positive_test[0:500],
                           negative_test[0:500],
                           word2idx,  wv_model=model)

train_dataloader = DataLoader(train_dataset, batch_size=128,
                        shuffle=True, num_workers=0)

test_dataloader = DataLoader(test_dataset, batch_size=128,
                        shuffle=True, num_workers=0)


dataloaders = { "train": train_dataloader, "val":test_dataloader}

dataset_sizes = { "train": len(train_dataset), "val":len(test_dataset)}

train_dataset[0][0]-train_dataset[1][0]

array([-0.30463973,  0.42246544, -0.2912863 , -0.2783561 ,  0.42620432,
        0.05244523, -0.06872007, -0.03618291, -0.2044119 , -0.4681223 ,
       -0.03055123, -0.2830056 ,  0.26370284,  0.09876372,  0.09896849,
       -0.027643  ,  0.21756378,  0.49818033, -0.06463417,  0.462838  ,
        0.06896935,  0.22945541, -0.04939801, -0.6296438 ,  0.14896312,
        0.1400828 ,  0.44591957,  0.00647832, -0.02713466,  0.24756284,
        0.08056816, -0.05831452,  0.07209346,  0.03072605,  0.08622102,
       -0.05628349,  0.06678642, -0.19799863,  0.46162486,  0.09927669,
       -0.0160488 , -0.05710422, -0.10724166,  0.2518264 , -0.02858362,
        0.22745375, -0.14321557,  0.02267244, -0.05551359,  0.23109427,
       -0.10249305,  0.04312664,  0.18204601,  0.11684529, -0.16727039,
       -0.06585053,  0.20509255,  0.21890089,  0.03161347, -0.17956921,
       -0.08080181, -0.02933438, -0.41047508, -0.08545903,  0.22769296,
       -0.19049394, -0.03005798, -0.24399011, -0.309684  , -0.25

In [9]:
dataset_sizes

{'train': 1000, 'val': 1000}

In [10]:
import torch
import torch.nn as nn

In [11]:
class NNClassifier(nn.Module):
    def __init__(self, D, n_hidden):
        super(NNClassifier, self).__init__()
        
        self.relu = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout()
        self.fc1 = nn.Linear( 3*D, n_hidden )
        self.fc2 = nn.Linear( n_hidden, 1)
        
    
    def forward(self, X):
        batch_size = X.shape[0]
        out = self.fc1(X)
        out = self.relu(out)
        out = self.dropout(out)
        ###
        out = self.fc2(out)
        out = torch.sigmoid(out)
        return out
        



In [12]:
def train_model(model, criterion, optimizer, dataloaders, scheduler,  num_epochs=10):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        
        model.train()
        
            # Iterate over data.
        epoch_loss = []
        for inputs, labels in dataloaders["train"]:

            inputs = inputs.to(device)
            labels = labels.reshape(-1,1).to(device)
            
            optimizer.zero_grad()
            outputs = model(inputs)
                    
         

            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            epoch_loss.append( loss.item( ))
        
        
        model.eval()
        
        val_losses = []
        targets = [] 
        predicted  = []
        for inputs, labels in dataloaders["val"]:
        
            inputs = inputs.to(device)
            labels = labels.reshape(-1,1).to(device)
            
            outputs  = model(inputs)
            loss = criterion(outputs, labels) 
            
            preds = outputs.clone().detach()
            preds[preds<0.5]  = 0
            preds[preds>=0.5] = 1 
            
             
            
            targets.extend( list(labels.view(-1).numpy()))
            predicted.extend( list(preds.view(-1).numpy()))
            
            
            val_losses.append(  loss.item() )
        epoch_acc = accuracy_score( targets, predicted)
            
            
        if epoch_acc> best_acc:
            best_acc = epoch_acc
        print( f"epoch {epoch}, mean loss = {np.mean(epoch_loss):.3}, validation loss={np.mean(val_losses):.3}, epoch acc={epoch_acc:.3}")

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    #model.load_state_dict(best_model_wts)
    return model

In [13]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [15]:
net = NNClassifier(D=200, n_hidden = 100)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam( net.parameters())# lr=0.1)

train_model(net, criterion, optimizer, dataloaders, scheduler=None, num_epochs=50)

Epoch 0/49
----------
epoch 0, mean loss = 0.709, validation loss=0.685, epoch acc=0.509
Epoch 1/49
----------
epoch 1, mean loss = 0.688, validation loss=0.677, epoch acc=0.548
Epoch 2/49
----------
epoch 2, mean loss = 0.68, validation loss=0.667, epoch acc=0.649
Epoch 3/49
----------
epoch 3, mean loss = 0.667, validation loss=0.661, epoch acc=0.604
Epoch 4/49
----------
epoch 4, mean loss = 0.651, validation loss=0.652, epoch acc=0.638
Epoch 5/49
----------
epoch 5, mean loss = 0.645, validation loss=0.64, epoch acc=0.665
Epoch 6/49
----------
epoch 6, mean loss = 0.64, validation loss=0.639, epoch acc=0.644
Epoch 7/49
----------
epoch 7, mean loss = 0.632, validation loss=0.639, epoch acc=0.622
Epoch 8/49
----------
epoch 8, mean loss = 0.622, validation loss=0.64, epoch acc=0.619
Epoch 9/49
----------
epoch 9, mean loss = 0.62, validation loss=0.615, epoch acc=0.682
Epoch 10/49
----------
epoch 10, mean loss = 0.589, validation loss=0.603, epoch acc=0.686
Epoch 11/49
----------
e

NNClassifier(
  (relu): ReLU()
  (dropout): Dropout(p=0.5)
  (fc1): Linear(in_features=600, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=1, bias=True)
)