## Word embeddings and sentiment

In [1]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz -O ./data/dataset.tar.gz

--2020-12-03 11:05:02--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘./data/dataset.tar.gz’


2020-12-03 11:06:55 (736 KB/s) - ‘./data/dataset.tar.gz’ saved [84125825/84125825]



In [1]:
import numpy as np
import matplotlib.pyplot as plt
import glob

from gensim.utils import tokenize, deaccent, simple_preprocess
from collections import Counter
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score 

from sklearn import preprocessing
from sklearn.metrics.pairwise import cosine_similarity
import time
import copy

In [2]:
class MyCorpus(object):
    """An interator that yields sentences (lists of str)."""
    
    def __init__(self, files):
        self.files = files
            
    def __iter__(self):
        for file in self.files:
            
            text = open( file ).read().lower()
            
            yield simple_preprocess(text)


In [3]:
positive_train = glob.glob("./data/aclImdb/train/pos/*.txt")
negative_train = glob.glob("./data/aclImdb/train/neg/*.txt")
#negative_train[0:5], positive_train[0:5]

positive_test = glob.glob("./data/aclImdb/test/pos/*.txt")
negative_test = glob.glob("./data/aclImdb/test/neg/*.txt")
len(negative_test), len(positive_test)

(12500, 12500)

In [4]:
sentences = MyCorpus(positive_train + negative_train)
for s in sentences:
    print(s)
    break

['for', 'movie', 'that', 'gets', 'no', 'respect', 'there', 'sure', 'are', 'lot', 'of', 'memorable', 'quotes', 'listed', 'for', 'this', 'gem', 'imagine', 'movie', 'where', 'joe', 'piscopo', 'is', 'actually', 'funny', 'maureen', 'stapleton', 'is', 'scene', 'stealer', 'the', 'moroni', 'character', 'is', 'an', 'absolute', 'scream', 'watch', 'for', 'alan', 'the', 'skipper', 'hale', 'jr', 'as', 'police', 'sgt']


In [5]:
model = Word2Vec( min_count=5, workers=5, size=200) 
model.build_vocab(sentences)

In [6]:
def file_to_vector( text, model, D,  nwords=1000 ):
    words = simple_preprocess(open(text).read())[0:nwords]
    
    c = 0 
    v = np.zeros(D)
    for word in words:
        if word in model.wv:
            c +=1 
            v+= model.wv[word]
        
      
    return v/c

In [7]:
D = 200
nwords = 500


In [8]:
X_pos = np.zeros( (len(positive_train), D))
y_pos = np.ones( len(positive_train) )

for idx,f in enumerate(positive_train):
    X_pos[idx,:] = file_to_vector(f, model, D=D, nwords=nwords )

In [9]:
 
X_neg = np.zeros( (len(negative_train), D))
y_neg = np.zeros( len(negative_train) )

for idx,f in enumerate(negative_train):
    X_neg[idx,:] = file_to_vector(f, model, D=D, nwords=nwords )

In [10]:
 
X = np.concatenate( (X_pos, X_neg) , axis=0)
y= np.concatenate(  (y_pos, y_neg) , axis=0)

In [11]:
X = preprocessing.scale(X, axis=0) # zero mean, unit variance for each vector

In [12]:

X_train, X_test, y_train , y_test = train_test_split( X, y, random_state =42)
X_train.shape, X_test.shape

((18750, 200), (6250, 200))

In [13]:
clf = LogisticRegression(max_iter = 1500, random_state = 42,fit_intercept=True) 
clf.fit( X_train, y_train)

LogisticRegression(max_iter=1500, random_state=42)

In [14]:
preds = clf.predict(X_test)
accuracy_score(y_test, preds)

0.7232

In [26]:
import torch
import torch.nn as nn

In [27]:
class NNClassifier(nn.Module):
    def __init__(self, D, n_hidden):
        super(NNClassifier, self).__init__()
        
        self.relu = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout()
        self.fc1 = nn.Linear( D, n_hidden )
        self.fc2 = nn.Linear( n_hidden, 1)
        
    
    def forward(self, X):
        batch_size = X.shape[0]
        out = self.fc1(X)
        out = self.relu(out)
        out = self.fc2(out)
        out = torch.sigmoid(out)
        return out
        



In [28]:


net = NNClassifier(D=200, n_hidden = 100)

x_in = torch.rand( size=(50, 200))
x_in.dtype, x_in.shape
#net(x_in)

(torch.float32, torch.Size([50, 200]))

In [29]:
word2idx= { w:i for i, w in enumerate(model.wv.vocab.keys()) }
idx2word = {i:w for w,i in word2idx.items()}

In [30]:
positive_train[0]

'./data/aclImdb/train/pos/4715_9.txt'

In [33]:
from torch.utils.data import Dataset, DataLoader
class TextDataset(Dataset):
    """Imbdb dataset."""

    def __init__(self, positives, negatives, word2idx, wv_model):

        self.dataset = positives + negatives
        self.word2idx = word2idx
     
        self.labels = [1 for _ in range(len(positives))] + [1 for _ in range(len(negatives))]
        self.w2v = wv_model
    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        text = open(self.dataset[idx]).read()
        s = simple_preprocess(text)
              
        vec = [self.w2v.wv[v] for v in s if v in word2idx]
        
        vec = np.mean( vec, axis =0)
        assert( vec.shape[0] == 200)
        
            
        return vec, torch.tensor( self.labels[idx], dtype=torch.float32)
            
train_dataset = TextDataset(positive_train, negative_train, word2idx, wv_model=model)
test_dataset = TextDataset(positive_test, negative_test, word2idx,  wv_model=model)

train_dataloader = DataLoader(train_dataset, batch_size=4,
                        shuffle=True, num_workers=0)

test_dataloader = DataLoader(train_dataset, batch_size=4,
                        shuffle=True, num_workers=0)


dataloaders = { "train": train_dataloader, "val":test_dataloader}

dataset_sizes = { "train": len(train_dataset), "val":len(test_dataset)}
train_dataset[0]


(array([ 4.49688989e-04, -4.01483645e-04, -2.59369175e-04, -4.35122609e-04,
         3.08975403e-04,  5.63730173e-05,  2.92383687e-04, -2.82398891e-04,
         4.41822747e-04, -6.36014374e-06,  6.39296559e-05, -3.48708818e-05,
        -1.64607816e-04,  3.76173120e-05, -1.02190672e-04,  4.68529470e-04,
        -4.02406877e-04,  7.83308715e-05, -1.93617045e-04, -2.47308082e-04,
        -2.07903184e-04, -6.85326086e-05, -3.91543435e-04, -2.35021886e-04,
        -6.11755240e-05, -3.52100789e-04,  1.34786669e-05,  1.35929600e-04,
        -3.67138360e-04, -2.85615912e-04, -4.19135700e-04, -3.44096552e-05,
         1.75669207e-04,  2.70006276e-04,  3.66209897e-05, -1.51392465e-04,
        -3.03276582e-04, -3.85350577e-05, -3.07914743e-04, -4.32111963e-04,
        -2.55237159e-04,  6.79154255e-05, -1.91538493e-04,  2.19883204e-05,
         5.11856197e-05,  2.13449719e-04, -2.10035534e-04, -2.43975839e-04,
         2.46141804e-04,  3.90571979e-04,  3.24069610e-04, -3.62127117e-04,
         3.8

In [34]:

x_in = torch.rand( size=(16,200))

x_in.dtype, x_in.shape#net(x_in).shape

(torch.float32, torch.Size([16, 200]))

In [35]:
net = NNClassifier(D=200, n_hidden = 100)


In [36]:
criterion = nn.BCELoss()
optimizer = torch.optim.Adam( net.parameters())

In [37]:
def train_model(model, criterion, optimizer, dataloaders, scheduler,  num_epochs=10):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                
                
                inputs = inputs.to(device)
                labels = labels.reshape(-1,1).to(device)
                
                print("inputs shape=>", inputs.shape)
                
                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
            if phase == 'train':
                if scheduler:
                    scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [38]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
train_model(net, criterion, optimizer, dataloaders, scheduler=None)

In [564]:
2

2