## Word embeddings and sentiment

In [1]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz -O ./data/dataset.tar.gz

--2020-12-03 11:05:02--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘./data/dataset.tar.gz’


2020-12-03 11:06:55 (736 KB/s) - ‘./data/dataset.tar.gz’ saved [84125825/84125825]



In [4]:
import numpy as np
import matplotlib.pyplot as plt
import glob

from gensim.utils import tokenize, deaccent, simple_preprocess
from collections import Counter
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score 

from sklearn import preprocessing
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
class MyCorpus(object):
    """An interator that yields sentences (lists of str)."""
    
    def __init__(self, files):
        self.files = files
            
    def __iter__(self):
        for file in self.files:
            
            text = open( file ).read().lower()
            
            yield simple_preprocess(text)


In [6]:
positive_train = glob.glob("./data/aclImdb/train/pos/*.txt")
negative_train = glob.glob("./data/aclImdb/train/neg/*.txt")
#negative_train[0:5], positive_train[0:5]

positive_test = glob.glob("./data/aclImdb/test/pos/*.txt")
negative_test = glob.glob("./data/aclImdb/test/neg/*.txt")
len(negative_test), len(positive_test)

(12500, 12500)

In [7]:
sentences = MyCorpus(positive_train + negative_train)
for s in sentences:
    print(s)
    break

['for', 'movie', 'that', 'gets', 'no', 'respect', 'there', 'sure', 'are', 'lot', 'of', 'memorable', 'quotes', 'listed', 'for', 'this', 'gem', 'imagine', 'movie', 'where', 'joe', 'piscopo', 'is', 'actually', 'funny', 'maureen', 'stapleton', 'is', 'scene', 'stealer', 'the', 'moroni', 'character', 'is', 'an', 'absolute', 'scream', 'watch', 'for', 'alan', 'the', 'skipper', 'hale', 'jr', 'as', 'police', 'sgt']


In [8]:
model = Word2Vec( min_count=5, workers=5, size=200) 
model.build_vocab(sentences)

In [9]:
def file_to_vector( text, model, D,  nwords=1000 ):
    words = simple_preprocess(open(text).read())[0:nwords]
    
    c = 0 
    v = np.zeros(D)
    for word in words:
        if word in model.wv:
            c +=1 
            v+= model.wv[word]
        
      
    return v/c

In [13]:
D = 200
nwords = 500


In [14]:
X_pos = np.zeros( (len(positive_train), D))
y_pos = np.ones( len(positive_train) )

for idx,f in enumerate(positive_train):
    X_pos[idx,:] = file_to_vector(f, model, D=D, nwords=nwords )

In [15]:
 
X_neg = np.zeros( (len(negative_train), D))
y_neg = np.zeros( len(negative_train) )

for idx,f in enumerate(negative_train):
    X_neg[idx,:] = file_to_vector(f, model, D=D, nwords=nwords )

In [16]:
 
X = np.concatenate( (X_pos, X_neg) , axis=0)
y= np.concatenate(  (y_pos, y_neg) , axis=0)

In [17]:
X = preprocessing.scale(X, axis=0) # zero mean, unit variance for each vector

In [20]:

X_train, X_test, y_train , y_test = train_test_split( X, y, random_state =42)
X_train.shape, X_test.shape

((18750, 200), (6250, 200))

In [21]:
clf = LogisticRegression(max_iter = 1500, random_state = 42,fit_intercept=True) 
clf.fit( X_train, y_train)

LogisticRegression(max_iter=1500, random_state=42)

In [22]:
preds = clf.predict(X_test)
accuracy_score(y_test, preds)

0.71808

In [23]:
import torch
import torch.nn as nn

In [332]:
MAX_EMBEDDING = 20
VOCAB_SIZE =  1000

class EmbeddingsClassifier(nn.Module):
    def __init__(self, D, vocab_size, max_embedding):
        super(EmbeddingsClassifier, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, D )
        self.conv1 = nn.Conv1d(max_embedding, 50, kernel_size=3, padding=1)
        #self.conv2 = nn.Conv1d(50, 25, kernel_size=3, padding=1)
        self.fc1 = nn.Linear( 50 *200, 1)
        
    
    def forward(self, X):
        batch_size = X.shape[0]
        out = self.embedding(X)
        out = self.conv1(out)
        out = out.view(batch_size, -1)
        out = self.fc1(out)
        out = torch.sigmoid(out)
        return out
        


In [333]:
MAX_EMBEDDING = 20
net = EmbeddingsClassifier(D=200, vocab_size=1000, max_embedding=MAX_EMBEDDING)

x_in = torch.randint( low=0, high=VOCAB_SIZE, size=(50,MAX_EMBEDDING))

x_in.dtype, x_in.shape
net(x_in).shape


torch.Size([50, 1])

In [334]:
word2idx= { w:i for i, w in enumerate(model.wv.vocab.keys()) }
idx2word = {i:w for w,i in word2idx.items()}

In [340]:
MAX_WORDS = 15
MAX_EMBEDDING = 15

In [341]:
positives = MyCorpus(positive_train)
pos = []
for s in positives:
    words = np.zeros( MAX_WORDS )
    for idx in range(min(MAX_WORDS, len(s))):    
        w =  word2idx.get(s[idx])
        words[idx] = w if w else 0
        
    pos.append( words[0:MAX_WORDS] )

negatives = MyCorpus(negative_train)
neg = []
for s in negatives:
    
    words = np.zeros( MAX_WORDS )
    for idx in range(min(MAX_WORDS, len(s))):    
        w =  word2idx.get(s[idx])
        words[idx] = w if w else 0
        
    neg.append( words[0:MAX_WORDS] )


In [368]:
positives = MyCorpus(positive_test)
pos_tests = []
for s in positives:
    words = np.zeros( MAX_WORDS )
    for idx in range(min(MAX_WORDS, len(s))):    
        w =  word2idx.get(s[idx])
        words[idx] = w if w else 0
        
    pos_tests.append( words[0:MAX_WORDS] )

negatives = MyCorpus(negative_test)
neg_tests = []
for s in negatives:
    
    words = np.zeros( MAX_WORDS )
    for idx in range(min(MAX_WORDS, len(s))):    
        w =  word2idx.get(s[idx])
        words[idx] = w if w else 0
        
    neg_tests.append( words[0:MAX_WORDS] )


In [369]:
len(pos_tests), len(neg_tests)

(12500, 12500)

In [358]:
train_set = pos + neg
labels = [1 for _ in range(12500)] + [0 for _ in range(12500)]

In [370]:
test_set = pos_tests + neg_tests
gt_labels = [1 for _ in range(len(pos_tests))] + [0 for _ in range(len(neg_tests))]
len(gt_labels)

25000

In [346]:
train_set[0]

array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
       13.,  0.])

In [347]:
net = EmbeddingsClassifier(D=200, vocab_size=len(word2idx), max_embedding=MAX_EMBEDDING)


In [350]:

x_in = torch.randint( low=0, high=VOCAB_SIZE, size=(16,MAX_EMBEDDING))

x_in.dtype, x_in.shape
net(x_in).shape

torch.Size([16, 1])

In [351]:
criterion = nn.BCELoss()
optimizer = torch.optim.Adam( net.parameters())

In [356]:
NUM_EPOCHS = 5
BATCH_SIZE = 5
epoch_loss = []
accs = []
for epoch in range(NUM_EPOCHS):
    losses = []
    
    net.train()
    for i in range(0, X_train.shape[0], BATCH_SIZE):
    #for i in range(0, 100, BATCH_SIZE):
        
       
            
        optimizer.zero_grad()
        x_in = torch.tensor( train_set[i:i+BATCH_SIZE], dtype=torch.int64 )
        target = torch.tensor( labels[i:i+BATCH_SIZE], dtype=torch.float32).reshape(x_in.shape[0],1)
        ret = net(x_in)
        
        loss = criterion(ret, target)

        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    
    

    epoch_loss.append(np.mean(losses))
    
    model.eval()
    for i in range(0, X_test.shape[0], BATCH_SIZE):
            
        optimizer.zero_grad()
        x_in = torch.tensor( test_set[i:i+BATCH_SIZE], dtype=torch.int64 )
        target = torch.tensor( gt_labels[i:i+BATCH_SIZE], dtype=torch.float32).reshape(x_in.shape[0],1)
        ret = net(x_in)
        
        loss = criterion(ret, target)

        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    
    
    #if epoch % 5 == 0 :
    print( f"epoch {epoch}, train loss={np.mean(epoch_loss):.3}")
    

  x_in = torch.tensor( train_set[i:i+BATCH_SIZE], dtype=torch.int64 )


epoch 0, train loss=0.214
epoch 1, train loss=0.249
epoch 2, train loss=0.284


KeyboardInterrupt: 

## ret.shape

In [272]:
x_in.shape

torch.Size([5, 15])

## target = torch.ones(5, dtype=torch.float32).reshape(5,1)
target

In [230]:
criterion(ret, target )

tensor(0.7038, grad_fn=<BinaryCrossEntropyBackward>)

In [264]:
x_in.shape

torch.Size([5, 15])

In [363]:
from sklearn.metrics import accuracy_score

In [381]:
net.eval()
preds = [] 
for i in range(0, len(test_set), BATCH_SIZE):

    x_in = torch.tensor( test_set[i:i+BATCH_SIZE], dtype=torch.int64 )
    target = torch.tensor( gt_labels[i:i+BATCH_SIZE], dtype=torch.float32).reshape(x_in.shape[0],1)
    ret = list( net(x_in).detach().numpy() )
    
    preds.extend(ret)

preds = [1 if r >0.5 else 0 for r in preds]

  x_in = torch.tensor( test_set[i:i+BATCH_SIZE], dtype=torch.int64 )


In [382]:
len(preds), len(gt_labels)

(25000, 25000)

In [384]:
accuracy_score(gt_labels,preds)

0.50364

[array([1.], dtype=float32),
 array([1.], dtype=float32),
 array([1.], dtype=float32),
 array([1.], dtype=float32),
 array([1.], dtype=float32),
 array([1.], dtype=float32),
 array([1.], dtype=float32),
 array([1.], dtype=float32),
 array([1.], dtype=float32),
 array([1.], dtype=float32),
 array([1.], dtype=float32),
 array([1.], dtype=float32),
 array([1.], dtype=float32),
 array([6.7851566e-12], dtype=float32),
 array([1.], dtype=float32),
 array([0.9948197], dtype=float32),
 array([1.], dtype=float32),
 array([1.], dtype=float32),
 array([1.], dtype=float32),
 array([1.], dtype=float32),
 array([1.], dtype=float32),
 array([1.], dtype=float32),
 array([1.], dtype=float32),
 array([1.], dtype=float32),
 array([1.], dtype=float32),
 array([1.819542e-12], dtype=float32),
 array([1.], dtype=float32),
 array([1.], dtype=float32),
 array([0.6484883], dtype=float32),
 array([1.], dtype=float32),
 array([0.9999988], dtype=float32),
 array([1.], dtype=float32),
 array([1.], dtype=float32),
 