# Sentiment Analysis of movie reviews on IMDB 

Implement the 'FastText' model from the paper https://arxiv.org/abs/1607.01759

# 1. Preparing Data

In [1]:
def generate_bigrams(x):
    n_grams = set(zip(*[x[i:] for i in range(2)]))
    for n_gram in n_grams:
        x.append(''.join(n_gram))
    return x

In [2]:
import torch
from torchtext import data, datasets

SEED=1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic= True

# python -m spacy download en / (run as Admin)
TEXT = data.Field(tokenize='spacy',preprocessing=generate_bigrams)
LABEL = data.LabelField(dtype=torch.float)

In [3]:
train_da, test_da = datasets.IMDB.splits(TEXT,LABEL)

In [4]:
import random

train_da, valid_da = train_da.split(random_state=random.seed(SEED))

Pre-trained word embeddings: glove.6B.100d (6 billion tokens and 100-dimensional)
https://nlp.stanford.edu/projects/glove/

In [5]:
MAX_VOCAB_SIZE = 25_000

TEXT.build_vocab(train_da, max_size=MAX_VOCAB_SIZE,
                vectors='glove.6B.100d',
                unk_init=torch.Tensor.normal_)
# By default, TorchText will initialize words only in your vocab (if not, then to zero). Here we
# instead initialize them randomly via Gaussian distribution.

In [6]:
LABEL.build_vocab(train_da)

In [7]:
BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, valid_iterator, test_iterator =data.BucketIterator.splits((train_da,valid_da,test_da),
                                                        batch_size=BATCH_SIZE,device=device)


# 2. Build the Model

only 2 layers that have parameters - the embedding layer and the linear layer.

In [8]:
import torch.nn as nn
import torch.nn.functional as F

In [9]:
class FastText(nn.Module):
    def __init__(self,vocab_size,embedding_dim,output_dim,pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,embedding_dim,padding_idx=pad_idx)
        self.fc = nn.Linear(embedding_dim,output_dim)
    def forward(self,text):
        embedded = self.embedding(text) # [sent len, batch size, emb dim]
        embedded = embedded.permute(1,0,2) # [batch_size,sent len, emb dim]
        pooled = F.avg_pool2d(embedded,(embedded.shape[1],1)).squeeze(1)
        return self.fc(pooled)

In [10]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM=100
OUTPUT_DIM=1
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = FastText(INPUT_DIM,EMBEDDING_DIM,OUTPUT_DIM,PAD_IDX)

In [11]:
pretrained_embeddings = TEXT.vocab.vectors # copy the pretrained vectors to embedding layer
model.embedding.weight.data.copy_(pretrained_embeddings) 

tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.7411,  0.1331, -1.6400,  ..., -0.6794, -1.0600, -0.3000],
        [ 0.1916, -0.0863,  1.2607,  ...,  0.1339,  1.6436,  0.0857],
        [-0.4685,  0.1938, -0.3339,  ...,  1.1546,  1.0953,  0.3244]])

In [12]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
# zero the initial weight of both unknown and padding tokens
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

# 3. Train the Model

In [13]:
import torch.optim as optim
optimizer = optim.Adam(model.parameters())

criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion=criterion.to(device)

In [14]:
def binary_accuracy(preds,y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc

In [15]:
def train(model,iterator,optimizer,criterion):
    epoch_loss=0
    epoch_acc=0
    
    model.train()
    
    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions,batch.label)
        acc = binary_accuracy(predictions,batch.label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    
    return epoch_loss/len(iterator), epoch_acc/len(iterator)

In [16]:
def evaluate(model,iterator,criterion):
    epoch_loss=0
    epoch_acc=0
    
    model.eval()
    
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions,batch.label)
            acc = binary_accuracy(predictions,batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss/len(iterator), epoch_acc/len(iterator)

In [17]:
N_EPOCHS = 5

best_valid_loss=float('inf')

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model,train_iterator,optimizer,criterion)
    valid_loss, valid_acc = evaluate(model,valid_iterator,criterion)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(),'C:/Users/timsh/FastText.pt')
        
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

	Train Loss: 0.686 | Train Acc: 60.50%
	 Val. Loss: 0.631 |  Val. Acc: 70.03%
	Train Loss: 0.645 | Train Acc: 74.67%
	 Val. Loss: 0.518 |  Val. Acc: 75.08%
	Train Loss: 0.571 | Train Acc: 79.73%
	 Val. Loss: 0.435 |  Val. Acc: 80.27%
	Train Loss: 0.496 | Train Acc: 83.98%
	 Val. Loss: 0.392 |  Val. Acc: 83.48%
	Train Loss: 0.431 | Train Acc: 86.95%
	 Val. Loss: 0.372 |  Val. Acc: 85.98%


In [18]:
model.load_state_dict(torch.load('C:/Users/timsh/FastText.pt'))
test_loss, test_acc = evaluate(model,test_iterator,criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.386 | Test Acc: 85.28%


# 4. User input test

In [19]:
import spacy
nlp=spacy.load('en')

def predict_sentiment(model,sentence):
    model.eval()
    tokenized = generate_bigrams([tok.text for tok in nlp.tokenizer(sentence)])
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    prediction = torch.sigmoid(model(tensor))
    return prediction.item()

In [20]:
predict_sentiment(model,'This film is terrible')

1.0740157652122662e-09

In [21]:
predict_sentiment(model,'This film is great')

1.0