## Embeddings

We intend to experiment with sentiment analysis (sentence classification) using different word
embeddings and the Sentiment140 dataset, which contains labeled Twitter tweets. More specifically,
we intend to train our own word embeddings using word2vec and then use them with a deep
learning sentiment analysis model (TBD: LSTM/GRU/Transformer) that we will create using
PyTorch. The main experiments we will perform are:
- Use PyTorch’s trainable embeddings with random initialization
- Use PyTorch’s trainable embeddings with our trained word2vec initialization
- Only use our trained word2vec embeddings as inputs

and then compare the accuracy results for the sentiment analysis task.

I would suggest training several sets of your own embeddings (experiment with the parameters to see how they influence the final vectors). Then, compare the sets of embeddings outside of your system (analogies, odd-one-out, ...), so you can set some expectations about which embeddings might yield the best result for your task. Finally, look at how the vectors perform in your system and analyze whether you expected such a result and why.

- https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/1%20-%20Simple%20Sentiment%20Analysis.ipynb

- https://github.com/hietalajulius/deep-learning-aalto/blob/master/Classifier.ipynb

- https://www.kaggle.com/paoloripamonti/twitter-sentiment-analysis

In [1]:
from collections import Counter
import csv
import math
import numpy as np
import pandas as pd
import spacy

import torch
import torchtext
import torchtext.vocab
from torchtext import datasets

In [13]:
# Load the preprocessed training tweets into a DataFrame for a quick sanity check.
df_train = pd.read_csv('data/processed_train.csv')
# Show (rows, columns) so the size of the training set is visible in the output.
print(df_train.shape)
# Last expression of the cell: the notebook renders the first rows as a table.
df_train.head()

(1280000, 2)


Unnamed: 0,target,text
0,0,yeah hmmmm lay low guess u ever wait till last...
1,1,excit new everyday sunday cd relea today serio...
2,1,good morn tweep tworld malibu string bikini fa...
3,0,feel sooo sick stress exam tomorrow
4,0,know feel get cowork sick oh well think got co...


## Pre-trained word embeddings
- word2vec
- GloVe

In [None]:
# Pre-trained GloVe vectors: the '6B' set with 100-dimensional embeddings.
# The torchtext class is spelled `GloVe`; `torchtext.vocab.Glove` does not exist
# and raises AttributeError.
glove = torchtext.vocab.GloVe(name='6B', dim=100)

In [None]:
def get_vector(embeddings, word):
    """Return the embedding vector stored for ``word``.

    Args:
        embeddings: Object exposing ``stoi`` (a word -> row-index mapping) and
            ``vectors`` (indexable by that row index).
        word: Token to look up.

    Returns:
        The entry of ``embeddings.vectors`` at index ``embeddings.stoi[word]``.

    Raises:
        KeyError: If ``word`` is not in ``stoi`` (when ``stoi`` is a plain dict).
    """
    # `vectors` is a lookup table, not a callable: index it with `[...]`.
    return embeddings.vectors[embeddings.stoi[word]]


def closest(embeddings, vector, n=6):
    """Return the ``n`` vocabulary words whose embeddings are nearest to ``vector``.

    Distance is the Euclidean (L2) norm of the difference, the same metric
    ``torch.dist`` uses by default.

    Args:
        embeddings: Object exposing ``vectors`` and ``itos``.
            Assumes ``vectors`` is a 2-D tensor with one row per word and that
            ``itos[i]`` is the word for row ``i`` -- TODO confirm for
            non-torchtext embedding objects.
        vector: 1-D tensor with the same dimensionality as the embedding rows.
        n: Maximum number of neighbours to return.

    Returns:
        A list of ``(word, distance)`` tuples sorted by ascending distance,
        with ``distance`` as a Python float. Contains at most ``n`` entries
        (fewer if the vocabulary is smaller); empty if ``n <= 0``.
    """
    if n <= 0:
        return []

    # One vectorised pass over the whole embedding matrix instead of a Python
    # loop issuing one torch.dist call per vocabulary word.
    distances = torch.norm(embeddings.vectors - vector, dim=1)

    # topk with largest=False yields the k smallest distances, already sorted.
    k = min(n, distances.numel())
    nearest = torch.topk(distances, k, largest=False)

    return [
        (embeddings.itos[index], distance)
        for distance, index in zip(nearest.values.tolist(), nearest.indices.tolist())
    ]


def analogy(embeddings, w1, w2, w3, n=6):
    """Answer "``w1`` is to ``w2`` as ``w3`` is to ?" with nearest-neighbour search.

    The query point is ``vec(w2) - vec(w1) + vec(w3)``. The three input words
    are removed from the candidates, so ``n + 3`` neighbours are requested to
    still leave up to ``n`` answers after filtering.

    Args:
        embeddings: Embedding object accepted by ``get_vector`` and ``closest``.
        w1, w2, w3: The three words of the analogy.
        n: Maximum number of answers to return.

    Returns:
        Up to ``n`` entries from ``closest``'s result, in the order ``closest``
        returned them, excluding ``w1``, ``w2`` and ``w3``.
    """
    query_words = [w1, w2, w3]

    vec_w2 = get_vector(embeddings, w2)
    vec_w1 = get_vector(embeddings, w1)
    vec_w3 = get_vector(embeddings, w3)
    target = vec_w2 - vec_w1 + vec_w3

    answers = []
    for candidate in closest(embeddings, target, n + 3):
        # candidate[0] is the word; skip the words that formed the query.
        if candidate[0] not in query_words:
            answers.append(candidate)

    return answers[:n]

In [None]:
# Sanity check: list the nearest neighbours of the vector for 'paper' (default n=6).
closest(glove, get_vector(glove, 'paper'))

In [None]:
# Sanity check: "moon is to night as sun is to ?" (queries night - moon + sun).
analogy(glove, 'moon', 'night', 'sun')

## Build vocab

In [15]:
# Disabled alternative Field configuration with explicit sos/eos/unk tokens,
# kept as an inert string literal.
"""
TEXT = torchtext.data.Field(tokenize= 'spacy',
                            init_token='< sos >',
                            eos_token='< eos >',
                            unk_token='< unk >',
                            tokenizer_language='en_core_web_sm',
                            lower=True)
"""
# Tweet text: tokenised with spaCy's small English model and lower-cased.
TEXT = torchtext.data.Field(
    tokenize='spacy',
    tokenizer_language='en_core_web_sm',
    lower=True,
)
# Sentiment label, stored as a float tensor.
LABEL = torchtext.data.LabelField(dtype=torch.float)

# (attribute name, field) pairs; the attribute names are what the batches
# expose later (batch.Sentiment / batch.SentimentText).
# NOTE(review): presumably matched to the CSV columns by position -- confirm
# against the processed_*.csv layout.
datafields = [
    ('Sentiment', LABEL),
    ('SentimentText', TEXT),
]

# Read both CSV files from data/, skipping their header rows.
train, test = torchtext.data.TabularDataset.splits(
    path='data/',
    train='processed_train.csv',
    test='processed_test.csv',
    format='csv',
    skip_header=True,
    fields=datafields,
)



In [16]:
# Cap on the number of distinct tokens kept in the text vocabulary.
MAX_VOCAB_SIZE = 25000

# Build both vocabularies from the training split only.
TEXT.build_vocab(train, max_size=MAX_VOCAB_SIZE)
LABEL.build_vocab(train)

# The recorded output reports 25002 text tokens: the 25000 kept words plus
# the '<unk>' and '<pad>' entries visible at the start of itos.
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

# 20 most frequent tokens with their counts.
print(TEXT.vocab.freqs.most_common(20))

# First 10 entries of the index -> token table.
print(TEXT.vocab.itos[:10])

# Label string -> index mapping.
print(LABEL.vocab.stoi)

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2
[('go', 110885), ('get', 88497), ('day', 84970), ('good', 74173), ('work', 70162), ('like', 66701), ('love', 66258), ('quot', 58785), ('got', 56871), ('today', 54828), ('time', 52958), ('nt', 49662), ('thank', 47606), ('lol', 47584), ('back', 46035), ('want', 45952), ('one', 45750), ('miss', 45481), ('i', 45402), ('u', 44507)]
['<unk>', '<pad>', 'go', 'get', 'day', 'good', 'work', 'like', 'love', 'quot']
defaultdict(None, {'0': 0, '1': 1})


In [17]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Hold out 10% of the training examples for validation. The training loop
# evaluates on `valid_iterator` after every epoch, so it has to be created
# here alongside the train and test iterators.
train_data, valid_data = train.split(split_ratio=0.9)

# Minimise padding for each sentence: the sort key buckets examples of
# similar length into the same batch.
train_iterator, valid_iterator, test_iterator = torchtext.data.BucketIterator.splits(
                                (train_data, valid_data, test),
                                batch_size=64,
                                sort_key=lambda x: len(x.SentimentText),
                                sort_within_batch=False,
                                device=device)

cuda


## RNN model

In [18]:
import torch.nn as nn

class GRU(nn.Module):
    """Embedding -> (multi-layer, optionally bidirectional) GRU -> linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector (GRU input size).
        hidden_dim: GRU hidden size per direction.
        output_dim: Number of output units of the final linear layer.
        n_layers: Number of stacked GRU layers.
        bidirectional: Whether the GRU runs in both directions.
        dropout: Dropout probability applied to the embeddings, between GRU
            layers (only when ``n_layers > 1``) and to the final hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim,
                 output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers=n_layers,
                          bidirectional=bidirectional,
                          # Inter-layer dropout is meaningless for a single
                          # layer and makes nn.GRU emit a warning.
                          dropout=dropout if n_layers > 1 else 0.0)
        # The classifier input is one hidden state per direction, so its width
        # must follow the `bidirectional` flag instead of assuming two.
        n_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * n_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return one row of ``output_dim`` logits per sequence in the batch.

        Args:
            text: Integer token indices. Assumed to be laid out as
                (seq_len, batch), since the GRU is built without
                ``batch_first`` -- TODO confirm against the iterator.

        Returns:
            Tensor of shape (batch, output_dim); the batch dimension is kept
            even when it is 1 so callers can always ``squeeze(1)``.
        """
        embedded_text = self.dropout(self.embedding(text))
        _, hidden = self.gru(embedded_text)

        if self.gru.bidirectional:
            # Last two slices of the final hidden state: per nn.GRU's layout
            # these are the top layer's two directions.
            last_hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            last_hidden = hidden[-1]

        return self.fc(self.dropout(last_hidden))

In [19]:
# Hyper-parameters for the classifier.
INPUT_DIM = len(TEXT.vocab)  # one embedding row per vocabulary entry
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1  # a single output unit per example

# Two-layer bidirectional GRU classifier with light dropout.
model = GRU(vocab_size=INPUT_DIM, 
            embedding_dim=EMBEDDING_DIM, 
            hidden_dim=HIDDEN_DIM, 
            output_dim=OUTPUT_DIM, 
            n_layers=2,
            bidirectional=True,
            dropout=0.1)
# Print the module tree to check the layer sizes.
print(model)

GRU(
  (embedding): Embedding(25002, 100)
  (gru): GRU(100, 256, num_layers=2, dropout=0.1, bidirectional=True)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)


In [None]:
# Use pretrained embeddings (currently disabled; uncomment to enable).
# Kept as `#` comments rather than a bare string literal so the cell does not
# echo the snippet as its output.
# NOTE(review): TEXT.vocab.vectors is presumably only populated when
# build_vocab is given pretrained vectors -- confirm before enabling.
#
# pretrained_embeddings = TEXT.vocab.vectors
# model.embedding.weight.data.copy_(pretrained_embeddings)
#
# unk_idx = TEXT.vocab.stoi[TEXT.unk_token]
# pad_idx = TEXT.vocab.stoi[TEXT.pad_token]
#
# Zero the rows for the unknown and padding tokens.
# model.embedding.weight.data[unk_idx] = torch.zeros(EMBEDDING_DIM)
# model.embedding.weight.data[pad_idx] = torch.zeros(EMBEDDING_DIM)

In [20]:
import torch.optim as optim

# Place the model and the loss on the selected device first, so the optimizer
# below is built over parameters that already live there.
model = model.to(device)
criterion = nn.BCEWithLogitsLoss().to(device)

# Plain SGD over every model parameter.
optimizer = optim.SGD(model.parameters(), lr=1e-3)


In [21]:
def binary_accuracy(preds, y):
    """
    Fraction of predictions in the batch that match the labels.

    A batch with 8 of 10 correct yields 0.8 (a ratio, not the count 8).
    `preds` holds raw logits; `y` holds the target labels.
    """
    # Map logits through a sigmoid, then round to the nearest integer.
    predicted_labels = torch.sigmoid(preds).round()
    # Element-wise match, cast to float so the hits can be summed and divided.
    hits = predicted_labels.eq(y).float()
    return hits.sum() / len(hits)

In [22]:
def train(model, iterator, optimizer, criterion):
    """Run one training pass over ``iterator`` and update ``model`` in place.

    Each batch must expose ``SentimentText`` (model input) and ``Sentiment``
    (targets). Returns ``(mean_loss, mean_accuracy)``, both averaged over the
    number of batches.
    """
    model.train()  # switch the model to training mode

    total_loss = 0
    total_acc = 0

    for batch in iterator:
        optimizer.zero_grad()

        # Drop dimension 1 of the model output so it lines up with the targets.
        logits = model(batch.SentimentText).squeeze(1)
        targets = batch.Sentiment

        loss = criterion(logits, targets)
        acc = binary_accuracy(logits, targets)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_acc += acc.item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches

def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating any weights.

    Each batch must expose ``SentimentText`` (model input) and ``Sentiment``
    (targets). Returns ``(mean_loss, mean_accuracy)``, both averaged over the
    number of batches.
    """
    model.eval()  # switch the model to evaluation mode

    total_loss = 0
    total_acc = 0

    # Gradient tracking is switched off for the whole pass.
    with torch.no_grad():
        for batch in iterator:
            # Drop dimension 1 of the model output so it lines up with the targets.
            logits = model(batch.SentimentText).squeeze(1)
            targets = batch.Sentiment

            loss = criterion(logits, targets)
            acc = binary_accuracy(logits, targets)

            total_loss += loss.item()
            total_acc += acc.item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches

In [23]:
import time

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Both parts are truncated toward zero; returns ``(minutes, seconds)``.
    """
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    # Whatever is left once the whole minutes are removed.
    seconds = int(elapsed - 60 * minutes)
    return minutes, seconds

In [24]:
# Number of full passes over the training iterator.
N_EPOCHS = 1

# Lowest validation loss seen so far; starts at infinity so the first epoch
# always counts as an improvement.
best_valid_loss = float('inf')

# NOTE: `valid_iterator` must already be defined by an earlier cell before
# this loop runs.
for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    # One training pass, then one evaluation pass on the validation data.
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint the weights only when the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'sent_model.pt')
    
    # Per-epoch report: wall time, then train and validation loss/accuracy.
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

KeyboardInterrupt: 

In [None]:
# Restore the best checkpoint. The training loop saves it as 'sent_model.pt',
# so that is the file to load here (not 'tut1-model.pt').
model.load_state_dict(torch.load('sent_model.pt'))

# Final score on the held-out test iterator.
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')