## Embeddings

We intend to experiment with sentiment analysis (sentence classification) using different word
embeddings and the Sentiment140 dataset containing labeled Twitter tweets. More specifically,
we intend to train our own word embeddings using word2vec and then use those with a deep
learning sentiment analysis model (TBD: LSTM/GRU/Transformer) that we will create using
Pytorch. The main experiments we will perform are
- Use Pytorch’s trainable embeddings with random initialization
- Use Pytorch’s trainable embeddings with our trained word2vec initialization
- Only use our trained word2vec embeddings as inputs

and then compare the accuracy results for the sentiment analysis task.

I would suggest training several sets of your own embeddings (experiment with the parameters to see how they influence the final vectors). Then, compare the sets of embeddings outside of your system (analogies,odd-one-out...), so you can set some expectations about what embeddings might yield the best result for your task. Finally, look at how the vectors perform in your system and analyze if you expected such result and why.

- https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/1%20-%20Simple%20Sentiment%20Analysis.ipynb

- https://github.com/hietalajulius/deep-learning-aalto/blob/master/Classifier.ipynb

- https://www.kaggle.com/paoloripamonti/twitter-sentiment-analysis

In [1]:
from collections import Counter
import csv
import math
import numpy as np
import pandas as pd
import spacy

import torch
import torchtext
import torchtext.vocab
from torchtext import datasets
from torchtext.vocab import GloVe

In [2]:
df_train = pd.read_csv('data/processed_train.csv')
print(df_train.shape)
df_train.head()

(1024000, 2)


Unnamed: 0,target,text
0,1,oop wrong url thing work mind brain foggi life
1,1,yes fantast
2,1,pretttyyi pleaseee tweet mileycsupport reallll...
3,0,yep heard everi sad song twitter safe say far ...
4,1,oh got blush like littl girl


## Pre-trained word embeddings
- word2Vec
- Glove

## Build vocab

https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/C%20-%20Loading%2C%20Saving%20and%20Freezing%20Embeddings.ipynb

In [3]:
TEXT = torchtext.data.Field(tokenize= 'spacy',
                            tokenizer_language='en_core_web_sm',
                            lower=True)
LABEL = torchtext.data.LabelField(dtype=torch.float)

datafields = [('Sentiment', LABEL), ('SentimentText', TEXT)]

train, val, test = torchtext.data.TabularDataset.splits(path='data/',
                                                  train='processed_train.csv',
                                                  validation='processed_val.csv',
                                                  test='processed_test.csv',
                                                  format='csv',
                                                  skip_header=True,
                                                  fields=datafields)

## Use pretrained embeddings glove

In [4]:
MAX_VOCAB_SIZE=10000
TEXT.build_vocab(train, vectors=GloVe(name="6B", dim=100),
                 max_size=10000, min_freq=10)
LABEL.build_vocab(train)

print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

print(TEXT.vocab.freqs.most_common(20))

print(TEXT.vocab.itos[:10])

print(LABEL.vocab.stoi)

Unique tokens in TEXT vocabulary: 10002
Unique tokens in LABEL vocabulary: 2
[('go', 88779), ('get', 70792), ('day', 67839), ('good', 59384), ('work', 56051), ('like', 53493), ('love', 52734), ('quot', 47034), ('got', 45559), ('today', 43640), ('time', 42450), ('nt', 39689), ('lol', 38152), ('thank', 38138), ('back', 36807), ('want', 36771), ('one', 36690), ('i', 36474), ('miss', 36311), ('u', 35543)]
['<unk>', '<pad>', 'go', 'get', 'day', 'good', 'work', 'like', 'love', 'quot']
defaultdict(None, {'0': 0, '1': 1})


## Create own vocab from our train dataset tweets

In [6]:
MAX_VOCAB_SIZE = 25000

TEXT.build_vocab(train, max_size=MAX_VOCAB_SIZE, min_freq=10)
LABEL.build_vocab(train)

print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

print(TEXT.vocab.freqs.most_common(20))

print(TEXT.vocab.itos[:10])

print(LABEL.vocab.stoi)

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 1
[('go', 6658), ('work', 5206), ('get', 4904), ('day', 4589), ('miss', 3484), ('nt', 3416), ('like', 3383), ('want', 3272), ('today', 3248), ('back', 3125), ('got', 3075), ('feel', 2861), ('time', 2672), ('i', 2556), ('good', 2452), ('realli', 2422), ('m', 2368), ('still', 2367), ('wish', 2268), ('sad', 2267)]
['<unk>', '<pad>', 'go', 'work', 'get', 'day', 'miss', 'nt', 'like', 'want']
defaultdict(None, {'0': 0})


In [13]:
def get_vector(embeddings, word):
    
    return embeddings.vectors(embeddings.stoi[word])


def closest(embeddings, vector, n=6):
    disntances = []
    for neighbor in embeddings.itos:
        distances.append(neighbor, torch.dist(vector, get_vactor(embeddings, neighbor)))
    
    return sorted(distances, key=lambda x: x[1])[:n]


def analogy(embeddings, w1, w2, w3, n=6):
    
    closest_words = closest(embeddings,
                           get_vector(embeddings, w2) \
                            - get_vector(embeddings, w1) \
                            + get_vector(embeddings, w3),
                           n + 3)
    closest_words = [x for x in closest_words if x[0] not in [w1, w2, w3]][:n]
    
    return closest_words


"""
glove = vocab.Glove(name='6B', dim=100)


closest(glove, get_vector(glove, 'paper'))

analogy(glove, 'moon', 'night', 'sun')


"""

"\nglove = vocab.Glove(name='6B', dim=100)\n\n\nclosest(glove, get_vector(glove, 'paper'))\n\nanalogy(glove, 'moon', 'night', 'sun')\n\nown_embeddings = TEXT.vocab\n\nclosest(own_embeddings, get_vector(own_embeddings, 'paper'))\n\nanalogy(own_embeddings, 'moon', 'night', 'sun')\n\n"

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
# minimise badding for each sentence
train_iterator, val_iterator, test_iterator = torchtext.data.BucketIterator.splits(
                                (train, val, test),
                                batch_size=64,
                                sort_key=lambda x: len(x.SentimentText),
                                sort_within_batch=False,
                                device=device)

cuda


## RNN model

In [6]:
import torch.nn as nn

class GRU(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim,
                output_dim, n_layers, bidirectional, dropout, drop_out_dense=0.5):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, 
                          hidden_dim,
                         num_layers=n_layers,
                         bidirectional=bidirectional,
                         dropout=dropout)
        self.fc = nn.Linear(hidden_dim*2, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(drop_out_dense)
    
    def forward(self, text):
        embedded_text = self.dropout(self.embedding(text))
        output, hidden = self.gru(embedded_text)
        hidden = self.dropout2(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        x = self.fc(hidden.squeeze(0))
        return x

In [24]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
n_layers = 2

model = GRU(vocab_size=INPUT_DIM, 
            embedding_dim=EMBEDDING_DIM, 
            hidden_dim=HIDDEN_DIM, 
            output_dim=OUTPUT_DIM, 
            n_layers=n_layers,
            bidirectional=True,
            dropout=0.1)
print(model)

GRU(
  (embedding): Embedding(10002, 100)
  (gru): GRU(100, 256, num_layers=2, dropout=0.1, bidirectional=True)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (dropout2): Dropout(p=0.5, inplace=False)
)


In [25]:
# Use pretrained embeddings

#pretrained_embeddings = TEXT.vocab.vectors
#model.embedding.weight.data.copy_(pretrained_embeddings)

unk_idx = TEXT.vocab.stoi[TEXT.unk_token]
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]

model.embedding.weight.data[unk_idx] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[pad_idx] = torch.zeros(EMBEDDING_DIM)


In [26]:
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()
model = model.to(device)
criterion = criterion.to(device)

In [27]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [28]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
                
        predictions = model(batch.SentimentText).squeeze(1)
        
        loss = criterion(predictions, batch.Sentiment)
        
        acc = binary_accuracy(predictions, batch.Sentiment)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return model, epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:
            # print(batch.SentimentText)
            if batch.SentimentText.nelement() > 0:
                predictions = model(batch.SentimentText).squeeze(1)

                loss = criterion(predictions, batch.Sentiment)

                acc = binary_accuracy(predictions, batch.Sentiment)

                epoch_loss += loss.item()
                epoch_acc += acc.item()
            #else:
                #print(f"Found a non-empty Tensorlist {batch.SentimentText}")
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [29]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [30]:
N_EPOCHS = 10

best_valid_loss = float('inf')

#freeze embeddings
model.embedding.weight.requires_grad = False

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    model, train_loss, train_acc  = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, val_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'sent_model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 16m 25s
	Train Loss: 0.687 | Train Acc: 54.74%
	 Val. Loss: 0.676 |  Val. Acc: 57.75%
Epoch: 02 | Epoch Time: 16m 31s
	Train Loss: 0.673 | Train Acc: 58.25%
	 Val. Loss: 0.661 |  Val. Acc: 59.43%
Epoch: 03 | Epoch Time: 16m 34s
	Train Loss: 0.663 | Train Acc: 59.77%
	 Val. Loss: 0.652 |  Val. Acc: 60.82%
Epoch: 04 | Epoch Time: 16m 36s
	Train Loss: 0.655 | Train Acc: 60.96%
	 Val. Loss: 0.644 |  Val. Acc: 61.89%
Epoch: 05 | Epoch Time: 16m 31s
	Train Loss: 0.646 | Train Acc: 62.25%
	 Val. Loss: 0.633 |  Val. Acc: 63.44%
Epoch: 06 | Epoch Time: 16m 32s
	Train Loss: 0.630 | Train Acc: 64.36%
	 Val. Loss: 0.612 |  Val. Acc: 65.74%
Epoch: 07 | Epoch Time: 16m 30s
	Train Loss: 0.602 | Train Acc: 67.29%
	 Val. Loss: 0.582 |  Val. Acc: 68.57%
Epoch: 08 | Epoch Time: 16m 31s
	Train Loss: 0.578 | Train Acc: 69.65%
	 Val. Loss: 0.564 |  Val. Acc: 70.53%
Epoch: 09 | Epoch Time: 16m 30s
	Train Loss: 0.564 | Train Acc: 70.85%
	 Val. Loss: 0.556 |  Val. Acc: 71.30%
Epoch: 10 

In [31]:
model.load_state_dict(torch.load('sent_model.pt'))
print(model)

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

GRU(
  (embedding): Embedding(10002, 100)
  (gru): GRU(100, 256, num_layers=2, dropout=0.1, bidirectional=True)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (dropout2): Dropout(p=0.5, inplace=False)
)
Test Loss: 0.551 | Test Acc: 72.20%


In [32]:
nlp = spacy.load('en_core_web_sm')

def predict_sentiment(model, sentence):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    prediction = torch.sigmoid(model(tensor))
    return prediction.item()

In [33]:
predict_sentiment(model, "This film is great")

0.8100703954696655

## Confusion matrix

In [53]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    n = len(y)
    return correct, n

def evaluate_cm(model, iterator, criterion):
    
    true_pos = 0
    true_neg = 0
    false_pos = 0
    false_neg = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            # print(batch.SentimentText)
            if batch.SentimentText.nelement() > 0:
                predictions = model(batch.SentimentText).squeeze(1)

                acc = binary_accuracy(predictions, batch.Sentiment)

                epoch_loss += loss.item()
                epoch_acc += acc.item()
            else:
                print(f"Found a non-empty Tensorlist {batch.SentimentText}")
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)



In [45]:
def decode_sentiment(score, include_neutral=True):
    if include_neutral:        
        label = NEUTRAL
        if score <= SENTIMENT_THRESHOLDS[0]:
            label = NEGATIVE
        elif score >= SENTIMENT_THRESHOLDS[1]:
            label = POSITIVE

        return label
    else:
        return NEGATIVE if score < 0.5 else POSITIVE

In [52]:
df_test = pd.read_csv('data/processed_test.csv')

y_pred_1d = []
y_test_1d = list(df_test.target)
scores = [predict_sentiment(model, sentence) for sentence in df_test.text]


y_pred_1d = [decode_sentiment(score, include_neutral=False) for score in scores]

TypeError: Argument 'string' has incorrect type (expected str, got float)

In [44]:
import matplotlib.pyplot as plt
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """

    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=30)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90, fontsize=22)
    plt.yticks(tick_marks, classes, fontsize=22)

    fmt = '.2f'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label', fontsize=25)
    plt.xlabel('Predicted label', fontsize=25)

In [None]:
%%time

cnf_matrix = confusion_matrix(y_test_1d, y_pred_1d)
plt.figure(figsize=(12,12))
plot_confusion_matrix(cnf_matrix, classes=df_train.target.unique(), title="Confusion matrix")
plt.show()