In [1]:
# -*- coding: utf-8 -*-
"""
   Introduction to Deep Learning
   Assignment 3: Sentiment Classification of Tweets on a Recurrent Neural Network using Pretrained Embeddings

   Hande Celikkanat

   Credit: Data preparation pipeline adopted from https://medium.com/@sonicboom8/sentiment-analysis-torchtext-55fb57b1fab8
"""

'\n   Introduction to Deep Learning\n   Assignment 3: Sentiment Classification of Tweets on a Recurrent Neural Network using Pretrained Embeddings\n\n   Hande Celikkanat\n\n   Credit: Data preparation pipeline adopted from https://medium.com/@sonicboom8/sentiment-analysis-torchtext-55fb57b1fab8\n'

In [2]:
import sys
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchtext
import spacy
import regex as re
from torchtext.legacy import vocab
from torchtext.legacy import data
import time
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

In [3]:
# Constants - Add here as you wish
BATCH_SIZE = 50
N_EPOCHS = 50
EMBEDDING_DIM = 200
OUTPUT_DIM = 2

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [5]:
# Auxilary functions for data preparation
tok = spacy.load('en_core_web_sm',disable=['parser', 'tagger', 'ner'])
def tokenizer(s):
    return [w.text.lower() for w in tok(tweet_clean(s))]

def tweet_clean(text):
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text) # remove non alphanumeric character
    text = re.sub(r'https?:/\/\S+', ' ', text) # remove links
    return text.strip()

In [6]:
# Evaluation functions
def get_accuracy(output, gold):
    predictions = (output >= 0.5).type(torch.uint8)
    correct = torch.sum(torch.eq(predictions, gold)).item()
    acc = correct / gold.shape[0]
    return acc


def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.eval()
    with torch.no_grad():
        for batch in iterator:
            
            texts, text_lengths = batch.TweetText
            texts_T = texts.transpose(0,1)
            predictions = model(texts_T, text_lengths).squeeze(1)
            labels = batch.Label
            loss = criterion(predictions.float(), labels.float())

            acc = get_accuracy(predictions, labels)
            epoch_loss += loss.item()
            epoch_acc += acc

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [7]:
# Utility
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [8]:
class RNN(nn.Module):
    def __init__(self, embedding_length, embedding_dim, hidden_size, num_hidden_layers):
        super().__init__()
        self.embedding_length = embedding_length
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.embedding = nn.Embedding(num_embeddings=embedding_length, embedding_dim=EMBEDDING_DIM)
        self.rnn = nn.RNN(
            input_size=embedding_dim, 
            hidden_size=hidden_size, 
            num_layers=num_hidden_layers, 
            bidirectional=True
        )
        self.fc1 = nn.Linear(2 * hidden_size, 256)
        self.fc2 = nn.Linear(256, 100)
        self.fc3 = nn.Linear(100,1)
        self.sigmoid = nn.Sigmoid()
 
    def forward(self, texts, text_lengths: torchtext.legacy.data.Field):
        x = self.embedding(texts)
        packed_embedded = pack_padded_sequence(x, text_lengths.cpu(), batch_first=True)
        packed_output, _ = self.rnn(packed_embedded)
        output, output_lengths = pad_packed_sequence(packed_output, batch_first=True)
        
        out_forward = output[range(len(output)), text_lengths - 1, : self.hidden_size]
        out_reverse = output[:, 0, self.hidden_size : ]
        out_reduced = torch.cat((out_forward, out_reverse), 1)
        
        # output = torch.squeeze(h_t)
        out = self.fc1(out_reduced)
        out = self.fc2(out)
        out = self.fc3(out)
        out = self.sigmoid(out)
        return out
        

In [9]:
# --- Data Preparation ---

# define the columns that we want to process and how to process
txt_field = torchtext.legacy.data.Field(sequential=True,
                                 tokenize=tokenizer,
                                 include_lengths=True,
                                 use_vocab=True)
label_field = torchtext.legacy.data.Field(sequential=False,
                                   use_vocab=False)

csv_fields = [
    ('Label', label_field), # process this field as the class label
    ('TweetID', None), # we dont need this field
    ('Timestamp', None), # we dont need this field
    ('Flag', None), # we dont need this field
    ('UseerID', None), # we dont need this field
    ('TweetText', txt_field) # process it as text field
]

train_data, dev_data, test_data = torchtext.legacy.data.TabularDataset.splits(path='../data',
                                                                       format='csv',
                                                                       train='sent140.train.mini.csv',
                                                                       validation='sent140.dev.csv',
                                                                       test='sent140.test.csv',
                                                                       fields=csv_fields,
                                                                       skip_header=False)


txt_field.build_vocab(
    train_data, 
    dev_data,
    max_size=100000, 
    vectors='glove.twitter.27B.200d', 
    unk_init = torch.Tensor.normal_
)

label_field.build_vocab(train_data)



In [10]:
train_iter, dev_iter, test_iter = torchtext.legacy.data.BucketIterator.splits(datasets=(train_data, dev_data, test_data),
                                            batch_sizes=(BATCH_SIZE, BATCH_SIZE, BATCH_SIZE),  # batch sizes of train, dev, test
                                            sort_key=lambda x: len(x.TweetText), # how to sort text
                                            device=device,
                                            sort_within_batch=True,
                                            repeat=False)

In [11]:
# --- Model, Loss, Optimizer Initialization ---

PAD_IDX = txt_field.vocab.stoi[txt_field.pad_token]
UNK_IDX = txt_field.vocab.stoi[txt_field.unk_token]

# WRITE CODE HERE
HIDDEN_SIZE = 50
NUM_LAYERS_HIDDEN = 8

model = RNN(
    embedding_length=len(txt_field.vocab), 
    embedding_dim=EMBEDDING_DIM, 
    hidden_size=HIDDEN_SIZE, 
    num_hidden_layers=NUM_LAYERS_HIDDEN
)

# Copy the pretrained embeddings into the model
pretrained_embeddings = txt_field.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

# Fix the <UNK> and <PAD> tokens in the embedding layer
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

# WRITE CODE HERE
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
criterion = torch.nn.BCELoss()

model = model.to(device)
criterion = criterion.to(device)

In [12]:
# --- Train Loop ---
for epoch in range(N_EPOCHS):
    start_time = time.time()
    epoch_loss = 0
    epoch_acc = 0

    model.train()
    
    for batch in train_iter:
        optimizer.zero_grad()
        texts, text_lengths = batch.TweetText
        # Switch batch_size to first
        texts_T = texts.transpose(0,1)
        predictions = model(texts_T, text_lengths).squeeze(1)
        
        labels = batch.Label
        loss = criterion(predictions.float(), labels.float())
        loss.backward()
        
        optimizer.step()
        epoch_loss += loss.detach().cpu()
        epoch_acc += get_accuracy(predictions, labels)
    

    train_loss, train_acc = (epoch_loss / len(train_iter), epoch_acc / len(train_iter))
    valid_loss, valid_acc = evaluate(model, dev_iter, criterion)

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 3s
	Train Loss: 0.618 | Train Acc: 64.61%
	 Val. Loss: 0.543 |  Val. Acc: 72.26%
Epoch: 02 | Epoch Time: 0m 3s
	Train Loss: 0.526 | Train Acc: 73.90%
	 Val. Loss: 0.531 |  Val. Acc: 73.35%
Epoch: 03 | Epoch Time: 0m 3s
	Train Loss: 0.479 | Train Acc: 77.15%
	 Val. Loss: 0.508 |  Val. Acc: 75.39%
Epoch: 04 | Epoch Time: 0m 3s
	Train Loss: 0.446 | Train Acc: 79.26%
	 Val. Loss: 0.499 |  Val. Acc: 75.49%
Epoch: 05 | Epoch Time: 0m 3s
	Train Loss: 0.425 | Train Acc: 80.76%
	 Val. Loss: 0.497 |  Val. Acc: 76.03%
Epoch: 06 | Epoch Time: 0m 3s
	Train Loss: 0.388 | Train Acc: 83.22%
	 Val. Loss: 0.512 |  Val. Acc: 75.69%
Epoch: 07 | Epoch Time: 0m 3s
	Train Loss: 0.363 | Train Acc: 84.77%
	 Val. Loss: 0.527 |  Val. Acc: 75.31%
Epoch: 08 | Epoch Time: 0m 3s
	Train Loss: 0.334 | Train Acc: 86.30%
	 Val. Loss: 0.550 |  Val. Acc: 74.58%
Epoch: 09 | Epoch Time: 0m 3s
	Train Loss: 0.305 | Train Acc: 87.53%
	 Val. Loss: 0.583 |  Val. Acc: 74.73%
Epoch: 10 | Epoch Time: 0m 3