In [1]:
import sys
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchtext
import spacy
import regex as re
import time
from tqdm import tqdm
from torchtext import vocab

import data

In [2]:
# Constants - Add here as you wish
N_EPOCHS = 5
EMBEDDING_DIM = 200

TRAIN_FILE = '../data/sent140.train.mini.csv'
DEV_FILE   = '../data/sent140.dev.csv'
TEST_FILE  = '../data/sent140.test.csv'

TRAIN_BS = 32
DEV_BS   = 32
TEST_BS  = 32
LR = 0.001

In [3]:
# Auxilary functions for data preparation
tok = spacy.load('en_core_web_sm',disable=['parser', 'tagger', 'ner'])
def tokenizer(s): 
    return [w.text.lower() for w in tok(tweet_clean(s))]

def tweet_clean(text):
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text) # remove non alphanumeric character
    text = re.sub(r'https?:/\/\S+', ' ', text) # remove links
    return text.strip()

In [4]:
# Evaluation functions
def evaluate(model, loader, criterion):    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval() # set model for evaluation
    with torch.no_grad():
        for batch in loader:
            #WRITE CODE HERE
            inputs, labels = batch['inputs'], batch['labels'] # labels=torch.Size([32])
            outputs = model(inputs) #torch.Size([1, 32, 2])
            outputs=torch.squeeze(outputs) # remove dim=1
            loss = criterion(outputs, labels)
            epoch_loss += loss.item()
            pass

    return epoch_loss / len(loader), epoch_acc / len(loader)

In [5]:
# Utility
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [6]:
# Recurrent Network
class RNN(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        #WRITE CODE HERE
        
        # Embedding layer:
        self.emb = nn.Embedding.from_pretrained(glove_embeddings)
        
        # Recurrent layer:
        self.rnn = torch.nn.RNN(embedding_dim, hidden_dim) 
        
        # LSTM:
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        
        # Fully connected layer:
        self.fc = nn.Linear(hidden_dim, output_dim)


    def forward(self, inputs, lengths=None):
        #WRITE CODE HERE
        inputs = self.emb(torch.tensor(inputs))
        output, hidden = self.rnn(inputs)
        assert torch.equal(output[-1,:,:], hidden.squeeze(0))
        out = self.fc(hidden)
        
        return out 

In [7]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cpu


In [8]:
# Preparation dataset

train_loader, dev_loader, test_loader, glove_embeddings = data.get_dataset(
                tokenizer,
                TRAIN_FILE,
                DEV_FILE,
                TEST_FILE,
                TRAIN_BS,
                DEV_BS,
                TEST_BS,
                EMBEDDING_DIM)

Loading ../data/sent140.train.mini.csv
Loading ../data/sent140.dev.csv
Loading ../data/sent140.test.csv
Getting datasets
Getting loaders


In [9]:
# Initialize model, optimizer and loss function
# hidden_dim = number of features in RNN layer
# output_dim = number of classes = 2 (Negative vs. Positive)

model = RNN(embedding_dim=EMBEDDING_DIM, hidden_dim=2, output_dim=2)

optimizer = torch.optim.Adam(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss() # Takes logits as input (raw network output)

In [10]:
model

RNN(
  (emb): Embedding(1193515, 200)
  (rnn): RNN(200, 2)
  (fc): Linear(in_features=2, out_features=2, bias=True)
)

In [11]:
# --- Train Loop ---

print('Training')
for epoch in range(N_EPOCHS):
    print(f'Epoch {epoch}')
    start_time = time.time()
    epoch_loss = 0
    epoch_acc = 0
    correct = 0  
    
    for batch in train_loader:
        #WRITE CODE HERE
        inputs, labels = batch['inputs'], batch['labels'] # labels=torch.Size([32])
        optimizer.zero_grad()
        outputs = model(inputs) #torch.Size([1, 32, 2])
        outputs=torch.squeeze(outputs) # remove dim=1
        
        """
        Tried to implement accuracy calculation method, but got some errors about tensor sizes:
        
        probabilities = F.softmax(outputs)
        predictions = probs.argmax(dim=1)
        epoch_acc += torch.sum(predictions == labels).item()
        """
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        pass
    
    
    
    train_loss, train_acc = (epoch_loss / len(train_loader), epoch_acc / len(train_loader)) 
    valid_loss, valid_acc = evaluate(model, dev_loader, criterion)
            
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
            
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
    

Training
Epoch 0


  inputs = self.emb(torch.tensor(inputs))


Epoch: 01 | Epoch Time: 2m 19s
	Train Loss: 0.709 | Train Acc: 0.00%
	 Val. Loss: 0.697 |  Val. Acc: 0.00%
Epoch 1
Epoch: 02 | Epoch Time: 2m 10s
	Train Loss: 0.696 | Train Acc: 0.00%
	 Val. Loss: 0.696 |  Val. Acc: 0.00%
Epoch 2
Epoch: 03 | Epoch Time: 2m 12s
	Train Loss: 0.695 | Train Acc: 0.00%
	 Val. Loss: 0.696 |  Val. Acc: 0.00%
Epoch 3
Epoch: 04 | Epoch Time: 2m 10s
	Train Loss: 0.694 | Train Acc: 0.00%
	 Val. Loss: 0.695 |  Val. Acc: 0.00%
Epoch 4
Epoch: 05 | Epoch Time: 2m 14s
	Train Loss: 0.694 | Train Acc: 0.00%
	 Val. Loss: 0.695 |  Val. Acc: 0.00%


In [13]:
# --- Test model ---
start_time = time.time()
test_loss, test_acc = evaluate(model, test_loader, criterion)
print(f'\tTest Loss: ', test_loss)
end_time = time.time()
print('Computing time (s): ', end_time - start_time)

  inputs = self.emb(torch.tensor(inputs))


	Test Loss:  0.6952377961466487
Computing time (s):  66.46612405776978
