In [1]:
# -*- coding: utf-8 -*-
"""
   Introduction to Deep Learning (LDA-T3114)
   Skeleton code for Assignment 2: Sentiment Classification on a Feed-Forward Neural Network using Pretrained Embeddings
   Feel free to change this code according to your design!
   Remember to use PyTorch for your NN implementation.

   Hande Celikkanat & Miikka Silfverberg
"""

'\n   Introduction to Deep Learning (LDA-T3114)\n   Skeleton code for Assignment 2: Sentiment Classification on a Feed-Forward Neural Network using Pretrained Embeddings\n   Feel free to change this code according to your design!\n   Remember to use PyTorch for your NN implementation.\n\n   Hande Celikkanat & Miikka Silfverberg\n'

In [84]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import gensim
import os

In [85]:
# Add the path to these data manipulation files if necessary:
# import sys
# sys.path.append('</PATH/TO/DATA/MANIP/FILES>')
from data_semeval import *
from paths import data_dir, model_dir

In [86]:
# name of the embeddings file to use
# Alternatively, you can also use the text file GoogleNews-pruned2tweets.txt (from Moodle),
# or the full set, wiz. GoogleNews-vectors-negative300.bin (from https://code.google.com/archive/p/word2vec/) 
embeddings_file = 'GoogleNews-pruned2tweets.bin'

In [267]:
#--- hyperparameters ---

# Feel free to experiment with different hyperparameters to see how they compare! 
# You can turn in your assignment with the best settings you find.

n_classes = len(LABEL_INDICES)
n_epochs = 30 
learning_rate = 0.001
report_every = 1
verbose = False

In [88]:
#--- auxilary functions ---

# To convert string label to pytorch format:
def label_to_idx(label):
    return torch.LongTensor([LABEL_INDICES[label]])

In [263]:
#--- model ---

class FFNN(nn.Module):
    # Feel free to add whichever arguments you like here.
    # Note that pretrained_embeds is a numpy matrix of shape (num_embeddings, embedding_dim)
    def __init__(self, pretrained_embeds, n_classes):
        super(FFNN, self).__init__()
        self.embedding = nn.Embedding(len(pretrained_embeds),len(pretrained_embeds[0]))
        self.linear1 = nn.Linear(len(pretrained_embeds[0]),200)
        self.linear2 = nn.Linear(200, n_classes)
        

        

    def forward(self, x):
        # WRITE CODE HERE
        out = F.relu(self.linear1(x).view(1,-1))
        out = F.relu(self.linear2(out))
       
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

       
        

In [219]:
#--- data loading ---
data = read_semeval_datasets(data_dir)
gensim_embeds = gensim.models.KeyedVectors.load_word2vec_format(os.path.join(model_dir, embeddings_file), binary=True)
pretrained_embeds = gensim_embeds.vectors
# To convert words in the input tweet to indices of the embeddings matrix:
word_to_idx = {word: i for i, word in enumerate(gensim_embeds.vocab.keys())}

In [268]:
#--- set up ---
# WRITE CODE HERE

model = FFNN(pretrained_embeds, n_classes)
loss_function = torch.nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

In [221]:
def indices_to_vectors(indices,pretrained_embeds):
    word_vectors = pretrained_embeds[indices]
    return word_vectors

def average_vectors(word_vectors):
    sumvector = np.ones((1,300))
    distributor = word_vectors.shape[0]
    for pretrained_vector in word_vectors:
        sumvector += pretrained_vector
    cbow_vector = sumvector
    return cbow_vector

In [222]:
def tweet_to_CBOW(data,pretrained_embeds,word_to_idx):
    for split in ["training","test.gold"]:#,"development.input","development.gold","test.input","test.gold]:
        for tweet in data[split]:
            text = tweet["BODY"]
            indices = [word_to_idx[w] for w in text if w in word_to_idx]
            vectors = indices_to_vectors(indices,pretrained_embeds)
            tweet["CBOW"] = average_vectors(vectors)

tweet_to_CBOW(data,pretrained_embeds,word_to_idx)

In [269]:
#--- training ---
for epoch in range(n_epochs):
    total_loss = 0
    for tweet in data['training']:  
        gold_class = label_to_idx(tweet['SENTIMENT']) #real class
        gold_class = torch.tensor(gold_class,dtype=torch.long)

        # WRITE CODE HERE
        
        text = tweet["CBOW"]
        text = torch.tensor(text,dtype=torch.float)
        optimizer.zero_grad()
        log_probs = model(text) #predict
        loss = loss_function(log_probs, gold_class) #count loss
        loss.backward()
        optimizer.step()

        total_loss += loss

        
    if ((epoch+1) % report_every) == 0:
        print('epoch: %d, loss: %.4f' % (epoch, total_loss*100/len(data['training'])))
    
# Feel free to use the development data to tune hyperparameters if you like!

  


epoch: 0, loss: 108.7544
epoch: 1, loss: 87.8302
epoch: 2, loss: 81.6534
epoch: 3, loss: 79.7272
epoch: 4, loss: 78.1084
epoch: 5, loss: 76.3368
epoch: 6, loss: 74.5814
epoch: 7, loss: 73.8837
epoch: 8, loss: 72.8829
epoch: 9, loss: 73.1382
epoch: 10, loss: 70.8475
epoch: 11, loss: 70.2245
epoch: 12, loss: 70.4095
epoch: 13, loss: 68.6214
epoch: 14, loss: 68.2592
epoch: 15, loss: 67.9911
epoch: 16, loss: 65.9797
epoch: 17, loss: 65.7192
epoch: 18, loss: 65.6712
epoch: 19, loss: 63.7827
epoch: 20, loss: 64.5290
epoch: 21, loss: 61.8365
epoch: 22, loss: 61.2123
epoch: 23, loss: 60.4831
epoch: 24, loss: 60.2740
epoch: 25, loss: 59.4426
epoch: 26, loss: 58.9275
epoch: 27, loss: 58.7758
epoch: 28, loss: 56.8457
epoch: 29, loss: 56.9683


In [270]:
correct = 0
with torch.no_grad():
    for tweet in data['test.gold']:
        gold_class = label_to_idx(tweet['SENTIMENT'])
        # WRITE CODE HERE
        cbow = torch.tensor(tweet["CBOW"], dtype=torch.float)
        log_probs = model(cbow)
        _,predicted = torch.max(log_probs.data,1)
        
        correct += torch.eq(predicted,gold_class).item()

        
        if verbose:
            print('TEST DATA: %s, OUTPUT: %s, GOLD LABEL: %d' % (tweet['BODY'], tweet['SENTIMENT'], predicted))
        
    print('test accuracy: %.2f' % (100.0 * correct / len(data['test.gold'])))

test accuracy: 62.72


## I played around with different parameters. I got a result around 65% and the most of the times in my experiments the result got lower. 