## Irony Detection in English Tweets with Neural Network ##

In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import demoji
#demoji.download_codes()
from nltk.tokenize import TweetTokenizer
from argparse import Namespace
from collections import Counter
import json
import os
import string
import re

### Task A: Binary Classfication of tweets as Ironic/Non-Ironic ###

#### Data Preparation ####

In [None]:
'''
1) Replace emojis with :text:
2) Replace @user
3) Replace url
'''

In [2]:
taskA_train_file = "./data/taskA/train_emoji.txt"
taskA_test_file = "./data/taskA/test_emoji.txt"

In [29]:
with open(taskA_train_file, 'r') as f:
    train_lines = [line.split('\t') for line in f.readlines()][1:]
assert len(train_lines) == 3834
with open(taskA_test_file, 'r') as f:
    test_lines = [line.split('\t') for line in f.readlines()][1:]
assert len(test_lines) == 784

In [33]:
taskA_train = pd.DataFrame(train_lines, columns = ['Tweet index', 'label', 'Tweet text'])
taskA_test = pd.DataFrame(test_lines, columns = ['Tweet index', 'label', 'Tweet text'])

In [34]:
# display full cell of dataframe
pd.set_option('display.max_colwidth', -1)

In [35]:
taskA_train.head(1)

Unnamed: 0,Tweet index,label,Tweet text
0,1,1,Sweet United Nations video. Just in time for Christmas. #imagine #NoReligion http://t.co/fej2v3OUBR\n


In [36]:
taskA_test.head(1)

Unnamed: 0,Tweet index,label,Tweet text
0,1,0,@Callisto1947 Can U Help?||More conservatives needed on #TSU + get paid 4 posting stuff like this!||YOU $ can go to http://t.co/JUmMWi0AyT\n


In [37]:
# tokenize
tokenizer = TweetTokenizer()
# test
tweet = "I asked God to protect me from my enemies .. shortly after I started losing friends 😳💯  or #naah"
print(tokenizer.tokenize(tweet))

['I', 'asked', 'God', 'to', 'protect', 'me', 'from', 'my', 'enemies', '..', 'shortly', 'after', 'I', 'started', 'losing', 'friends', '😳', '💯', 'or', '#naah']


In [38]:
# get full vocab for training data
vocab = set(tokenizer.tokenize(' '.join(taskA_train['Tweet text'])))
# sort vocab
vocab = sorted(vocab)
print("Vocab size: ", len(vocab))
#print(vocab)

Vocab size:  15756


In [39]:
# inspect vocabulary
# urls, hashtag, usertag, number, punctuation
urls = [token for token in vocab if 'http' in token]
print("Number of urls: {}".format(len(urls)))
vocab_tmp = list(set(vocab) - set(urls))
# hashtags
hashtags = [token for token in vocab_tmp if '#' in token]
print("Number of hashtags: {}".format(len(hashtags)))
vocab_tmp = list(set(vocab_tmp) - set(hashtags))
# emojis
emojis = demoji.findall(' '.join(vocab_tmp))
print("Number of emojis: {}".format(len(emojis)))
#print(emojis)
vocab_tmp = list(set(vocab_tmp) - set(emojis))
usertag = [token for token in vocab_tmp if '@' in token]
print("Number of usertag: {}".format(len(usertag)))
vocab_tmp = list(set(vocab_tmp) - set(usertag))
# numbers 
alphanumeric = vocab_tmp
print("Number of alphanumeric and punctuations: {}".format(len(alphanumeric)))

Number of urls: 917
Number of hashtags: 2930
Number of emojis: 158
Number of usertag: 1987
Number of alphanumeric and punctuations: 9764


In [40]:
'''
All urls will take the [URL] token
All usertag will take the [USER] token
All emojis will be translated to text surrounded by :
    examples, 💯 will be :hundred points:
alphanumeric and puntuations will be left as they are
'''
# preprocess function that make the above adjustments to tweet text
def preprocess(text):
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)
    emojis = demoji.findall(text)
    cleaned = []
    for token in tokens:
        if 'http' in token:
            cleaned.append('[URL]')
        elif '@' in token:
            cleaned.append('[USER]')
        elif token in emojis:
            cleaned.append(':' + ''.join(emojis[token].split()) + ':')
        else:
            cleaned.append(token.lower())
    return ' '.join(cleaned)

In [41]:
cleaned_train = taskA_train['Tweet text'].map(preprocess)
cleaned_test = taskA_test['Tweet text'].map(preprocess)

In [42]:
taskA_train['Tweet text'] = cleaned_train
taskA_test['Tweet text'] = cleaned_test
taskA_train.to_csv('./preprocessed/taskA/train.csv', header = True, index = False)
taskA_test.to_csv('./preprocessed/taskA/test.csv', header = True, index = False)


In [43]:
# write vocab file
vocab = set(tokenizer.tokenize(' '.join(taskA_train['Tweet text'])))
# sort vocab
vocab = sorted(vocab)
print("Vocab size: ", len(vocab))
#print(vocab)
with open('./preprocessed/taskA/vocab.txt', 'w') as f:
    for token in vocab:
        f.write(token + '\n')

Vocab size:  11155


In [44]:
# Replace Emojis with text #
# test
#tweet = "@TargetZonePT 😡 no he bloody isn't I was upstairs getting changed !"
tweet = "I asked God to protect me from my enemies .. shortly after I started losing friends 😳💯  or #naah"
demoji.findall(tweet)

{'😳': 'flushed face', '💯': 'hundred points'}

#### Model: Classifying Tweets as Ironic/Non-Ironic with a Neural Network ####

In [25]:
'''
Reference: 
Natural Language Processing with PyTorch
Delip Rao & Brian McMahan
'''
df = pd.read_csv("./preprocessed/taskA/train.csv")
df.columns

Index(['Tweet index', 'label', 'Tweet text'], dtype='object')

In [58]:
def get_train_val_split(infile, seed = 1):
    folds = {}
    random.seed(seed)
    # create a dictionary partition
    # partition['train']: list of training IDs
    # partition['validation']: list of validation IDs
    length = len(pd.read_csv(infile))
    indices = list(range(length))
    #random.shuffle(indices)
    one_fold = length // 10
    for i in range(1, 11):
        curr_dict = {}
        curr_dict['validation'] = indices[one_fold * (i - 1) : one_fold * i]
        curr_dict['train'] = list(set(indices) - set(curr_dict['validation']))
        folds[i] = curr_dict
    return folds

In [69]:
vocab = {}
def build_vocab(infile, fold, vocab_size = 10000):
    global vocab
    freq_dict = {}
    df = pd.read_csv(infile)
    df = df.iloc[fold['train'],:]
    # concat all tweets into one string
    tweets_string = ' '.join(df['Tweet text'])
    # remove punctuation, convert to lower case
    processed_tweets_string = re.sub(r'[^\w\s]', ' ', tweets_string).lower()
    # split into tokens
    tokens = processed_tweets_string.split()
    # create count dictionary freq_dict
    for token in tokens:
        if token in freq_dict:
            freq_dict[token] += 1
        else:
            freq_dict[token] = 1
    # sort dictionary in descending freq count
    sorted_freq = sorted(freq_dict.items(), key = lambda x:x[1], reverse = True)
    # take top vocab_size - 1 vocab, accounting for [UNK] token
    pruned_vocab = sorted_freq[:vocab_size - 1]
    # add to vocab
    vocab['[UNK]'] = 0
    for i, token_tuple in enumerate(pruned_vocab):
        vocab[token_tuple[0]] = i + 1
    return

In [72]:
def vectorizer(infile, vocab_size = 10000):
    vectors = {}
    labels = {}
    df = pd.read_csv(infile)
    for index, row in df.iterrows():
        vec = [0] * vocab_size
        tokens = list(set(re.sub(r'[^\w\s]', ' ', row['Tweet text']).lower().split()))
        for token in tokens:
            if token in vocab:
                vec[vocab[token]] = 1
            else:
                vec[0] = 1
        vectors[index] = vec
        labels[index] = int(row['label'])
    return (vectors, labels)

In [80]:
import torch
from torch.utils import data
class Dataset(data.Dataset):
    def __init__(self, list_IDs, labels):
        self.labels = labels
        self.list_IDs = list_IDs
    
    def __len__(self):
        return len(self.list_IDs)
    
    def __getitem__(self, index):
        # generates one sample of data
        ID = self.list_IDs[index]
        X = torch.tensor(vectors[ID])
        y = self.labels[ID]
        return X, y

In [103]:
''' Model '''
class IronyClassifier(nn.Module):
    ''' A 2-layer Multilayer Perceptron for classifying tweets '''
    def __init__(self, input_dim, hidden_dim, output_dim):
        '''
        Args:
            input_dim (int): size of input vector(size of vocab)
            hidden_dim (int): output size of the first linear layer
            output_dim (int): output size of the second linear layer(number of classes)
        '''
        super(IronyClassifier, self).__init__()
        # fully connected layer 1
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
    ''' Forward propogation of the classifier '''    
    def forward(self, x_in, apply_softmax = False):
        '''
        Args: 
            x_in (torch.Tensor): an input data tensor
                x_in.shape should be (batch, input_dim)
            apply_softmax (bool): a flag for softmax activation
                should be false if using Cross Entropy Loss
        Returns:
            result tensor. tensor.shape should be (batch, output_dim)
        '''
        intermediate_vector = F.relu(self.fc1(x_in))
        prediction_vector = self.fc2(intermediate_vector)
        
        if apply_softmax:
            prediction_vector = F.softmax(prediction_vector, dim = 1)
            
        return prediction_vector
    

In [91]:
def compute_accuracy(y_pred, y_target):
    _, y_pred_indices = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

In [44]:
data_file = "./preprocessed/taskA/train.csv"

In [107]:
args = Namespace(
    # Training  hyper parameters
    vocab_size = 10000,
    num_epochs = 5,
    hidden_dim = 300,
    learning_rate = 0.01,
    batch_size = 16,
    seed = 1337,
)

In [108]:
# 10-fold cross validation
# get the indices for train and validation for each fold
global vectors, labels
folds = get_train_val_split(data_file)
fold = 1
partition = folds[fold]
# create vocab
build_vocab(data_file, partition, vocab_size = args.vocab_size)
# create vectors and labels
vectors, labels = vectorizer(data_file, vocab_size = args.vocab_size)
# training loop

training_set = Dataset(partition['train'], labels)
training_generator = data.DataLoader(training_set, batch_size = args.batch_size, shuffle = True)

validation_set = Dataset(partition['validation'], labels)
validation_generator = data.DataLoader(validation_set, batch_size = args.batch_size, shuffle = True)


In [109]:
# training loop
device = 'cpu'
classifier = IronyClassifier(input_dim = args.vocab_size, hidden_dim = args.hidden_dim, output_dim = 2)
classifier = classifier.to(device)
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.parameters(), lr = args.learning_rate)

for epoch in range(args.num_epochs):
    # training
    running_loss = 0.0
    running_acc = 0.0
    classifier = classifier.float()
    classifier.train()
    batch_index = 0
    for local_batch, local_labels in training_generator:
        # clear gradients
        optimizer.zero_grad()
        # compute output
        y_pred = classifier(local_batch.float())
        # compute loss
        loss = loss_func(y_pred, local_labels)
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_index + 1)
        # produce gradients
        loss.backward()
        # backpropogation
        optimizer.step()
        # compute accuracy
        acc_t = compute_accuracy(y_pred, local_labels)
        running_acc += (acc_t - running_acc) / (batch_index + 1)
        batch_index += 1
    print("Epoch {}: ".format(epoch))
    print("  Loss: {}".format(running_loss))
    print("  Accuracy: {}".format(running_acc))

Epoch 0: 
  Loss: 0.6594616194014193
  Accuracy: 60.145728114478096
Epoch 1: 
  Loss: 0.310802695893303
  Accuracy: 86.66351010101009
Epoch 2: 
  Loss: 0.04322139117788606
  Accuracy: 98.81365740740742
Epoch 3: 
  Loss: 0.005862495930513331
  Accuracy: 99.85532407407409
Epoch 4: 
  Loss: 0.001837342329913048
  Accuracy: 99.97106481481475


In [111]:
# evaluate on validation set
running_loss = 0.0
running_acc = 0.0
classifier.eval()
batch_index = 0
for local_batch, local_labels in validation_generator:
    # get prediction
    y_pred =  classifier(local_batch.float())
    loss = loss_func(y_pred, local_labels)
    loss_t = loss.to("cpu").item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)
    # compute accuracy
    acc_t = compute_accuracy(y_pred, local_labels)
    running_acc += (acc_t - running_acc) / (batch_index + 1)
    batch_index += 1
print(running_acc)    

62.96875


In [None]:
''' Hyperparameter Tuning '''
'''
vocab size
number of epochs
size of hidden layer
learning rate
'''