## Irony Detection in English Tweets with Neural Networks ##

In [56]:
import os

import pandas as pd
import demoji
#demoji.download_codes()
from nltk.tokenize import TweetTokenizer

### Task A: Binary Classification of tweets as Ironic/Non-Ironic ###

#### Data Preparation ####

In [None]:
'''
1) Replace each emoji with its text description wrapped in colons (:text:)
2) Replace @user mentions with the [USER] token
3) Replace URLs with the [URL] token
'''

In [163]:
# Locations of the Task A train and test splits (read below as tab-separated files).
taskA_train_file, taskA_test_file = (
    "./data/taskA/train_emoji.txt",
    "./data/taskA/test_emoji.txt",
)

In [164]:
# Load both Task A splits; the files are tab-separated.
taskA_train, taskA_test = (
    pd.read_csv(split_path, sep = '\t')
    for split_path in (taskA_train_file, taskA_test_file)
)

In [165]:
# Display the full contents of every dataframe cell (no truncation of long tweets).
# `None` is the supported "no limit" value; the old `-1` sentinel was deprecated
# in pandas 1.0 and is rejected by later versions.
pd.set_option('display.max_colwidth', None)

In [166]:
# Show the first row of the training frame to check how the columns were parsed.
taskA_train.head(1)

Unnamed: 0,Tweet index,Label,Tweet text
0,1,1,Sweet United Nations video. Just in time for Christmas. #imagine #NoReligion http://t.co/fej2v3OUBR


In [167]:
# Show the first row of the test frame to check how the columns were parsed.
taskA_test.head(1)

Unnamed: 0,Tweet index,Label,Tweet text
0,1,0,@Callisto1947 Can U Help?||More conservatives needed on #TSU + get paid 4 posting stuff like this!||YOU $ can go to http://t.co/JUmMWi0AyT


In [168]:
# Tweet-aware tokenizer, reused by the vocabulary cells below.
tokenizer = TweetTokenizer()

# Smoke test on a sample tweet containing emojis and a hashtag.
tweet = (
    "I asked God to protect me from my enemies .. "
    "shortly after I started losing friends 😳💯  or #naah"
)
sample_tokens = tokenizer.tokenize(tweet)
print(sample_tokens)

['I', 'asked', 'God', 'to', 'protect', 'me', 'from', 'my', 'enemies', '..', 'shortly', 'after', 'I', 'started', 'losing', 'friends', '😳', '💯', 'or', '#naah']


In [169]:
# Build the sorted vocabulary of the training tweets: join every tweet into one
# string, tokenize it once, and keep the distinct tokens.
train_corpus = ' '.join(taskA_train['Tweet text'])
vocab = sorted(set(tokenizer.tokenize(train_corpus)))
print("Vocab size: ", len(vocab))

Vocab size:  15772


In [170]:
# Inspect the vocabulary: split it into URLs, hashtags, emojis, user tags and
# everything else, printing how many distinct tokens fall into each category.
# `remaining` holds the tokens not yet assigned to a category.
remaining = set(vocab)

# URLs: any token containing 'http'
urls = [tok for tok in vocab if 'http' in tok]
print("Number of urls: {}".format(len(urls)))
remaining.difference_update(urls)

# Hashtags: any remaining token containing '#'
hashtags = [tok for tok in remaining if '#' in tok]
print("Number of hashtags: {}".format(len(hashtags)))
remaining.difference_update(hashtags)

# Emojis: whatever demoji finds among the remaining tokens
# (findall returns a dict keyed by emoji, so its keys are what gets removed)
emojis = demoji.findall(' '.join(remaining))
print("Number of emojis: {}".format(len(emojis)))
remaining.difference_update(emojis)

# User tags: any remaining token containing '@'
usertag = [tok for tok in remaining if '@' in tok]
print("Number of usertag: {}".format(len(usertag)))
remaining.difference_update(usertag)

# Everything left over: alphanumeric tokens and punctuation
vocab_tmp = list(remaining)
alphanumeric = vocab_tmp
print("Number of alphanumeric and punctuations: {}".format(len(alphanumeric)))

Number of urls: 917
Number of hashtags: 2930
Number of emojis: 158
Number of usertag: 1987
Number of alphanumeric and punctuations: 9780


In [171]:
'''
All urls will be replaced by the [URL] token
All usertags will be replaced by the [USER] token
All emojis will be translated to their text description, with whitespace removed, surrounded by :
    example: 💯 will be :hundredpoints:
Alphanumeric tokens and punctuation will be lowercased and otherwise left as they are
'''
# preprocess function that make the above adjustments to tweet text
def preprocess(text):
    """Normalise one tweet into a single space-separated string of tokens.

    The tweet is tokenized with ``TweetTokenizer`` and each token is rewritten:

    * a token containing ``'http'`` becomes ``'[URL]'``
    * a token containing ``'@'`` becomes ``'[USER]'``
    * a token that is a key of ``demoji.findall(text)`` becomes its description
      with all whitespace removed, wrapped in colons
      (e.g. ``'hundred points'`` -> ``':hundredpoints:'``)
    * any other token is lowercased

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The rewritten tokens joined by single spaces.
    """
    raw_tokens = TweetTokenizer().tokenize(text)
    # dict mapping each emoji found in the tweet to its text description
    emoji_names = demoji.findall(text)

    def normalise(tok):
        # Checks run in priority order: URL, then user tag, then emoji.
        if 'http' in tok:
            return '[URL]'
        if '@' in tok:
            return '[USER]'
        if tok in emoji_names:
            return ':' + ''.join(emoji_names[tok].split()) + ':'
        return tok.lower()

    return ' '.join(normalise(tok) for tok in raw_tokens)

In [172]:
# Run every tweet of both splits through `preprocess`.
cleaned_train, cleaned_test = (
    frame['Tweet text'].map(preprocess) for frame in (taskA_train, taskA_test)
)

In [176]:
# Replace the raw tweet text with the preprocessed text and save both splits as CSV.
# NOTE: this overwrites the 'Tweet text' column in place, so re-running the
# preprocessing cell after this one would clean already-cleaned text; reload the
# raw data first if the cells need to be re-executed.
taskA_train['Tweet text'] = cleaned_train
taskA_test['Tweet text'] = cleaned_test

# Create the output directory if needed so to_csv does not fail on a fresh checkout.
os.makedirs('./preprocessed/taskA', exist_ok = True)
taskA_train.to_csv('./preprocessed/taskA/train.csv', header = True, index = False)
taskA_test.to_csv('./preprocessed/taskA/test.csv', header = True, index = False)


In [177]:
# Rebuild the sorted vocabulary from the current training 'Tweet text' column and
# write it to the vocab file, one token per line.
# NOTE(review): if that column already holds preprocessed text, re-tokenizing it
# with TweetTokenizer may split placeholder tokens such as '[URL]' into several
# pieces. Confirm this matches how downstream code tokenizes the saved CSVs.
vocab = sorted(set(tokenizer.tokenize(' '.join(taskA_train['Tweet text']))))
print("Vocab size: ", len(vocab))
# Explicit UTF-8: tokens may contain non-ASCII characters, and the platform
# default encoding is not guaranteed to be able to represent them.
with open('./preprocessed/taskA/vocab.txt', 'w', encoding = 'utf-8') as f:
    for token in vocab:
        f.write(token + '\n')

Vocab size:  11172


In [55]:
# Replace emojis with text: check that demoji.findall maps each emoji found in
# a sample tweet to its text description.
tweet = (
    "I asked God to protect me from my enemies .. "
    "shortly after I started losing friends 😳💯  or #naah"
)
demoji.findall(tweet)

{'💯': 'hundred points', '😳': 'flushed face'}