### Libraries

In [30]:
import csv
import nltk
import os
import pandas as pd
import re
import time

from autocorrect import spell
from hashtag_separator import infer_spaces
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from textblob.classifiers import NaiveBayesClassifier

### Data import

In [48]:
DATA_PATH = os.path.join('..', 'data')
OUTPUT_PATH = os.path.join('..', 'output')

POS_TRAIN_PATH = os.path.join(DATA_PATH, 'dataset', 'pos_train.txt')
NEG_TRAIN_PATH = os.path.join(DATA_PATH, 'dataset', 'neg_train.txt')

POS_WORDS_PATH = os.path.join(DATA_PATH, 'sentiment', 'positive-words.txt')
NEG_WORDS_PATH = os.path.join(DATA_PATH, 'sentiment', 'negative-words.txt')

with open(POS_TRAIN_PATH, 'r') as f:
    pos_data = f.read().splitlines()
with open(NEG_TRAIN_PATH, 'r') as f:
    neg_data = f.read().splitlines()

stopwords_eng = stopwords.words('english')

# Credit: https://github.com/jeffreybreen/twitter-sentiment-analysis-tutorial-201107
with open(POS_WORDS_PATH, 'r') as f:
    pos_words = f.read().splitlines()
with open(NEG_WORDS_PATH, 'r') as f:
    neg_words = f.read().splitlines()
    
lemmatizer = nltk.stem.WordNetLemmatizer()

### Tweet preprocessing

In [42]:
def remove_stopwords(words):
    return list(filter(lambda x: x not in stopwords_eng, words))

def is_number(word):
    original_word = word
    special_chars = ['.', ',', '/', '%', '-']
    for char in special_chars:
        word = word.replace(char, '')
    if word.isdigit():
        return '<number>'
    else:
        return original_word
    
def remove_numbers(words):
    return list(map(is_number, words))

def replace_hashtags(words):
    new_words = []
    
    for word in words:
        if word and word[0]!='#':
            new_words.append(word)
            continue
        
        new_words.append('<hashtag>')
        for hash_word in infer_spaces(word[1:]).split(' '):
            new_words.append(hash_word)
    
    return new_words

# Credit: https://stackoverflow.com/questions/10072744/remove-repeating-characters-from-words
def emphasize(word):
    if (word[:2]=='..'):
        return '...'
    
    new_word = re.sub(r'(.)\1{2,}', r'\1', word)
    if len(new_word) != len(word):
        return '<<' + spell(new_word) + '>>'
    else:
        return word

def emphasize_words(words):
    return list(map(emphasize, words))

def lemmatize(word):
    try:
        temp = lemmatizer.lemmatize(word).lower()
        return temp
    except:
        return word

def normalize_words(words):
    return list(map(lemmatize, words))

def infer_sentiment(words):
    new_words = []
    
    for word in words:
        if (word[:2]=='<<'):
            check_word = word[2:-2]
        else:
            check_word = word
        
        if check_word in pos_words:
            new_words += ['positive', word]
        elif check_word in neg_words:
            new_words += ['negative', word]
        else:
            new_words.append(word)
    
    return new_words

def emphasize_punctiation(words):
    special_chars = ['!', '?']
    i = 1
    
    while (i<len(words)):
        word1 = words[i-1]
        word2 = words[i]
        if (word1 in special_chars and word2 in special_chars):
            start = i-1
            while (i+1<len(words) and words[i+1] in special_chars):
                i += 1
            words = words[:start] + ['<emphasis>'] + words[i+1:]
            i = start
        
        i += 1
    
    return words

In [46]:
def parse_tweet(t):
    words = t.split(' ')
    words = remove_stopwords(words)
    words = remove_numbers(words)
    words = replace_hashtags(words)
    words = emphasize_words(words)
    words = normalize_words(words)
    words = infer_sentiment(words)
    words = emphasize_punctiation(words)
    tweet = ' '.join(words)
    return tweet

def parse_data(data):
    parsed = []
    start_time = time.time()
    
    for i, t in enumerate(data):
        if (i+1)%5000==0:
            print(str(i+1)+'/'+str(len(data)), time.time()-start_time)
        parsed.append(parse_tweet(t))
    
    print('Total time (s): ' + str(time.time()-start_time))
    return parsed

In [47]:
parsed_pos = parse_data(pos_data[0:5000])
parsed_neg = parse_data(neg_data[0:5000])

5000/5000 6.638222932815552
Total time (s): 6.638969898223877
5000/5000 6.084514141082764
Total time (s): 6.0858073234558105


In [51]:
pos_df = pd.DataFrame(parsed_pos)
pos_df.columns = ['text']
pos_df['label'] = 'pos'

neg_df = pd.DataFrame(parsed_neg)
neg_df.columns = ['text']
neg_df['label'] = 'neg'

df = pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)

PARSED_TWEETS_PATH = os.path.join(OUTPUT_PATH, 'parsed_tweets.csv')
df.to_csv(PARSED_TWEETS_PATH, index=False)

In [36]:
with open('parsed_test.csv') as f:
    cl = NaiveBayesClassifier(f, format='csv')

In [37]:
cl.classify('I suck dick')

'pos'

In [38]:
with open('./test_data.txt', 'r') as f:
    test_data = f.read().splitlines()

In [48]:
cl_data = []
start_time = time.time()
for i, l in enumerate(test_data):
    if (i>0 and i%500==0):
        print(str(i)+'/'+str(len(test_data)), time.time()-start_time)
    l = l.split(',', 1)
    id_ = l[0]
    tweet = l[1]
    cl_data.append([id_, cl.classify(tweet)])
print(time.time()-start_time)
df = pd.DataFrame(cl_data)
df.columns = ['Id', 'Prediction']

100/10000 7.3515729904174805
200/10000 14.485529899597168
300/10000 22.302922010421753
400/10000 29.161931037902832
500/10000 36.11663603782654
600/10000 43.951600074768066
700/10000 51.47758078575134
800/10000 58.543980836868286
900/10000 65.5224220752716
1000/10000 72.61669516563416
1100/10000 79.66771697998047
1200/10000 86.70493912696838
1300/10000 93.74563312530518
1400/10000 100.66223812103271
1500/10000 107.4223120212555
1600/10000 114.47362279891968
1700/10000 121.5224449634552
1800/10000 128.64934515953064
1900/10000 135.6797730922699
2000/10000 142.72301506996155
2100/10000 150.5981569290161
2200/10000 157.54157304763794
2300/10000 165.45492315292358
2400/10000 174.4098241329193
2500/10000 181.92578792572021
2600/10000 189.03197503089905
2700/10000 197.71762108802795
2800/10000 205.74543809890747
2900/10000 213.60946488380432
3000/10000 224.17110681533813
3100/10000 232.73848605155945
3200/10000 240.20346093177795
3300/10000 248.08762311935425
3400/10000 255.29530882835388
35

In [54]:
df['Prediction'] = df['Prediction'].apply(lambda x : 1 if x == 'pos' else -1)

In [55]:
df.to_csv('submission.csv', index=False)

In [None]:
def classify(model, test_data):
    cl_data = []
    for l in test_data:
        l = l.split(',', 1)
        id_ = l[0]
        tweet = l[1]
        cl_data.append([id_, model.classify(tweet)])
    df = pd.DataFrame(cl_data)
    df.columns = ['Id', 'Prediction']
    df.to_csv('submission.csv', index=False)

In [22]:
t = '<user> i .... this sucks ? ! ! haaappyyyyy dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me #believe 15\n'
print('original:\n', t)
t = t.replace('\n', '')
words = t.split(' ')
print(words)

words = removeNumbers(words)
words = replaceHashtags(words)
words = emphasizeWords(words)
words = emphasizePunctiation(words)
words = removeStopwords(words)
words = normalizeWords(words)
words = checkPositiveNegative(words)
print(words)

t = ' '.join(words)
print('\nafter:\n', t)

original:
 <user> i .... this sucks ? ! ! haaappyyyyy dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me #believe 15

['<user>', 'i', '....', 'this', 'sucks', '?', '!', '!', 'haaappyyyyy', 'dunno', 'justin', 'read', 'my', 'mention', 'or', 'not', '.', 'only', 'justin', 'and', 'god', 'knows', 'about', 'that', ',', 'but', 'i', 'hope', 'you', 'will', 'follow', 'me', '#believe', '15']
['<user>', '...', 'negatively', 'suck', '<emphasis>', 'positively', '<<happy>>', 'dunno', 'justin', 'read', 'mention', '.', 'justin', 'god', 'know', ',', 'hope', 'follow', '<hashtag>', 'believe', '<number>']

after:
 <user> ... negatively suck <emphasis> positively <<happy>> dunno justin read mention . justin god know , hope follow <hashtag> believe <number>
