### Libraries

In [1]:
import csv
import nltk
import os
import pandas as pd
import re
import time

from autocorrect import spell
from hashtag_separator import infer_spaces
from nltk.corpus import stopwords
from sklearn import svm
from sklearn.metrics import *
from sklearn.model_selection import *
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

### Data import

In [2]:
# Root folders, relative to the notebook's working directory
DATA_PATH = os.path.join('..', 'data')
OUTPUT_PATH = os.path.join('..', 'output')

# Sub-folders of DATA_PATH shared by the constants below
_DATASET_DIR = os.path.join(DATA_PATH, 'dataset')
_SENTIMENT_DIR = os.path.join(DATA_PATH, 'sentiment')

# Training tweets: one small and one full file per class
POS_TRAIN_PATH = os.path.join(_DATASET_DIR, 'pos_train.txt')
NEG_TRAIN_PATH = os.path.join(_DATASET_DIR, 'neg_train.txt')
POS_TRAIN_FULL_PATH = os.path.join(_DATASET_DIR, 'train_pos_full.txt')
NEG_TRAIN_FULL_PATH = os.path.join(_DATASET_DIR, 'train_neg_full.txt')

# Unlabelled tweets to classify
TEST_PATH = os.path.join(_DATASET_DIR, 'test_data.txt')

# Positive / negative word lists
POS_WORDS_PATH = os.path.join(_SENTIMENT_DIR, 'positive-words.txt')
NEG_WORDS_PATH = os.path.join(_SENTIMENT_DIR, 'negative-words.txt')

def _read_lines(path):
    """Return the lines of the text file at ``path`` without line terminators."""
    with open(path, 'r') as f:
        return f.read().splitlines()


# Training tweets (small and full variants), one tweet per line
pos_data = _read_lines(POS_TRAIN_PATH)
neg_data = _read_lines(NEG_TRAIN_PATH)
pos_full_data = _read_lines(POS_TRAIN_FULL_PATH)
neg_full_data = _read_lines(NEG_TRAIN_FULL_PATH)

# Tweets to classify
test_data = _read_lines(TEST_PATH)

# The word lists and the stop words are only ever used for membership tests
# (`x in ...`) inside per-token loops, so they are stored as sets: a list
# would be scanned linearly for every token of every tweet.
pos_words = frozenset(_read_lines(POS_WORDS_PATH))
neg_words = frozenset(_read_lines(NEG_WORDS_PATH))

stopwords_eng = frozenset(stopwords.words('english'))
lemmatizer = nltk.stem.WordNetLemmatizer()

### Tweet preprocessing

In [3]:
def remove_stopwords(words):
    """Return a list of the tokens in ``words`` that are not in ``stopwords_eng``."""
    return [token for token in words if token not in stopwords_eng]

def is_number(word):
    """Blank out numeric tokens.

    Returns ``''`` when ``word`` consists only of digits once the characters
    ``. , / % -`` have been removed; otherwise returns ``word`` unchanged.
    """
    stripped = word
    for separator in '.,/%-':
        stripped = stripped.replace(separator, '')
    return '' if stripped.isdigit() else word
    
def remove_numbers(words):
    """Apply ``is_number`` to every token; numeric tokens become ``''``."""
    return [is_number(token) for token in words]

def replace_hashtags(words):
    """Expand hashtag tokens into their component words.

    A token starting with ``#`` is replaced by the space-separated pieces that
    ``infer_spaces`` returns for the text after the ``#``. Every other token,
    including the empty string, is copied through unchanged.

    Args:
        words: list of string tokens.

    Returns:
        A new list of tokens.
    """
    new_words = []

    for word in words:
        # `startswith` is False for '', so empty tokens (e.g. those left by
        # remove_numbers) are kept as-is instead of being sent to infer_spaces.
        if not word.startswith('#'):
            new_words.append(word)
            continue

        new_words.extend(infer_spaces(word[1:]).split(' '))

    return new_words

# Credit: https://stackoverflow.com/questions/10072744/remove-repeating-characters-from-words
def emphasize(word):
    """Mark tokens that contain stretched characters.

    * A token starting with ``..`` becomes ``'...'``.
    * Otherwise every run of three or more identical characters is collapsed
      to a single one; if that changed the length, the collapsed token is
      passed through ``spell`` and wrapped as ``<<token>>``.
    * Tokens without such runs are returned unchanged.
    """
    if word.startswith('..'):
        return '...'

    collapsed = re.sub(r'(.)\1{2,}', r'\1', word)
    if len(collapsed) == len(word):
        return word
    return '<<' + spell(collapsed) + '>>'

def emphasize_words(words):
    """Apply ``emphasize`` to every token and return the results as a list."""
    return [emphasize(token) for token in words]

def lemmatize(word):
    """Return the lower-cased lemma of ``word``, or ``word`` itself on failure.

    Lemmatization is best-effort: if the lemmatizer raises, the token is
    returned unchanged (and not lower-cased).
    """
    try:
        return lemmatizer.lemmatize(word).lower()
    except Exception:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop a long parsing run.
        return word

def normalize_words(words):
    """Apply ``lemmatize`` to every token and return the results as a list."""
    return [lemmatize(token) for token in words]

def infer_sentiment(words):
    """Insert a sentiment marker before every token found in the word lists.

    A token starting with ``<<`` is looked up without its first and last two
    characters. A token found in ``pos_words`` is preceded by
    ``'<<positive>>'``; otherwise one found in ``neg_words`` is preceded by
    ``'<<negative>>'``. All other tokens are copied unchanged.
    """
    tagged = []

    for token in words:
        lookup = token[2:-2] if token[:2] == '<<' else token

        if lookup in pos_words:
            tagged.extend(['<<positive>>', token])
        elif lookup in neg_words:
            tagged.extend(['<<negative>>', token])
        else:
            tagged.append(token)

    return tagged

def emphasize_punctuation(words):
    """Replace each run of two or more ``!`` / ``?`` tokens with ``'<<emphasis>>'``.

    A single ``!`` or ``?`` token is kept. When nothing is replaced, the input
    list itself is returned; otherwise a new list is returned.
    """
    marks = ('!', '?')
    total = len(words)
    result = []
    replaced = False
    pos = 0

    while pos < total:
        token = words[pos]
        if token not in marks:
            result.append(token)
            pos += 1
            continue

        # Find the end of the run of punctuation tokens starting at `pos`
        run_end = pos + 1
        while run_end < total and words[run_end] in marks:
            run_end += 1

        if run_end - pos >= 2:
            result.append('<<emphasis>>')
            replaced = True
        else:
            result.append(token)
        pos = run_end

    return result if replaced else words

# (replacement token, emoticon spellings). Spellings are regex fragments:
# parentheses are widened to bracket classes by _emoticon_regex, and any other
# regex metacharacter must already be escaped here (hence r':\*').
_EMOTICONS = [
    ('<<positive>>', [':-)', ':)', '(:', '(-:',
                      ':-D', ':D', 'X-D', 'XD', 'xD',
                      '<3', r':\*', ';-)', ';)', ';-D', ';D', '(;', '(-;']),
    # The mirrored sad faces are '):' and ')-:'. The happy '(:' / '(-:' were
    # listed here before, but the positive pass had already consumed them.
    ('<<negative>>', [':-(', ':(', '):', ')-:', ':,(',
                      ":'(", ':"(', ':((']),
]


def _emoticon_regex(spellings):
    """Compile one alternation that matches any of the given emoticon spellings."""
    alternatives = [
        # Let ')' also match '}' / ']' and '(' also match '{' / '['
        text.replace(')', r'[)}\]]').replace('(', r'[({\[]')
        for text in spellings
    ]
    return re.compile('(' + '|'.join(alternatives) + ')')


# Compiled once at definition time instead of on every call
_EMOTICON_PATTERNS = [
    (repl, _emoticon_regex(spellings)) for (repl, spellings) in _EMOTICONS
]


def replace_emoticons(tweet):
    """Replace emoticons in ``tweet`` with space-padded sentiment markers.

    Positive emoticons become ``' <<positive>> '`` and negative ones
    ``' <<negative>> '``; positive patterns are applied first.

    NOTE(review): matching is case-sensitive, and ``parse_tweet`` lower-cases
    the tweet before calling this, so the upper-case spellings (``:D``,
    ``XD``, ...) cannot match on that path. Consider ``re.IGNORECASE``.

    Args:
        tweet: the tweet text.

    Returns:
        The tweet with every matched emoticon replaced.
    """
    for (repl, pattern) in _EMOTICON_PATTERNS:
        tweet = pattern.sub(' ' + repl + ' ', tweet)

    return tweet

def lower(tweet):
    """Return ``tweet`` converted to lower case."""
    lowered = tweet.lower()
    return lowered

In [4]:
def parse_tweet(t):
    """Run the full preprocessing pipeline on one tweet and return the result.

    The tweet is lower-cased, emoticons are replaced, and the text is split on
    single spaces; the token-level steps are then applied in the order listed
    below and the tokens are joined with single spaces again.
    """
    text = replace_emoticons(lower(t))
    tokens = text.split(' ')

    # Order matters: each step consumes the output of the previous one
    token_steps = (
        remove_stopwords,
        remove_numbers,
        replace_hashtags,
        emphasize_words,
        normalize_words,
        infer_sentiment,
        emphasize_punctuation,
    )
    for step in token_steps:
        tokens = step(tokens)

    return ' '.join(tokens)

def parse_data(data, report_every=100000):
    """Apply ``parse_tweet`` to every tweet in ``data``, printing progress.

    Args:
        data: sized iterable of tweet strings.
        report_every: print a ``done/total elapsed_seconds`` line each time
            this many tweets have been reached; a falsy value disables the
            progress lines.

    Returns:
        A list with the parsed tweets, in input order.
    """
    parsed = []
    start_time = time.time()
    total = len(data)

    for count, tweet in enumerate(data, start=1):
        if report_every and count % report_every == 0:
            print(str(count) + '/' + str(total), time.time() - start_time)
        parsed.append(parse_tweet(tweet))

    print('Total time (s): ' + str(time.time() - start_time))
    return parsed

In [187]:
# Class sizes of the full training set
n_positive = len(pos_full_data)
n_negative = len(neg_full_data)
print('Positives: ' + str(n_positive))
print('Negatives: ' + str(n_negative))

# Parse both classes and report the overall wall-clock time in seconds
parse_started = time.time()
parsed_pos = parse_data(pos_full_data)
parsed_neg = parse_data(neg_full_data)
print(time.time() - parse_started)

Positives: 1250000
Negatives: 1250000
100000/1250000 167.78906798362732
200000/1250000 310.06894183158875
300000/1250000 452.5985391139984
400000/1250000 585.5992059707642
500000/1250000 725.152116060257
600000/1250000 876.9700930118561
700000/1250000 1025.842563867569
800000/1250000 1166.373085975647
900000/1250000 1304.1353511810303
1000000/1250000 1447.7323589324951
1100000/1250000 1592.6861300468445
1200000/1250000 1732.7914249897003
Total time (s): 1797.9385318756104
100000/1250000 163.28157210350037
200000/1250000 319.22611021995544
300000/1250000 478.1678581237793
400000/1250000 634.2569048404694
500000/1250000 793.6341848373413
600000/1250000 949.1574530601501
700000/1250000 1098.9475939273834
800000/1250000 1253.512937784195
900000/1250000 1407.8736789226532
1000000/1250000 1562.4150609970093
1100000/1250000 1744.9414188861847
1200000/1250000 1911.5277769565582
Total time (s): 1990.4139518737793
3788.3559160232544


In [5]:
# CSV file in OUTPUT_PATH that the parsed, labelled tweets are written to and
# later read back from.
PARSED_TWEETS_PATH = os.path.join(OUTPUT_PATH, 'parsed_tweets.csv')

In [188]:
# One frame per class: a 'text' column with the parsed tweet and a constant
# 'label' column. Passing `columns=` also works when a list is empty.
pos_df = pd.DataFrame(parsed_pos, columns=['text']).assign(label='pos')
neg_df = pd.DataFrame(parsed_neg, columns=['text']).assign(label='neg')

# Shuffle the combined rows. The shuffle is seeded so that re-running the
# cell writes the same file.
df = (
    pd.concat([pos_df, neg_df])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

df.to_csv(PARSED_TWEETS_PATH, index=False)

In [6]:
# Reload the parsed tweets from disk so the slow parsing cells need not be re-run.
# NOTE(review): with its default settings read_csv parses empty fields as NaN,
# which is presumably why the 'text' column is cast with astype(str) below - confirm.
df = pd.read_csv(PARSED_TWEETS_PATH)

In [7]:
# 1-D arrays of the tweet texts (cast to str) and of their labels.
# `.values` replaces the deprecated DataFrame.as_matrix.
tweets_flat = df['text'].values.astype(str)
tweets_sentiment_flat = df['label'].values

In [15]:
# Hold out 20% of the tweets, with a fixed seed.
# NOTE: the fitting cells visible in this notebook train on the full
# tweets_flat / y_tweets rather than on this split.
X_train, X_test, y_train, y_test = train_test_split(
    tweets_flat,
    tweets_sentiment_flat,
    test_size=0.20,
    random_state=42,
)

## Classification

In [8]:
# Numeric targets: 1 for 'pos', -1 for every other label
y_tweets = [1 if label == 'pos' else -1 for label in tweets_sentiment_flat]

In [9]:
from sklearn.neural_network import MLPClassifier

# TF-IDF features feeding an MLP with a single hidden layer of 64 units
nn_vectorizer = TfidfVectorizer(min_df=5, max_df=0.95, sublinear_tf=True)
nn_classifier = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(64,),
    random_state=1,
)

nn_pipe = Pipeline([
    ('vec', nn_vectorizer),
    ('nn', nn_classifier),
])

In [15]:
# Number of tweets available for training (tweets_flat is a 1-D array)
tweets_flat.shape[0]

2500000

In [10]:
# Fit the MLP pipeline on all tweets and print the elapsed seconds
nn_fit_started = time.time()
nn_pipe.fit(tweets_flat, y_tweets)
print(time.time() - nn_fit_started)

1249.0632450580597


In [47]:
from sklearn.svm import LinearSVC

# TF-IDF features feeding a linear SVM
svm_steps = [
    ('vec', TfidfVectorizer(min_df=5, max_df=0.95, sublinear_tf=True)),
    ('svm', LinearSVC()),
]
pipe = Pipeline(svm_steps)

# Candidate values for the SVM's C parameter
param_grid = {'svm__C': [0.1, 1, 5, 10]}

# 2-fold grid search over the candidates
CV_pipe = GridSearchCV(pipe, param_grid=param_grid, cv=2)

In [72]:
# Import only the estimator this cell uses instead of a wildcard import,
# so it is clear where the name comes from.
from sklearn.naive_bayes import MultinomialNB

# TF-IDF features feeding a multinomial naive Bayes classifier
pipe_nb = Pipeline([
    ('vec', TfidfVectorizer(min_df=5, max_df=0.95, sublinear_tf=True)),
    ('bayes', MultinomialNB())
])

# The grid is empty, so this only cross-validates (5 folds) the single default configuration
CV_pipe_nb = GridSearchCV(pipe_nb, param_grid={}, cv=5)

In [73]:
# Cross-validate / fit the naive Bayes pipeline and print the elapsed seconds
nb_fit_started = time.time()
CV_pipe_nb.fit(tweets_flat, y_tweets)
print(time.time() - nb_fit_started)

543.0757880210876


In [48]:
# Run the SVM grid search and print the elapsed seconds
svm_fit_started = time.time()
CV_pipe.fit(tweets_flat, y_tweets)
print(time.time() - svm_fit_started)

1389.7558951377869


In [11]:
# NOTE: these are the same tweets nn_pipe was fitted on, so the report below
# describes training performance, not performance on held-out data.
y_pred = nn_pipe.predict(tweets_flat)
report = classification_report(y_tweets, y_pred)
print(report)

             precision    recall  f1-score   support

         -1       0.83      0.79      0.81   1250000
          1       0.80      0.84      0.82   1250000

avg / total       0.81      0.81      0.81   2500000



In [12]:
# Each test line is "<id>,<tweet>". Split on the first comma only, so commas
# inside the tweet text are preserved.
cl_data = []

for line in test_data:
    id_, tweet = line.split(',', 1)
    cl_data.append([id_, tweet])

# Passing `columns=` also works when the test set is empty
df = pd.DataFrame(cl_data, columns=['Id', 'Tweet'])
df.head()

Unnamed: 0,Id,Tweet
0,1,sea doo pro sea scooter ( sports with the port...
1,2,<user> shucks well i work all week so now i ca...
2,3,i cant stay away from bug thats my baby
3,4,<user> no ma'am ! ! ! lol im perfectly fine an...
4,5,"whenever i fall asleep watching the tv , i alw..."


In [13]:
# Apply the training preprocessing to the test tweets.
# This overwrites the raw text in place, so re-running the cell would parse
# already-parsed tweets.
df['Tweet'] = parse_data(df['Tweet'])
df['Tweet'].head()

Total time (s): 16.88913369178772


0    sea doo pro sea scooter ( sport <<positive>> p...
1    <user> shuck <<positive>> well <<positive>> wo...
2           cant stay away <<negative>> bug thats baby
3    <user> ma'am <<emphasis>> lol im <<positive>> ...
4    whenever <<negative>> fall asleep watching tv ...
Name: Tweet, dtype: object

In [14]:
# Predict labels for the parsed test tweets.
# `.values` replaces the deprecated as_matrix().
y_pred = nn_pipe.predict(df['Tweet'].values)

In [15]:
OUTPUT_FILE_PATH = os.path.join(OUTPUT_PATH, 'submission.csv')

# One row per test tweet: its Id (used as the index) and the predicted label.
# `.values` replaces the deprecated as_matrix().
res_df = pd.DataFrame({'Id': df['Id'].values,
                       'Prediction': y_pred})

res_df = res_df.set_index('Id')
res_df.to_csv(OUTPUT_FILE_PATH)

#### Neural net

In [75]:
from sklearn.neural_network import MLPClassifier



#### Old code

In [22]:
# Walk-through of an earlier version of the preprocessing pipeline on one sample tweet.
# NOTE(review): the camelCase helpers called below (removeNumbers, replaceHashtags, ...)
# are not defined in the visible part of this notebook, which defines snake_case
# functions instead; this cell presumably fails on a fresh kernel - confirm before re-running.
t = '<user> i .... this sucks ? ! ! haaappyyyyy dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me #believe 15\n'
print('original:\n', t)
# Drop the newline, then tokenize on single spaces
t = t.replace('\n', '')
words = t.split(' ')
print(words)

# Token-level steps, applied in this order
words = removeNumbers(words)
words = replaceHashtags(words)
words = emphasizeWords(words)
words = emphasizePunctiation(words)
words = removeStopwords(words)
words = normalizeWords(words)
words = checkPositiveNegative(words)
print(words)

# Re-assemble the processed tokens into a single string
t = ' '.join(words)
print('\nafter:\n', t)

original:
 <user> i .... this sucks ? ! ! haaappyyyyy dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me #believe 15

['<user>', 'i', '....', 'this', 'sucks', '?', '!', '!', 'haaappyyyyy', 'dunno', 'justin', 'read', 'my', 'mention', 'or', 'not', '.', 'only', 'justin', 'and', 'god', 'knows', 'about', 'that', ',', 'but', 'i', 'hope', 'you', 'will', 'follow', 'me', '#believe', '15']
['<user>', '...', 'negatively', 'suck', '<emphasis>', 'positively', '<<happy>>', 'dunno', 'justin', 'read', 'mention', '.', 'justin', 'god', 'know', ',', 'hope', 'follow', '<hashtag>', 'believe', '<number>']

after:
 <user> ... negatively suck <emphasis> positively <<happy>> dunno justin read mention . justin god know , hope follow <hashtag> believe <number>
