In [77]:
#import
import numpy as np
import nltk
import string
from nltk.tokenize import TweetTokenizer
from collections import Counter
import pandas as pd
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import gensim
import pickle
import time
import csv

In [12]:
# One tokenizer instance shared by every call: tokenize() runs once per tweet,
# so building a fresh TweetTokenizer each time is loop-invariant work.
_TWEET_TOKENIZER = TweetTokenizer()


def tokenize(text):
    """
    Split a tweet into tokens with NLTK's TweetTokenizer.
    Arguments: text (string containing one tweet)
    Returns: list of token strings
    """
    return _TWEET_TOKENIZER.tokenize(text)

In [110]:
def create_csv_submission(ids, y_pred, name):
    """
    Creates an output file in csv format for submission to kaggle
    Arguments: ids (event ids associated with each prediction)
               y_pred (predicted class labels)
               name (string prefix of the .csv output file to be created;
                     the current timestamp and '.csv' are appended to it)
    """
    # The timestamp suffix gives every call its own file name.
    name += str(time.time()) + '.csv'
    # newline='' hands line-ending control to the csv module; without it,
    # text-mode newline translation turns the writer's '\r\n' into '\r\r\n'
    # on Windows, which shows up as a blank row after every record.
    with open(name, 'w', newline='') as csvfile:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        # Both columns are cast to int so float ids/labels are written as integers.
        writer.writerows(
            {'Id': int(row_id), 'Prediction': int(label)}
            for row_id, label in zip(ids, y_pred)
        )

In [131]:
# Load the pickled datasets; the result is indexed below as
# data[0] (positive tweets), data[1] (negative tweets) and data[2] (test tweets).
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open('dumped_files/corrected_datasets_pos_neg_test.p', 'rb') as datasets_file:
    data = pickle.load(datasets_file)

In [112]:
# Load the token dictionary into its own name only. The previous chained
# assignment (`final_dict = data = ...`) also rebound `data`, clobbering the
# datasets that the cells below read through data[0], data[1] and data[2].
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open('dumped_files/final_tokens_dictionary.p', 'rb') as dictionary_file:
    final_dict = pickle.load(dictionary_file)

In [113]:
# Number of dictionary entries vs. number of distinct values they map to
n_entries = len(final_dict)
n_distinct_values = len(set(final_dict.values()))
(n_entries, n_distinct_values)

(604480, 387494)

In [133]:
# De-duplicate the training tweets. sorted() is used instead of list() because
# the iteration order of a set of strings changes from one interpreter start to
# the next (string hash randomisation), which made every later step that
# depends on tweet order irreproducible across kernel restarts.
pos = sorted(set(data[0]))
neg = sorted(set(data[1]))
# The test tweets are kept as-is (duplicates and order included).
test = data[2]

In [134]:
# Single WordNet lemmatizer instance, reused for every tweet in the cells below.
lemmatizer = WordNetLemmatizer()

In [135]:
# Tokenize and lemmatize each positive tweet (one token list per tweet),
# then count how often every lemma occurs across all positive tweets.
pos_tokens = [
    [lemmatizer.lemmatize(word) for word in tokenize(text)]
    for text in pos
]
pos_counter = Counter()
for lemmas in pos_tokens:
    pos_counter.update(lemmas)

In [136]:
# Tokenize and lemmatize each negative tweet (one token list per tweet),
# then count how often every lemma occurs across all negative tweets.
neg_tokens = [
    [lemmatizer.lemmatize(word) for word in tokenize(text)]
    for text in neg
]
neg_counter = Counter()
for lemmas in neg_tokens:
    neg_counter.update(lemmas)

In [137]:
# (word, frequency) pairs for each class, most frequent first
pos_most, neg_most = (
    counter.most_common() for counter in (pos_counter, neg_counter)
)

In [138]:
# word -> frequency lookups built from the (word, frequency) pairs
pos_dict = dict(pos_most)
neg_dict = dict(neg_most)

In [139]:
# Every word seen in either class, with its frequency in each class
# (0 when the word never occurs in that class).
all_words = list(set(list(pos_dict) + list(neg_dict)))
pos_frq = [pos_dict.get(word, 0) for word in all_words]
neg_frq = [neg_dict.get(word, 0) for word in all_words]

In [140]:
# One row per word with its positive / negative frequencies
df = pd.DataFrame(data={'word': all_words, 'pos': pos_frq, 'neg': neg_frq})

In [141]:
# Percentage (2 decimals) of a word's occurrences that come from positive
# tweets, and the complementary negative percentage. Computed column-wise:
# the previous row-by-row apply(axis=1) ran a Python lambda for every word in
# the vocabulary, which is far slower for the same numbers.
df['pos_ratio'] = (100 * df['pos'] / (df['pos'] + df['neg'])).round(2)
df['neg_ratio'] = 100 - df['pos_ratio']

In [142]:
# Index the table by word (the 'word' column becomes the index), then show the last 20 rows.
# Note: this cell is not re-runnable as written - after the first run 'word'
# is no longer a column, so set_index('word') fails on a second execution.
df = df.set_index('word')
df.tail(20)

Unnamed: 0_level_0,neg,pos,pos_ratio,neg_ratio
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
rosamund,1,0,0.0,100.0
priscilla,29,20,40.82,59.18
simoncelli,1,1,50.0,50.0
uproot,6,1,14.29,85.71
parcel,48,53,52.48,47.52
bucket,252,228,47.5,52.5
muschamp,1,0,0.0,100.0
expose,214,68,24.11,75.89
sonofabitch,9,0,0.0,100.0
assemblage,6,0,0.0,100.0


In [143]:
# abs_diff: how far a word is from a 50/50 split between the classes (0 = perfectly balanced).
# total:    overall number of occurrences of the word.
# Both are computed column-wise instead of with a per-row apply(axis=1);
# as a side effect `total` stays an integer count instead of being upcast to float.
df['abs_diff'] = (df['pos_ratio'] - df['neg_ratio']).abs()
df['total'] = df['pos'] + df['neg']

In [144]:
# Most balanced words first; among equally balanced words, most frequent first
df.sort_values(by=['abs_diff', 'total'], ascending=[True, False])

Unnamed: 0_level_0,neg,pos,pos_ratio,neg_ratio,abs_diff,total
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
picture,6499,6498,50.0,50.0,0.0,12997.0
yeh,1550,1550,50.0,50.0,0.0,3100.0
headed,522,522,50.0,50.0,0.0,1044.0
dating,469,469,50.0,50.0,0.0,938.0
loo,331,331,50.0,50.0,0.0,662.0
qua,302,302,50.0,50.0,0.0,604.0
leo,236,236,50.0,50.0,0.0,472.0
thalia,187,187,50.0,50.0,0.0,374.0
darling,157,157,50.0,50.0,0.0,314.0
kp,146,146,50.0,50.0,0.0,292.0


In [145]:
# Lower-cased stop words, one per line in the file, plus three extra tokens.
# The file is read inside a `with` block so the handle is closed right away
# (the previous bare open() left it open).
with open('data/stopwords.txt') as stopwords_file:
    stop_words = [line.rstrip('\n').lower() for line in stopwords_file]
stop_words += ['user', 'url', 'rt']

In [146]:
# Copy the index (the words) into a regular column so that later cells can
# read the word of a row as row['word_'].
df['word_'] = df.index

In [125]:
# Rows whose word is a stop word. Series.isin does the membership test in one
# vectorised pass; the previous apply(axis=1) called a Python lambda per row and
# scanned the stop_words list linearly each time.
stop_df = df.loc[df['word_'].isin(stop_words)]

In [126]:
# Stop words ordered from most to least balanced between the two classes
stop_df.sort_values(by='abs_diff', ascending=True)

Unnamed: 0_level_0,neg,pos,pos_ratio,neg_ratio,abs_diff,total,word_
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
what,114135,114025,49.98,50.02,0.04,228160.0,what
after,10126,10043,49.79,50.21,0.42,20169.0,after
be,6806,6880,50.27,49.73,0.54,13686.0,be
my,255742,252745,49.71,50.29,0.58,508487.0,my
themselves,294,290,49.66,50.34,0.68,584.0,themselves
they,28936,28471,49.59,50.41,0.82,57407.0,they
am,104757,102878,49.55,50.45,0.90,207635.0,am
being,23359,22881,49.48,50.52,1.04,46240.0,being
do,93985,90265,48.99,51.01,2.02,184250.0,do
to,341120,326481,48.90,51.10,2.20,667601.0,to


In [127]:
# Stop words whose positive/negative percentages differ by less than 20 points
stop_df[stop_df['abs_diff'] < 20]

Unnamed: 0_level_0,neg,pos,pos_ratio,neg_ratio,abs_diff,total,word_
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
have,110465,104187,48.54,51.46,2.92,214652.0,have
m,1262,1151,47.70,52.30,4.60,2413.0,m
by,42010,35427,45.75,54.25,8.50,77437.0,by
at,60308,57411,48.77,51.23,2.46,117719.0,at
she,25201,32088,56.01,43.99,12.02,57289.0,she
do,93985,90265,48.99,51.01,2.02,184250.0,do
our,11178,14147,55.86,44.14,11.72,25325.0,our
before,7379,7918,51.76,48.24,3.52,15297.0,before
herself,318,234,42.39,57.61,15.22,552.0,herself
into,6887,5165,42.86,57.14,14.28,12052.0,into


In [128]:
# Stop words seen fewer than 20 times, plus a few platform-specific tokens
rare_stop_words = stop_df.loc[stop_df['total'] < 20].index
del_words = list(rare_stop_words) + ['user', 'url', 'rt', 'twitter', 'facebook']

In [129]:
# Keep only tokens longer than two characters, then drop tweets that have no
# tokens left. The emptiness check must run AFTER the length filter: checking
# the unfiltered list (as before) let tweets made only of short tokens through
# as empty lists, and summing the word vectors of an empty list yields the
# scalar 0 instead of a vector.
pos_tokens = [
    kept
    for kept in ([t for t in tokens if len(t) > 2] for tokens in pos_tokens)
    if kept
]

In [130]:
# Keep only tokens longer than two characters, then drop tweets that have no
# tokens left. The emptiness check must run AFTER the length filter: checking
# the unfiltered list (as before) let tweets made only of short tokens through
# as empty lists, and summing the word vectors of an empty list yields the
# scalar 0 instead of a vector.
neg_tokens = [
    kept
    for kept in ([t for t in tokens if len(t) > 2] for tokens in neg_tokens)
    if kept
]

Exception ignored in: 'zmq.backend.cython.message.Frame.__dealloc__'
Traceback (most recent call last):
  File "zmq/backend/cython/checkrc.pxd", line 12, in zmq.backend.cython.checkrc._check_rc (zmq\backend\cython\message.c:4294)
    PyErr_CheckSignals()
KeyboardInterrupt


In [147]:
# Tokenize and lemmatize every test tweet (one token list per tweet, order preserved).
# NOTE(review): unlike the training tokens, no length-based token filter is
# applied here - confirm that this train/test difference is intended.
test_tokens = [
    [lemmatizer.lemmatize(word) for word in tokenize(text)]
    for text in test
]

In [98]:
# Give token-less test tweets a single placeholder token so that every test
# tweet still has at least one token.
test_tokens = [tokens if tokens else ['empty'] for tokens in test_tokens]

### Word2Vec

In [159]:
# Train Word2Vec embeddings on the token lists of all three sets (positive,
# negative and test tweets) with 300-dimensional vectors and a context window of 5.
# NOTE(review): min_count=1 presumably keeps every token in the vocabulary so
# that later per-token lookups cannot miss - confirm against the gensim docs.
# NOTE(review): `size=` is the keyword of older gensim releases; newer ones
# call it `vector_size` - check the installed version before re-running.
model = gensim.models.Word2Vec(pos_tokens + neg_tokens + test_tokens, size=300, window=5, min_count=1, workers=4)

In [160]:
# Sanity check of the embeddings: nearest neighbours of 'friend'.
# Queried through the model's keyed vectors (`model.wv`); calling most_similar
# directly on the Word2Vec model is deprecated in gensim.
model.wv.most_similar('friend')

[('roommate', 0.5838409662246704),
 ('classmate', 0.5687693953514099),
 ('cousin', 0.5555226802825928),
 ('sister', 0.5229045152664185),
 ('girlfriend', 0.5219735503196716),
 ('roomie', 0.49464720487594604),
 ('husband', 0.47823768854141235),
 ('boyfriend', 0.4758955240249634),
 ('ie', 0.4735199809074402),
 ('teammate', 0.46988964080810547)]

In [161]:
def tweet2vector(tweet_tokens, model):
    """
    Represent a tweet as the sum of the vectors of its tokens.
    Arguments: tweet_tokens (iterable of token strings)
               model (object returning a vector for model[token]; it must also
                      expose `vector_size`, which is read only when the tweet
                      has no tokens)
    Returns: the element-wise sum of the token vectors, or a zero vector of
             length model.vector_size when tweet_tokens is empty
    """
    vectors = [model[word] for word in tweet_tokens]
    if not vectors:
        # sum([]) would return the int 0, which breaks stacking the per-tweet
        # results into a 2-D feature matrix; return a proper zero vector instead.
        return np.zeros(model.vector_size, dtype=np.float32)
    return sum(vectors)

In [162]:
# One summed word-vector per training tweet, stacked per class
pos_vec = np.asarray(
    [tweet2vector(tokens, model) for tokens in pos_tokens]
)
neg_vec = np.asarray(
    [tweet2vector(tokens, model) for tokens in neg_tokens]
)



In [163]:
# Feature matrix: positive tweet vectors first, negative tweet vectors after them
X = np.vstack([pos_vec, neg_vec])

In [164]:
# Labels in the same order as the rows of X: 1 for positive tweets, -1 for negative ones
y = [1] * len(pos_vec) + [-1] * len(neg_vec)

In [165]:
# One summed word-vector per test tweet, in the original test order
test_vec = np.asarray(
    [tweet2vector(tokens, model) for tokens in test_tokens]
)

### Learning

In [166]:
# Multi-layer perceptron with two hidden layers of 30 units each, L-BFGS solver,
# regularisation strength alpha=1e-5 and a fixed random_state.
# NOTE(review): this import sits apart from the import cell at the top of the
# notebook - consider moving it there.
# NOTE(review): the recorded fit below was interrupted; the choice of solver
# may be worth revisiting for a training set of this size - confirm against
# the scikit-learn documentation.
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(30, 30), random_state=1)

In [167]:
# Train the classifier on the tweet vectors X and their labels y.
# The recorded run of this cell ended with a KeyboardInterrupt.
clf.fit(X, y)

KeyboardInterrupt: 

In [None]:
# Predict a label for every test tweet vector.
predictions = clf.predict(test_vec)

In [None]:
# Write the predictions with 1-based ids; the output file name starts with
# 'submissions/prediction'.
submission_ids = np.arange(len(predictions)) + 1
create_csv_submission(submission_ids, predictions, 'submissions/prediction')