In [1]:
#import
import numpy as np
import nltk
import string
from nltk.tokenize import TweetTokenizer
from collections import Counter
import pandas as pd
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import gensim
import pickle
import time
import csv



In [2]:
def tokenize(text):
    """Split ``text`` into a list of tokens using a new NLTK ``TweetTokenizer``."""
    return TweetTokenizer().tokenize(text)

In [None]:
def create_csv_submission(ids, y_pred, name):
    """
    Creates an output file in csv format for submission to kaggle.

    The current Unix timestamp and the '.csv' extension are appended to
    `name` to form the output path.

    Arguments: ids (event ids associated with each prediction)
               y_pred (predicted class labels)
               name (string prefix, optionally including a directory, of the
                     .csv output file to be created)
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    name += str(time.time()) + '.csv'
    # Create the target directory if needed so the write does not fail on a
    # missing folder after all the expensive work has been done.
    out_dir = os.path.dirname(name)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # newline='' is what the csv module expects; without it, platforms that
    # translate '\n' on write end up with a blank line after every row.
    with open(name, 'w', newline='') as csvfile:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows({'Id': int(r1), 'Prediction': int(r2)}
                         for r1, r2 in zip(ids, y_pred))

In [None]:
# Load the pos / neg / test tweet collections. The context manager closes the
# file handle, which a bare open() inside pickle.load() leaves dangling.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open('dumped_files/corrected_datasets_pos_neg_test.p', 'rb') as dataset_file:
    pos, neg, test = pickle.load(dataset_file)

In [None]:
# Load the token dictionary; the context manager closes the file handle.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open('dumped_files/final_tokens_dictionary.p', 'rb') as dict_file:
    final_dict = pickle.load(dict_file)

In [None]:
# Number of entries vs. number of distinct values in final_dict.
len(final_dict), len(set(final_dict.values()))

In [None]:
# Drop duplicate tweets while keeping first-seen order. list(set(...)) orders
# the tweets by string hash, which is randomized per interpreter run, so the
# data order (and everything derived from it) would not be reproducible.
pos = list(dict.fromkeys(pos))
neg = list(dict.fromkeys(neg))

In [None]:
# Single WordNet lemmatizer instance, reused for the pos, neg and test tweets below.
lemmatizer = WordNetLemmatizer()

In [None]:
# Tokenize and lemmatize every positive tweet (one token list per tweet),
# then count how often each token occurs across all of them.
pos_tokens = [
    [lemmatizer.lemmatize(word) for word in tokenize(tweet)]
    for tweet in pos
]
pos_counter = Counter(token for tweet_tokens in pos_tokens for token in tweet_tokens)

In [None]:
# Tokenize and lemmatize every negative tweet (one token list per tweet),
# then count how often each token occurs across all of them.
neg_tokens = [
    [lemmatizer.lemmatize(word) for word in tokenize(tweet)]
    for tweet in neg
]
neg_counter = Counter(token for tweet_tokens in neg_tokens for token in tweet_tokens)

In [None]:
# All (token, count) pairs of each class, ordered from most to least frequent.
pos_most = pos_counter.most_common()
neg_most = neg_counter.most_common()

### Dataframe

In [None]:
# token -> occurrence count, one mapping per class.
pos_dict = {word: freq for word, freq in pos_most}
neg_dict = {word: freq for word, freq in neg_most}

In [None]:
# Vocabulary seen in either class. Sorting gives the table a stable row order;
# iterating a set of strings yields a different order on every run.
all_words = sorted(set(pos_dict) | set(neg_dict))
# Per-class frequency of each word, 0 when the word never occurs in that class.
pos_frq = [pos_dict.get(w, 0) for w in all_words]
neg_frq = [neg_dict.get(w, 0) for w in all_words]

In [None]:
# One row per word with its frequency in positive and negative tweets.
df = pd.DataFrame(data={'word': all_words, 'pos': pos_frq, 'neg': neg_frq})

In [None]:
# Share (in percent, two decimals) of each word's occurrences that come from
# positive tweets, and the complementary negative share. Done column-wise:
# df.apply(..., axis=1) runs one Python call per vocabulary word.
df['pos_ratio'] = (100 * df['pos'] / (df['pos'] + df['neg'])).round(2)
df['neg_ratio'] = 100 - df['pos_ratio']

In [None]:
# Index the table by word. Guarded so that re-running the cell does not raise
# a KeyError once 'word' has already been moved into the index.
if 'word' in df.columns:
    df = df.set_index('word')
df.tail(20)

In [None]:
# abs_diff: distance between the positive and negative shares (0 = evenly split
# between the classes). total: overall number of occurrences.
# Column arithmetic replaces the per-row df.apply calls.
df['abs_diff'] = (df['pos_ratio'] - df['neg_ratio']).abs()
df['total'] = df['pos'] + df['neg']

In [None]:
# Most evenly split words first; among ties, the most frequent first.
df.sort_values(by=['abs_diff', 'total'], ascending=[True, False])

In [None]:
# Read the stop-word list (one word per line); the context manager closes the file.
# NOTE(review): a later cell reads '../data/stopwords.txt' instead — confirm which
# path is right for the directory this notebook runs from.
with open('data/stopwords.txt') as stopwords_file:
    stop_words = [line.rstrip('\n').lower() for line in stopwords_file]
# Extra tokens treated as stop words for this analysis.
stop_words = stop_words + ['user', 'url', 'rt']

In [None]:
# Copy the index into a regular column so it can be used in the filter below.
df['word_'] = df.index

In [None]:
# Rows of the frequency table whose word is a stop word. isin() does the
# membership test in one vectorized pass instead of a Python call per row.
stop_df = df.loc[df['word_'].isin(stop_words)]

In [None]:
# Stop words ordered from most evenly split between classes to most one-sided.
stop_df.sort_values(by='abs_diff', ascending=True)

In [None]:
# Stop words whose positive and negative shares differ by less than 20 points.
stop_df.loc[stop_df.abs_diff<20]

In [None]:
# Stop words seen fewer than 20 times in total, plus a few hand-picked tokens.
# NOTE(review): within this view del_words is only referenced by a
# commented-out line — confirm it is still needed.
del_words = list(stop_df.loc[stop_df.total<20].index) + ['user', 'url', 'rt', 'twitter', 'facebook']

In [None]:
# Keep only tweets that produced at least one token and, within those, drop
# tokens of one or two characters. A kept tweet can still end up empty.
pos_tokens = [
    [token for token in tweet_tokens if len(token) > 2]
    for tweet_tokens in pos_tokens
    if len(tweet_tokens) > 0
]

In [None]:
# Keep only tweets that produced at least one token and, within those, drop
# tokens of one or two characters. A kept tweet can still end up empty.
neg_tokens = [
    [token for token in tweet_tokens if len(token) > 2]
    for tweet_tokens in neg_tokens
    if len(tweet_tokens) > 0
]

In [15]:
# Tokenize and lemmatize the test tweets, keeping exactly one token list per tweet.
test_tokens = [
    [lemmatizer.lemmatize(word) for word in tokenize(tweet)]
    for tweet in test
]
# Disabled extra filtering, kept for reference:
#test_tokens = [[t for t in tokens if t not in del_words and len(t)>2 and t in model.wv.vocab] for tokens in test_tokens]

In [16]:
# A test tweet without tokens gets the placeholder token 'empty'.
test_tokens = [tokens if len(tokens) >= 1 else ['empty'] for tokens in test_tokens]

### Remove stop words

In [None]:
# Read the stop-word list (one word per line); the context manager closes the file.
# NOTE(review): an earlier cell reads 'data/stopwords.txt' and appends extra
# tokens — confirm which path and which list are intended here.
with open('../data/stopwords.txt') as stopwords_file:
    stop_words = [line.rstrip('\n').lower() for line in stopwords_file]

In [None]:
def check(tweet_list_tokens):
    """Return a new list in which every empty token list is replaced by ['empty']."""
    checked = []
    for tokens in tweet_list_tokens:
        if tokens != []:
            checked.append(tokens)
        else:
            checked.append(['empty'])
    return checked

def del_stopWords(tweet_list_tokens, stopwords=None):
    """
    Remove stop words from every tokenized tweet.

    Arguments: tweet_list_tokens (list of token lists, one per tweet)
               stopwords (optional iterable of words to drop; defaults to the
                          notebook-level `stop_words`)
    Returns a new list of token lists in the same order; a tweet made up only
    of stop words becomes an empty list.
    """
    # Build the set once: testing membership against the original list costs
    # O(len(stop_words)) for every single token.
    blocked = set(stop_words if stopwords is None else stopwords)
    return [[t for t in tokens if t not in blocked] for tokens in tweet_list_tokens]

In [None]:
# Strip stop words from the training tweets; tweets may become empty here.
pos_tokens = del_stopWords(pos_tokens)
neg_tokens = del_stopWords(neg_tokens)

In [None]:
# Strip stop words from the test tweets; tweets may become empty here.
test_tokens = del_stopWords(test_tokens)

In [None]:
# Replace tweets that lost all their tokens with ['empty'], then drop duplicate
# tweets. np.unique is the wrong tool here: the token lists have different
# lengths, so NumPy cannot build a regular array from them (recent versions
# raise), and if all lengths happened to match it would deduplicate individual
# tokens instead of tweets. Tuples are used as hashable keys; first-seen order
# is kept.
pos_tokens = [list(tokens) for tokens in dict.fromkeys(map(tuple, check(pos_tokens)))]
neg_tokens = [list(tokens) for tokens in dict.fromkeys(map(tuple, check(neg_tokens)))]
# Test tweets are not deduplicated: one token list per test tweet is kept.
test_tokens = list(check(test_tokens))

### Word2Vec

In [193]:
# Train word embeddings on all tokenized tweets (pos, neg and test together).
# min_count=1 keeps every token, including those seen only once.
# NOTE(review): `size` is the gensim 3.x keyword (gensim 4 renamed it to
# `vector_size`) — confirm the installed version before re-running.
model = gensim.models.Word2Vec(pos_tokens + neg_tokens + test_tokens, size=300, window=5, min_count=1, workers=4)

In [194]:
# Sanity check of the embeddings: the 20 nearest neighbours of "suck".
model.wv.most_similar("suck", topn=20)

[('sucked', 0.5670109987258911),
 ('skeptical', 0.5180200338363647),
 ('hate', 0.5033948421478271),
 ('eff', 0.4981948137283325),
 ('fuck', 0.44312357902526855),
 ('ugh', 0.42857789993286133),
 ('suckling', 0.42300838232040405),
 ('pissed', 0.41378408670425415),
 ('dick', 0.4098828136920929),
 ('grounded', 0.4055216312408447),
 ('shirty', 0.40396714210510254),
 ('hating', 0.40323224663734436),
 ('stinky', 0.39098405838012695),
 ('irritated', 0.3862936496734619),
 ('growling', 0.3821266293525696),
 ('lame', 0.3762311637401581),
 ('annoying', 0.3715893626213074),
 ('complaint', 0.3673654794692993),
 ('blah', 0.3663901090621948),
 ('kidding', 0.36580824851989746)]

In [198]:
# Persist the trained model; the context manager flushes and closes the file,
# which a bare open() inside pickle.dump() does not guarantee.
with open('word2vec_model_noStWords.p', 'wb') as model_file:
    pickle.dump(model, model_file)

In [199]:
# Reload the trained model; the context manager closes the file handle.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open('word2vec_model_noStWords.p', 'rb') as model_file:
    model = pickle.load(model_file)

### TF-IDF

In [200]:
test_tokens[0]

['sea',
 'doo',
 'pro',
 'sea',
 'scooter',
 'sport',
 'portable',
 'shadow',
 'sea',
 'scooter',
 'save',
 'air',
 'stay',
 'longer',
 'water']

In [201]:
# One space-joined string per tweet, in pos + neg + test order; this is the
# input the TF-IDF vectorizer is fitted on.
corpus = [' '.join(tokens) for tokens in pos_tokens + neg_tokens + test_tokens]

KeyboardInterrupt: 

In [99]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit TF-IDF on the whole corpus to obtain one idf weight per vocabulary term.
# NOTE(review): with default settings TfidfVectorizer lowercases and
# re-tokenizes each string, so its vocabulary may not match the notebook's
# own tokens exactly — confirm.
vectorizer = TfidfVectorizer(min_df=1)
# NOTE: the name X is reassigned to the stacked tweet vectors further down.
X = vectorizer.fit_transform(corpus)
idf = vectorizer.idf_

In [101]:
# Map each vocabulary term to its idf weight.
# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); use whichever the installed version provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
tf_idf_dict = dict(zip(feature_names, idf))

### Tweet2Vec

In [202]:
def tweet2vector(tweet_tokens, model, tf_idf_dict):
    """
    Embed a tokenized tweet as the idf-weighted average of its word vectors.

    Arguments: tweet_tokens (list of tokens of one tweet)
               model (trained Word2Vec model; vectors are read from model.wv)
               tf_idf_dict (dict mapping a token to its idf weight)
    Returns a 1-D array of length model.vector_size. Tokens missing from the
    embedding vocabulary or from tf_idf_dict contribute nothing to the sum but
    still count in the denominator. An empty tweet yields the zero vector.
    """
    word_vectors = model.wv
    # The size comes from the model itself rather than from looking up a
    # hard-coded word ('happy') that is not guaranteed to be in the vocabulary.
    # float32 keeps the (large) feature matrix at single precision.
    total = np.zeros(model.vector_size, dtype=np.float32)
    if len(tweet_tokens) == 0:
        return total  # avoid dividing by zero
    for word in tweet_tokens:
        # model.wv[word] replaces the deprecated model[word] lookup.
        if word in word_vectors and word in tf_idf_dict:
            total += tf_idf_dict[word] * word_vectors[word]
    return total / len(tweet_tokens)

In [203]:
# One row per tweet: its tweet2vector embedding, built separately for the
# positive, negative and test tweets.
pos_vec = np.asarray([tweet2vector(tweet, model, tf_idf_dict) for tweet in pos_tokens])
neg_vec = np.asarray([tweet2vector(tweet, model, tf_idf_dict) for tweet in neg_tokens])
test_vec = np.asarray([tweet2vector(tweet, model, tf_idf_dict) for tweet in test_tokens])

In [204]:
# Stack the labelled vectors into one feature matrix: positive rows first,
# negative rows after them.
X = np.vstack((pos_vec, neg_vec))

In [205]:
# Labels aligned with the rows of X: +1 for each positive tweet, -1 for each negative one.
y = [1] * len(pos_vec) + [-1] * len(neg_vec)

In [None]:
# Cache the feature matrix; the context manager flushes and closes the file.
# NOTE(review): the file is called 'Xy.p' but only X is stored — confirm
# whether y was meant to be saved alongside it.
with open('Xy.p', 'wb') as features_file:
    pickle.dump([X], features_file)

### Split data into train and test sets

In [208]:
import numpy as np
from sklearn.model_selection import train_test_split

In [217]:
# Hold out 5% of the labelled tweets for evaluation; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=0)

MemoryError: 

In [None]:
# Cache the split; the context manager flushes and closes the file.
with open('train_test_splited_word2Vec0.05.p', 'wb') as split_file:
    pickle.dump([X_train, X_test, y_train, y_test], split_file)

In [None]:
def accuracy(y1, y2):
    """Percentage of positions where two label sequences agree.

    Each disagreement is counted as |y1 - y2| / 2, i.e. exactly one when the
    labels are -1 and +1.
    """
    first = np.asarray(y1)
    second = np.asarray(y2)
    disagreements = np.sum(np.abs(first - second) / 2)
    return 100 - disagreements * 100 / len(y1)

In [None]:
# Quick check: two of the three labels agree.
accuracy([-1,-1,1], [1, -1, 1])

### Scaling Data

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit the scaler on X, the full labelled feature matrix (test_vec is not
# included in the fit).
scaler.fit(X)

### Test Models

In [None]:
# Reload the cached split. The training labels go into y_train, not y:
# unpacking them into y shrinks y to the training subset while X keeps every
# row, so the later clf.fit(X, y) would fail on mismatched lengths.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open('train_test_splited_word2Vec0.05.p', 'rb') as split_file:
    X_train, X_test, y_train, y_test = pickle.load(split_file)

In [None]:
# Load a previously trained classifier; the context manager closes the file.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open('../Experiments/models/mlp.1512236972.0826764.model', 'rb') as mlp_file:
    mlp = pickle.load(mlp_file)

In [None]:
# Predictions of the loaded classifier on the held-out split.
a = mlp.predict(X_test)

In [None]:
# Number of held-out tweets predicted as positive (label 1).
sum(1 for label in a if label == 1)

### Learning

In [None]:
from sklearn.neural_network import MLPClassifier
# Two hidden layers of 30 units each, L-BFGS solver; random_state fixes the
# weight initialisation.
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(30, 30), random_state=1)

In [None]:
# Train on standardized features: the test vectors are passed through
# `scaler` before prediction, so the training matrix must go through the same
# transform or the classifier sees two different feature scales.
clf.fit(scaler.transform(X), y)

In [None]:
# Standardize the submission vectors with the scaler fitted above.
# NOTE: this reuses the name X_test, replacing the held-out split of the same name.
X_test = scaler.transform(test_vec)
len(X_test)

In [None]:
# Cache the unscaled training features, labels and test vectors; the context
# manager flushes and closes the file.
with open('train_test_word2Vec.p', 'wb') as dataset_file:
    pickle.dump([X, y, test_vec], dataset_file)

In [None]:
# Predicted labels for the scaled test vectors.
predictions = clf.predict(X_test)

In [None]:
# Write the submission file; ids are 1-based positions of the predictions.
create_csv_submission(np.arange(len(predictions))+1, predictions, 'submissions/prediction')