In [1]:
import pandas as pd
import numpy as np
import bcolz
import pickle
import nltk
import re
import xgboost
import m2cgen as m2c
import json

## Import stop words

In [2]:
STOP_WORDS = pd.read_csv("stopwords.csv", header=0)
STOP_WORDS = list(STOP_WORDS['STOP_WORD'])

## Import the [Sentiment140](http://help.sentiment140.com/for-students/) labelled dataset to train our classifier

In [3]:
df = pd.read_csv("trainingandtestdata/training.1600000.processed.noemoticon.csv", encoding="latin-1", header=None)

## Get DataFrame with only 2 columns: Text and Sentiment

In [4]:
tweet_texts_pd = df[[5]]
tweet_texts_pd['Sentiment'] = np.where(df[[0]] == 4, "1", "0")
tweet_texts_pd.rename(columns = {5: "Text"}, inplace=True)
tweet_texts_pd

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


Unnamed: 0,Text,Sentiment
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,is upset that he can't update his Facebook by ...,0
2,@Kenichan I dived many times for the ball. Man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0
...,...,...
1599995,Just woke up. Having no school is the best fee...,1
1599996,TheWDB.com - Very cool to hear old Walt interv...,1
1599997,Are you ready for your MoJo Makeover? Ask me f...,1
1599998,Happy 38th Birthday to my boo of alll time!!! ...,1


## For simplicity take n positive and n negative tweets

In [5]:
n = 5000
tweet_texts_pd = pd.concat([tweet_texts_pd[tweet_texts_pd['Sentiment'] == "1"][:n], tweet_texts_pd[tweet_texts_pd['Sentiment'] == "0"][:n]])
tweet_texts_pd.reset_index(inplace=True)
tweet_texts_pd

Unnamed: 0,index,Text,Sentiment
0,800000,I LOVE @Health4UandPets u guys r the best!!,1
1,800001,im meeting up with one of my besties tonight! ...,1
2,800002,"@DaRealSunisaKim Thanks for the Twitter add, S...",1
3,800003,Being sick can be really cheap when it hurts t...,1
4,800004,@LovesBrooklyn2 he has that effect on everyone,1
...,...,...,...
9995,4995,long day today,0
9996,4996,a friend broke his promises..,0
9997,4997,@gjarnling I am fine thanks - tired,0
9998,4998,trying to keep my eyes open..damn baking,0


## Unpack and preprcoess glove word vectors (no need to run this)

In [6]:
glove_path = "glove"

In [23]:
words = []
idx = 0
word2idx = {}
vectors = bcolz.carray(np.zeros(1), rootdir=f'{glove_path}/6B.50.dat', mode='w')

with open(f'{glove_path}/glove.6B.50d.txt', 'rb') as f:
    for l in f:
        line = l.decode().split()
        word = line[0]
        words.append(word)
        word2idx[word] = idx
        idx += 1
        vect = np.array(line[1:]).astype(np.float)
        vectors.append(vect)
    
vectors = bcolz.carray(vectors[1:].reshape((400000, 50)), rootdir=f'{glove_path}/6B.50.dat', mode='w')
vectors.flush()
pickle.dump(words, open(f'{glove_path}/6B.50_words.pkl', 'wb'))
pickle.dump(word2idx, open(f'{glove_path}/6B.50_idx.pkl', 'wb'))

## Use [glove](https://nlp.stanford.edu/projects/glove/) as a pretrained word vectors

In [7]:
vectors = bcolz.open(f'{glove_path}/6B.50.dat')[:]
words = pickle.load(open(f'{glove_path}/6B.50_words.pkl', 'rb'))
word2idx = pickle.load(open(f'{glove_path}/6B.50_idx.pkl', 'rb'))

glove = {w: vectors[word2idx[w]] for w in words}

## Define a tokenizer to transform tweet texts into sequences of words characterizing them

In [8]:
def tokenize(s):
    series = pd.Series(re.split("\W", s)).str.lower()
    return series[(series != "") & (series.str.len() > 2) & ~(series.isin(STOP_WORDS))].tolist()

## Tokenize tweet texts

In [9]:
words_pd = tweet_texts_pd["Text"].apply(tokenize)

## Transform tweets into vectors

In [10]:
words_vectorized = []
for l in words_pd:
    arr = []
    for w in l:
        if w in glove:
            arr.append(glove[w])
    if not arr:
        res = np.zeros((50))
    else:
        res = np.mean(np.asarray(arr), axis=0)
    words_vectorized.append(res)
words_vectorized = np.asarray(words_vectorized)
words_vectorized.shape

(10000, 50)

## Train xgboost classifier

In [11]:
clf = xgboost.XGBClassifier(n_estimators=5, random_state=7, max_depth=5, num_leaves=5)
clf.fit(words_vectorized, tweet_texts_pd["Sentiment"])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=5, n_jobs=1,
              nthread=None, num_leaves=5, objective='binary:logistic',
              random_state=7, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=None, subsample=1, verbosity=1)

## Export classifier using [m2cgen](https://github.com/BayesWitnesses/m2cgen) library

In [12]:
model = m2c.export_to_java(clf, package_name="realtime_events.ml", class_name="SentimentPredictor")
with open('Model.java', 'w') as f:
    f.write(model)

## Export word2vec into json to use in flink pipeline

In [13]:
def jsonify_values(d):
    res = {}
    for k, v in d.items():
        res[k] = v.tolist()
    return res
json.dump(jsonify_values(glove), open("word2vec.json", "w"))