In [53]:
import pandas as pd
import numpy as np
import spacy
import textblob
from gensim.models import word2vec
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels import robust
from string import punctuation
import gensim
from gensim.models import word2vec

import warnings
warnings.filterwarnings('ignore')

from sklearn.cluster import SpectralClustering
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Neural Network
from sklearn.neural_network import MLPClassifier

import keras
from keras.optimizers import RMSprop
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.utils import np_utils
from keras.layers import LSTM

In [2]:
from dask.distributed import Client, progress
import dask.dataframe as dd
import dask.array as da

In [3]:
import os
# for mac only: frog,blow,funk,glass,tink,submarine,purr,sosumi
def beep(audio): 
    os.system('afplay /System/Library/Sounds/' + audio +'.aiff')
    os.system('afplay /System/Library/Sounds/' + audio +'.aiff')

## Create Dask Client

In [4]:
from dask.distributed import Client, progress
client = Client(n_workers=4, threads_per_worker=1, memory_limit='2GB')
client

0,1
Client  Scheduler: tcp://127.0.0.1:62654  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 8.00 GB


## Word Embedding - Word2Vec

In [5]:
review = dd.read_csv('boardgame-comments-english.csv').sample(frac=.1,random_state=42)
review = client.persist(review)

In [6]:
review.columns = 'reviewer_id', 'game_id', 'rating', 'comment'

# RATINGS ADJUSTMENT: ceiling >= .5 [or] floor < .5
ratings = da.array(review.rating.round()).compute()

print('Total Comments: {}'.format(len(review.comment)))

Total Comments: 84165


In [7]:
%%time
sentences = da.array(review.comment.apply(lambda val: [b.lower() for b in textblob.TextBlob(val).words])).compute()
beep('ping')

CPU times: user 1.44 s, sys: 247 ms, total: 1.68 s
Wall time: 26 s


In [8]:
%%time
load_model = False

if load_model:
    # # load model
    word_vec = word2vec.Word2Vec.load('full_word2vec_blob.bin')
    vec_size = word_vec.layer1_size
else: 
    vec_size = 50
    word_vec = word2vec.Word2Vec(
        sentences,
        workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
        min_count=5,  # Minimum word count threshold.
        window=6,      # Number of words around target word to consider.
        sg=0,          # Use CBOW because our corpus is small.
        sample=1e-3 ,  # Penalize frequent words.
        size=vec_size,      # Word vector length.
        hs=1           # Use hierarchical softmax.
    )
    
    # save model
    word_vec.save('full_word2vec_blob.bin')

# List of words in model.
vocab = word_vec.wv.vocab.keys()
beep('ping')

CPU times: user 52.8 s, sys: 326 ms, total: 53.1 s
Wall time: 17.1 s


In [11]:
%%time
vec_new = np.array([.5 for i in range(0,vec_size)])
vectors = da.array(map(lambda val: [word_vec[w] if w in vocab else vec_new for w in val], sentences))
beep('ping')

CPU times: user 97.2 ms, sys: 14.6 ms, total: 112 ms
Wall time: 1.67 s


### TBD

In [45]:
w1,w2,w3 = 'gamer','player','game'
print(word_vec.most_similar(positive=[w1, w2], negative=[w3], topn=1))

w1 = 'board'
print(word_vec.wv.most_similar(positive=w1,topn=3))

w1 = 'hard'
print(word_vec.wv.most_similar(positive=w1,topn=3))

[('1-1', 0.6517734527587891)]
[('map', 0.6931158900260925), ('intent', 0.6568892598152161), ('gameboard', 0.6398290395736694)]
[('difficult', 0.8791060447692871), ('easy', 0.7894079089164734), ('tough', 0.720523476600647)]


# 6. Comprehensive Neural Network

In [12]:
%%time
pad, max_words = np.array([0 for i in range(0,vec_size)]), 30

def manual_pad(val):
    empty = max_words-len(val)
    for i in range(0,empty):
        val.append(pad)
    
    return [i for i in val[0:max_words]]

vectors_padding = list(map(manual_pad,vectors))
beep('ping')

CPU times: user 15.9 s, sys: 303 ms, total: 16.2 s
Wall time: 17.8 s


In [13]:
%%time
y = np.array(ratings).astype(int).ravel()
X = pd.DataFrame([list(i[0]) for i in vectors_padding])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

CPU times: user 1.53 s, sys: 139 ms, total: 1.66 s
Wall time: 1.67 s


#### _A - Logistic Regression w/ Vectors_

In [14]:
lr = LogisticRegression()
lr.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
print('Check for overfitting:')
print(lr.score(X_train,y_train)*100)
print('')
# Print Model Score Estimation on Same Data
print('Percentage of ratings guessed correctly:')
print(lr.score(X_test,y_test)*100)

Check for overfitting:
30.205710232310697

Percentage of ratings guessed correctly:
29.836183618361833


#### _B - MLP Neural NN w/ Vectors_

In [16]:
mlp = MLPClassifier()
mlp.fit(X_train,y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [17]:
mlp.score(X_test,y_test)

0.29447344734473446

#### _C - Keras Sequential NN_

In [18]:
y_test = y_test.ravel()
y_train = y_train.ravel()

y_train = keras.utils.to_categorical(y_train)
y_test = keras.utils.to_categorical(y_test)

84165

In [None]:
model = Sequential()
model.add(LSTM(512, input_shape=(vec_size,max_words)))
model.add(Dropout(0.2))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(11, activation='softmax'))

In [None]:
batch_size = 10
epochs = 2

In [None]:
model.compile(loss='categorical_crossentropy',
              optimizer=RMSprop(),
              metrics=['accuracy'])

In [None]:
history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(X_test, y_test))

beep('ping')

In [49]:
score = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 1.943898053109163
Test accuracy: 0.29274527451350385


## Word Similarity Visualization