In [138]:
import pandas as pd
import numpy as np
import spacy
import textblob
from gensim.models import word2vec
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels import robust
from string import punctuation
import gensim
from gensim.models import word2vec

import warnings
warnings.filterwarnings('ignore')

from sklearn.cluster import SpectralClustering
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Neural Network
from sklearn.neural_network import MLPClassifier

import keras
from keras.optimizers import RMSprop,SGD
from keras.layers import Conv1D,Conv2D, MaxPooling2D
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.utils import np_utils
from keras.layers import LSTM, Embedding

In [2]:
from dask.distributed import Client, progress
import dask.dataframe as dd
import dask.array as da

In [3]:
import os
# for mac only: frog,blow,funk,glass,tink,submarine,purr,sosumi
def beep(audio): 
    os.system('afplay /System/Library/Sounds/' + audio +'.aiff')

## Create Dask Client

In [4]:
from dask.distributed import Client, progress
client = Client(n_workers=4, threads_per_worker=1, memory_limit='2GB')
client

0,1
Client  Scheduler: tcp://127.0.0.1:63745  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 8.00 GB


## Word Embedding - Word2Vec

In [5]:
review = dd.read_csv('boardgame-comments-english.csv').sample(frac=.1,random_state=42)
review = client.persist(review)

In [6]:
review.columns = 'reviewer_id', 'game_id', 'rating', 'comment'

# RATINGS ADJUSTMENT: ceiling >= .5 [or] floor < .5
ratings = da.array(review.rating.round()).compute()

print('Total Comments: {}'.format(len(review.comment)))

Total Comments: 84165


In [7]:
%%time
sentences = da.array(review.comment.apply(lambda val: [b.lower() for b in textblob.TextBlob(val).words])).compute()
beep('ping')

CPU times: user 1.39 s, sys: 239 ms, total: 1.63 s
Wall time: 25 s


In [8]:
%%time
load_model = False

if load_model:
    # # load model
    word_vec = word2vec.Word2Vec.load('full_word2vec_blob.bin')
    vec_size = word_vec.layer1_size
else: 
    vec_size = 50
    word_vec = word2vec.Word2Vec(
        sentences,
        workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
        min_count=5,  # Minimum word count threshold.
        window=6,      # Number of words around target word to consider.
        sg=0,          # Use CBOW because our corpus is small.
        sample=1e-3 ,  # Penalize frequent words.
        size=vec_size,      # Word vector length.
        hs=1           # Use hierarchical softmax.
    )
    
    # save model
    word_vec.save('full_word2vec_blob.bin')

# List of words in model.
vocab = word_vec.wv.vocab.keys()
beep('ping')

CPU times: user 53.8 s, sys: 307 ms, total: 54.1 s
Wall time: 16.3 s


In [9]:
%%time
vec_new = np.array([.5 for i in range(0,vec_size)])
vectors = da.array(map(lambda val: [word_vec[w] if w in vocab else vec_new for w in val], sentences))
beep('ping')

CPU times: user 53.5 ms, sys: 7.77 ms, total: 61.3 ms
Wall time: 837 ms


### TBD

In [10]:
w1,w2,w3 = 'gamer','player','game'
print(word_vec.most_similar(positive=[w1, w2], negative=[w3], topn=1))

w1 = 'board'
print(word_vec.wv.most_similar(positive=w1,topn=3))

w1 = 'hard'
print(word_vec.wv.most_similar(positive=w1,topn=3))

[('players', 0.5493175387382507)]
[('map', 0.6587929725646973), ('gameboard', 0.6442074179649353), ('website', 0.6419546604156494)]
[('difficult', 0.8738832473754883), ('easy', 0.765060544013977), ('tough', 0.6980513334274292)]


# 6. Comprehensive Neural Network

In [11]:
%%time
pad, max_words = np.array([0 for i in range(0,vec_size)]), 30

def manual_pad(val):
    empty = max_words-len(val)
    for i in range(0,empty):
        val.append(pad)
    
    return [i for i in val[0:max_words]]

vectors_padding = list(map(manual_pad,vectors))
beep('ping')

CPU times: user 15.9 s, sys: 278 ms, total: 16.2 s
Wall time: 16.7 s


In [12]:
client.close()

Future exception was never retrieved
future: <Future finished exception=CommClosedError('in <closed TCP>: Stream is closed',)>
Traceback (most recent call last):
  File "/anaconda3/lib/python3.6/site-packages/distributed/comm/tcp.py", line 179, in read
    n_frames = yield stream.read_bytes(8)
  File "/anaconda3/lib/python3.6/site-packages/tornado/gen.py", line 1099, in run
    value = future.result()
tornado.iostream.StreamClosedError: Stream is closed

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/anaconda3/lib/python3.6/site-packages/tornado/gen.py", line 1107, in run
    yielded = self.gen.throw(*exc_info)
  File "/anaconda3/lib/python3.6/site-packages/distributed/comm/tcp.py", line 200, in read
    convert_stream_closed_error(self, e)
  File "/anaconda3/lib/python3.6/site-packages/distributed/comm/tcp.py", line 128, in convert_stream_closed_error
    raise CommClosedError("in %s: %s" % (obj, exc))
distributed.comm.

In [13]:
%%time
y = np.array(ratings).astype(int).ravel()
X = np.array(vectors_padding)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

CPU times: user 1.47 s, sys: 654 ms, total: 2.12 s
Wall time: 2.2 s


In [14]:
XtrSize,XtrMax,XtrVector = X_train.shape
XteSize,XteMax,XteVector = X_test.shape

#### _A - Logistic Regression w/ Vectors_

#### _B - MLP Neural NN w/ Vectors_

#### _C - Keras Sequential NN_

In [15]:
y_test = y_test.ravel()
y_train = y_train.ravel()

y_train = keras.utils.to_categorical(y_train)
y_test = keras.utils.to_categorical(y_test)

In [70]:
hidden_size = 500
batch_size = 32
epochs = 2

In [71]:
X.shape

(84165, 30, 50)

In [155]:
model = Sequential()
model.add(Dense(512, activation='relu', input_shape=X.shape[1:]))
model.add(Dropout(0.2))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.2))
model.add(Conv1D(1,kernel_size=(1)))
model.add(Dense(11, activation='softmax'))

In [151]:
model.compile(loss='categorical_crossentropy',
              optimizer=SGD(),
              metrics=['accuracy'])

In [152]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_82 (Dense)             (None, 30, 512)           26112     
_________________________________________________________________
dropout_63 (Dropout)         (None, 30, 512)           0         
_________________________________________________________________
dense_83 (Dense)             (None, 30, 512)           262656    
_________________________________________________________________
dropout_64 (Dropout)         (None, 30, 512)           0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 30, 1)             513       
_________________________________________________________________
dense_84 (Dense)             (None, 30, 11)            22        
Total params: 289,303
Trainable params: 289,303
Non-trainable params: 0
_________________________________________________________________


In [148]:
y_test.shape

(27775, 11)

In [149]:
history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=2,
                    validation_data=(X_test, y_test))

beep('ping')

ValueError: Error when checking target: expected dense_81 to have 3 dimensions, but got array with shape (56390, 11)

In [114]:
score = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

ValueError: Error when checking target: expected dense_38 to have 3 dimensions, but got array with shape (27775, 11)

## Word Similarity Visualization