## Goals

## Libraries

In [1]:
import pandas as pd
import numpy as np
import spacy
import textblob
from gensim.models import word2vec
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels import robust
from string import punctuation

import warnings
warnings.filterwarnings('ignore')

from sklearn.cluster import SpectralClustering
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA


# Neural Network
import keras
from keras.optimizers import RMSprop
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.utils import np_utils

Using TensorFlow backend.


# 1. Import and Add Basic Features

In [2]:
review = pd.read_csv('boardgame-comments-english.csv')
review.columns = 'reviewer_id', 'game_id', 'rating', 'comment'
review.head(2)

Unnamed: 0,reviewer_id,game_id,rating,comment
0,172640,24068,7.0,Good: Unique take on the hidden role games. T...
1,86674,24068,7.0,A neat social deduction game with multiple tea...


In [3]:
comments = review.comment

In [4]:
# RATINGS ADJUSTMENT: ceiling >= .5 [or] floor < .5
review['rating'] = review.rating.apply(round)

In [5]:
# Check for null values
review.isnull().any()

reviewer_id    False
game_id        False
rating         False
comment        False
dtype: bool

In [6]:
# Functions for finding percentage frequency (capital letters/punctuation)
def per_check(string_value, total):
    percentage = len(string_value)
    if percentage != 0:
        percentage = float(total / percentage) * 100
    else:
        percentage = 0
    return percentage

def punc_count(string_value):
    count = 0
    for c in string_value:
        if c in punctuation:
            count+= 1
    return per_check(string_value, count)

def caplet_count(string_value):
    count = 0
    for c in string_value:
        if c.isupper():
            count+= 1
    return per_check(string_value, count)      

review['c_len'] = review.comment.apply(len)
review['punc_count'] = review.comment.apply(punc_count)
review['caplet_count'] = review.comment.apply(caplet_count)

In [7]:
review.head(2)

Unnamed: 0,reviewer_id,game_id,rating,comment,c_len,punc_count,caplet_count
0,172640,24068,7,Good: Unique take on the hidden role games. T...,433,2.078522,1.616628
1,86674,24068,7,A neat social deduction game with multiple tea...,99,1.010101,1.010101


### Show Reviewer_ID and Game_ID Counts (Matthew's Suggestion)

In [8]:
userreview = review.groupby('reviewer_id')
# dflist = []
# for r in userreview:
#     dflist.append(r[1])  
review.reviewer_id.value_counts()[0:3]

187094    357
114589    347
35516     337
Name: reviewer_id, dtype: int64

In [9]:
gamereview = review.groupby('game_id')
# dflist = []
# for r in gamereview:
#     dflist.append(r[1])  
review.game_id.value_counts()[0:3]

13       11456
822      11415
30549     9420
Name: game_id, dtype: int64

In [10]:
del review['game_id']
del review['reviewer_id']

# 2. Natural Language Processing Features:

## _Spacy_

In [11]:
nlp = spacy.load('en')

In [12]:
%%time
docs = []
tokens = []
lemma = []
pos = []
deps = []
ents = []
sentences = []

def insert_null(l):
    return [(w if w else '0') for w in l]

# Create doc and len features for processing
for doc in nlp.pipe(review['comment'].astype('unicode').values,batch_size = 10, n_threads=4):
    if doc.is_parsed:
        docs.append(doc)
    else:
        # Add blanks if parse fails
        docs.append(None)
        
    for sentence in doc.sents:
        sentence = [
            token.lemma_.lower()
            for token in sentence
            if not token.is_stop
            and not token.is_punct
        ]
        
    sentences.append(sentence)

KeyboardInterrupt: 

In [13]:
# apply features
review['doc'] = docs
review['w_len'] = review.doc.apply(len)
review['tokens'] = review.doc.apply(lambda doc: insert_null([tok.text for tok in doc]))
review['lemma'] = review.doc.apply(lambda doc: insert_null([tok.lemma_ for tok in doc]))
review['pos'] = review.doc.apply(lambda doc: insert_null([tok.pos_ for tok in doc]))
review['deps'] = review.doc.apply(lambda doc: insert_null([tok.dep_ for tok in doc]))
review['ents'] = review.doc.apply(lambda doc: insert_null([tok.ent_type_ for tok in doc]))

AttributeError: 'DataFrame' object has no attribute 'doc'

In [None]:
review.c_len.max()
print(review.comment[review.c_len == 7977].iloc[0][0:300],end=''); print('...')

## _TextBlob_

In [None]:
%%time
blobs = review.comment.apply(lambda val: textblob.TextBlob(val))

In [None]:
review['sent_pol'] = blobs.apply(lambda val: val.sentiment[0])
review['sent_subj'] = blobs.apply(lambda val: val.sentiment[1])

_Pol = Sentiment Polarity (positive or negative word choice)_ <br>
_Subj = Sentiment Subjectivity (objective or subjective word choice)_

In [None]:
# DataFrame With New Spacy and TextBlob Features
review.tail(1)

# 3. Visuals

In [None]:
def MEDIAN_reject_outliers(data, m=3):
    data = data[abs(data - np.median(data)) < m*robust.mad(data)]
    return data[~np.isnan(data)].sort_values()


def MEAN_reject_outliers(data, m=3):
    data = data[abs(data - np.mean(data)) <= m*np.std(data)]
    return data[~np.isnan(data)].sort_values()

In [None]:
# Ratings
plt.figure(figsize=(9.2,4))
plt.hist(review.rating,bins=10)
plt.title('Rating Distribution')
plt.show()

plt.figure(figsize=(9,7))

plt.subplot(221)
sns.distplot(MEDIAN_reject_outliers(review.c_len))
# plt.title('Average Char Length');

plt.subplot(222)
sns.distplot(MEDIAN_reject_outliers(review.punc_count))
plt.xlabel('Rating');
plt.ylabel('Punc Percentatage');


plt.subplot(223)
plt.scatter(review.rating,review.c_len)
plt.xlabel('Rating')
plt.ylabel('Comment Length')
xlist = []
ylist = []
m,b = np.polyfit(review.rating, review.c_len, 1)
for i in range(0,11):
    ylist.append(i*m + b)
    xlist.append(i)
plt.plot(xlist,ylist,color='r')
# plt.title('Ratings by Review Len')


plt.subplot(224)
plt.scatter(review.rating,review.punc_count);
xlist = []
ylist = []
m,b = np.polyfit(review.rating, review.punc_count, 1)
for i in range(0,11):
    ylist.append(i*m + b)
    xlist.append(i)
plt.plot(xlist,ylist,color='r')
# plt.title('Ratings by Punctuation Percentage');
plt.xlabel('Rating');
plt.ylabel('Punct Percentage');

# 4. Models

In [None]:
features = ['c_len','caplet_count','punc_count','rating','sent_pol','sent_subj']

In [None]:
%%time
all_data = review[features].dropna()
y = review['rating'].values.reshape(-1, 1).ravel()
X = review[review.columns[features]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### _Cluster Model_

_Tries to find clusters in the data but doesnt predict anything_ (Not currently relevant)

In [None]:
# Principal Components Analysis PCA (reduces features to 2 for visualization)
pca = PCA(n_components=2)
pca.fit(X_train)

X_train_pca = pca.transform(X_train)

# We know we're looking for three clusters.
n_clusters=10

# Declare and fit the model.
sc = SpectralClustering(n_clusters=n_clusters)
sc.fit(X_train_pca)

#Predicted clusters.
predict=sc.fit_predict(X_train_pca)

In [None]:
X_train_pca_df = pd.DataFrame(X_train_pca)
X_train_pca_df.columns = ['comp_1','comp_2']
X_train_pca_df['hue'] = predict

plt.plot([i for i in range(1,11)],list(X_train_pca_df.hue.value_counts()),color ='b');
plt.plot([i for i in range(1,11)],list(review.rating.value_counts()),color ='g');
plt.legend(['Predicted Cluster','Actual Ratings']);
plt.title('Cluster Prediction of Counts by Ratings')
plt.show()

In [None]:
plt.figure(figsize=(22,22));
sns.lmplot(x='comp_1',y='comp_2', hue='hue',data=X_train_pca_df,fit_reg=False)
plt.xlim(-250,1000)
plt.ylim(-5,15)
plt.show()

### _Logistic Regression_

_Operates on probabilities_

In [None]:
lr = LogisticRegression()
lr.fit(X_train,y_train)
print('Check for overfitting:')
print(lr.score(X_train,y_train)*100)
print('')
# Print Model Score Estimation on Same Data
print('Percentage of ratings guessed correctly:')
print(lr.score(X_test,y_test)*100)

# 5. Word Embedding

In [None]:
import gensim
from gensim.models import word2vec

word_vec = word2vec.Word2Vec(
    sentences,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=3,  # Minimum word count threshold.
    window=6,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

# List of words in model.
vocab = word_vec.wv.vocab.keys()

In [None]:
word_vec.most_similar(positive=['gamer', 'player'], negative=['person'], topn=1)

In [None]:
w1 = 'board'
print(word_vec.wv.most_similar(positive=w1,topn=3))

w1 = 'hard'
print(word_vec.wv.most_similar(positive=w1,topn=3))

In [None]:
max_words = 200
def vectorize(val):
    empty_spaces = max_words - len(val)
    l = []
    for w in val[0:(max_words+1)]:
        new_w = str(w)
        if w in vocab:
            l.append(word_vec[w])
        else:
            l.append([1 for i in range(0,300)])
            
    if empty_spaces > 0:
        for e in range(0,empty_spaces):
            l.append([0 for i in range(0,300)])
    return l

In [None]:
review['vectors'] = review.lemma.apply(vectorize)

In [None]:
vectorlist = review.vectors
vectorlist_merged = [list(i[0]) for i in vectorlist]

In [None]:
vectordf = pd.DataFrame(vectorlist_merged)

In [None]:
%%time
y = review['rating'].astype(int).ravel()
X = vectordf.copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
lr = LogisticRegression()
lr.fit(X_train,y_train)

In [None]:
print('Check for overfitting:')
print(lr.score(X_train,y_train)*100)
print('')
# Print Model Score Estimation on Same Data
print('Percentage of ratings guessed correctly:')
print(lr.score(X_test,y_test)*100)

In [None]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier()
mlp.fit(X_train,y_train)

In [None]:
mlp.score(X_test,y_test)

__Tutorial for Visualization of Word Vectors__

In [None]:
X = word_vec[word_vec.wv.vocab]
graph_pca = PCA(n_components=2)
result = graph_pca.fit_transform(X)
# create a scatter plot of the projection
plt.figure(figsize=(20,20))
pyplot.scatter(result[:, 0], result[:, 1])
words = list(word_vec.wv.vocab)
for i, word in enumerate(words):
    plt.annotate(word, xy=(result[i, 0], result[i, 1]))
plt.ylim(-0.006,0.008)
plt.xlim(-.02,.04)
plt.show()

In [None]:
%%time
y = review['rating'].astype(int).ravel()
X = vectordf.copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
y_test = y_test.ravel()
y_train = y_train.ravel()

y_train = keras.utils.to_categorical(y_train)
y_test = keras.utils.to_categorical(y_test)

In [None]:
model = Sequential()
model.add(Dense(1024, activation='relu', input_shape=(300,)))
model.add(Dropout(0.1))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(11, activation='softmax'))

In [None]:
batch_size = 6
epochs = 100

In [None]:
model.compile(loss='categorical_crossentropy',
              optimizer=RMSprop(),
              metrics=['accuracy'])

In [None]:
history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(X_test, y_test))

In [None]:
score = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])