## Libraries

In [15]:
import pandas as pd
from collections import Counter
import re
import random
import numpy as np
from string import punctuation as punct
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings('ignore')

from sklearn.cluster import SpectralClustering
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import v_measure_score

# for mac only: frog,blow,funk,glass,tink,submarine,purr,sosumi
def beep(audio): 
    os.system('afplay /System/Library/Sounds/' + audio +'.aiff')

# 1. Import and Add Basic Features

In [3]:
boardgamereviews = pd.read_csv('../data/boardgame/boardgame-comments-english.csv')
boardgamereviews.columns = 'reviewer_id', 'game_id', 'rating', 'comment'
unique_id = boardgamereviews.reviewer_id.unique()
sample_id = random.sample(unique_id.tolist(),2000)
review = boardgamereviews[boardgamereviews['reviewer_id'].isin(sample_id)]
review['rating_normed'] = review.rating - review['rating'].groupby(review['reviewer_id']).transform('mean')
review['rating_normed']= (review['rating_normed']-review['rating_normed'].min())/(review['rating_normed'].max()-review['rating_normed'].min())
# review['estimate'] = review['rating_normed'] + review['rating'].groupby(review['reviewer_id']).transform('mean')

# 2. Tokenize

In [33]:
%%time
only_chars = r'[^\w\s]'
review['tokens'] = review.comment.apply(lambda val: re.sub(only_chars,'',val).lower().split(' '))

CPU times: user 460 ms, sys: 36 ms, total: 496 ms
Wall time: 501 ms


In [93]:
def find_vocab_set(series):
    vocab = []
    for s in series.tolist():
        vocab+= s
    vocab = Counter(vocab)
    del vocab['']
    for w in stopwords.words('english'):
        if vocab[w]:
            del vocab[w]
    return Counter(vocab)

vocab_ratings = find_vocab_set(review.tokens)
values=[]

for i in range(1,11):
    val = find_vocab_set(review.tokens[(review.rating_normed < i/10) & review.rating_normed >=((i-1)/10)])
    values.append(val)

Counter({'love': 2084,
         'builder': 117,
         'breviewb': 1,
         'thread1396177thread': 1,
         'great': 4472,
         '4': 1217,
         'players': 4372,
         'fun': 6053,
         'little': 2050,
         'engine': 224,
         'building': 811,
         'game': 29289,
         'even': 1622,
         'better': 2057,
         'randomness': 251,
         'harbor': 15,
         'expansion': 1331,
         'like': 6062,
         'one': 5286,
         'lot': 2296,
         'im': 1777,
         'sucker': 20,
         'motor': 3,
         'racing': 83,
         'games': 4728,
         'tons': 115,
         'theme': 1917,
         'comes': 294,
         'incredibly': 125,
         'well': 2516,
         'innovative': 95,
         'familyfriendly': 4,
         'everyonefriendly': 1,
         'actually': 572,
         'nice': 1572,
         'brainmassage': 1,
         'balance': 275,
         'luck': 1386,
         'skill': 193,
         'mostly': 256,
         'playe

# 3. Visualize Features

# 4. Models

In [75]:
features = ['c_len','caplet_count','punc_count','rating','sent_pol','sent_subj']

In [76]:
%%time
y = review['rating']
X = review[features].drop('rating',axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

CPU times: user 3.93 ms, sys: 940 µs, total: 4.87 ms
Wall time: 4.13 ms


In [77]:
X_train.head()

Unnamed: 0,c_len,caplet_count,punc_count,sent_pol,sent_subj
569611,34,5.882353,8.823529,1.0,0.3
486024,33,3.030303,18.181818,0.5,0.4
37120,104,4.807692,4.807692,0.283333,0.638889
401751,133,0.75188,1.503759,0.1375,0.4875
520649,202,3.465347,2.970297,0.04375,0.25


### _Cluster Model_

_Tries to find clusters in the data but doesnt predict anything_ (Not currently relevant)

In [78]:
pca = PCA(n_components=2)
X_train_pca = pca.fit(X_train)
X_train_pca = pca.transform(X_train)
X_train_pca_df = pd.DataFrame(X_train_pca)

In [79]:
%%time
c_range = range(2,13)
predict = []
for i in c_range:
    # Declare and fit the model.
    sc = SpectralClustering(n_clusters=i)
    predict.append(sc.fit_predict(X_train_pca_df))

    #Predicted clusters.
    print('{} completed'.format(i))

2 completed
3 completed
4 completed
5 completed
6 completed
7 completed
8 completed
9 completed
10 completed
CPU times: user 6min 33s, sys: 3.61 s, total: 6min 37s
Wall time: 3min 33s
