https://stackoverflow.com/questions/34033785/normalize-values-in-dataframe

## Libraries

In [1]:
import pandas as pd
import re
import numpy as np
import spacy
import textblob
from gensim.models import word2vec
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels import robust
from string import punctuation
import gensim
from gensim.models import word2vec
from string import punctuation as punct

import warnings
warnings.filterwarnings('ignore')

from sklearn.cluster import SpectralClustering
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import v_measure_score

# Neural Network
from sklearn.neural_network import MLPClassifier

# for mac only: frog,blow,funk,glass,tink,submarine,purr,sosumi
def beep(audio): 
    os.system('afplay /System/Library/Sounds/' + audio +'.aiff')

# 1. Import and Add Basic Features

In [2]:
review = pd.read_csv('../../data/boardgame/boardgame-comments-english.csv').sample(frac=.001)

In [3]:
import random

In [4]:
review.columns = 'reviewer_id', 'game_id', 'rating', 'comment'
unique_id = review.reviewer_id.unique()
sample_id = random.sample(unique_id.tolist(),200)
reviewsample = review[review['reviewer_id'].isin(unique_id)]

In [5]:
comments = review.comment

__Round Ratings__

In [6]:
# RATINGS ADJUSTMENT: ceiling >= .5 [or] floor < .5
review['rating'] = review.rating.apply(round)

__Show ReviewerID and GameID Counts__

# __2. Ratings Distribution__

_A. Bayesian Average_

B. Min/Max Transform (Sklearn Preprocessing Doesn't Apply)

_C. Normalize by User Ratings (Not Working Yet)_

https://stackoverflow.com/questions/34033785/normalize-values-in-dataframe

_D. Log Probability Distribution [NOT CORRECT]_

# 3. Natural Language Processing Features:

In [7]:
# Functions for finding percentage frequency (capital letters/punctuation)
def per_check(string_value, total):
    percentage = len(string_value)
    if percentage != 0:
        percentage = float(total / percentage) * 100
    else:
        percentage = 0
    return percentage

def punc_count(string_value):
    count = 0
    for c in string_value:
        if c in punctuation:
            count+= 1
    return per_check(string_value, count)

def caplet_count(string_value):
    count = 0
    for c in string_value:
        if c.isupper():
            count+= 1
    return per_check(string_value, count)      

review['c_len'] = review.comment.apply(len)
review['punc_count'] = review.comment.apply(punc_count)
review['caplet_count'] = review.comment.apply(caplet_count)

## _Spacy_

In [8]:
nlp = spacy.load('en')

In [9]:
%%time
docs = []
tokens = []
lemma = []
pos = []
deps = []
ents = []
sentences = []

def insert_null(l):
    return [(w if w else '0') for w in l]

pipeline = nlp.pipe(review['comment'].astype('unicode').values,
                    batch_size = 10, 
                    n_threads=4)

CPU times: user 660 µs, sys: 96 µs, total: 756 µs
Wall time: 1.28 ms


In [10]:
# Create doc
review['doc'] = [doc if doc.is_parsed else None for doc in pipeline]
beep('ping')

NameError: name 'os' is not defined

In [None]:
# apply features
review['w_len'] = review.doc.apply(len)
review['tokens'] = review.doc.apply(lambda doc: insert_null([tok.text for tok in doc]))
review['lemma'] = review.doc.apply(lambda doc: insert_null([tok.lemma_ for tok in doc]))
review['pos'] = review.doc.apply(lambda doc: insert_null([tok.pos_ for tok in doc]))
review['deps'] = review.doc.apply(lambda doc: insert_null([tok.dep_ for tok in doc]))
review['ents'] = review.doc.apply(lambda doc: insert_null([tok.ent_type_ for tok in doc]))
beep('ping')

## _TextBlob_

In [None]:
%%time
blobs = review.comment.apply(lambda val: textblob.TextBlob(val))

In [None]:
review['sent_pol'] = blobs.apply(lambda val: val.sentiment[0])
review['sent_subj'] = blobs.apply(lambda val: val.sentiment[1])

_Pol = Sentiment Polarity (positive or negative word choice)_ <br>
_Subj = Sentiment Subjectivity (objective or subjective word choice)_

In [None]:
# Additional Textblob Features (Not included because of Spacy)
review['wc'] = blobs.apply(lambda val: len(val.words))
review['sc'] = blobs.apply(lambda val: len(val.sentences))
review['tokens'] = blobs.apply(lambda val: [w.lower() for w in val.words])
review['pos'] = blobs.apply(lambda val: [v[1] for v in val.tags])

# 3. Visuals

# 4. Models

In [None]:
features = ['c_len','caplet_count','punc_count','rating','sent_pol','sent_subj']

In [None]:
%%time
y = review['rating']
X = review[features].drop('rating',axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
X_train.head()

### _Cluster Model_

_Tries to find clusters in the data but doesnt predict anything_ (Not currently relevant)

In [None]:
pca = PCA(n_components=2)
X_train_pca = pca.fit(X_train)
X_train_pca = pca.transform(X_train)
X_train_pca_df = pd.DataFrame(X_train_pca)

In [None]:
%%time
c_range = range(2,13)
predict = []
for i in c_range:
    # Declare and fit the model.
    sc = SpectralClustering(n_clusters=i)
    predict.append(sc.fit_predict(X_train_pca_df))

    #Predicted clusters.
    print('{} completed'.format(i))

## _V-Measurement Test_

In [None]:
plt.scatter(x=[c_range],y=[v_measure_score(y_train,v) for v in predict])
plt.title('Accuracy of Cluster Predictions')
plt.xlabel('Cluster Count')
plt.show()

# for i,v in enumerate(predict):
#     print(clusters[i])
#     print(v_measure_score(y_train,predict[i]))

## _Likelihood Function_

$$ p(Y\mid\theta) = \prod_i^n {p({y_i}\mid\theta)} $$

## _Maximum Likelihood Function_

https://wikimedia.org/api/rest_v1/media/math/render/svg/9dc95691ee450e85995f5e3263600cb904323ee8

$$ \frac{1}{n}\sum_i^n \ln{p({y_i}\mid\theta)} $$

### _Logistic Regression_

_Operates on probabilities_

# 5. Word Embedding - Word2Vec

In [16]:
import os

In [17]:
%%time
review = pd.read_csv('../../data/boardgame/boardgame-comments-english.csv').sample(frac=.001,random_state=42)
review.columns = 'reviewer_id', 'game_id', 'rating', 'comment'

# RATINGS ADJUSTMENT: ceiling >= .5 [or] floor < .5
review['rating'] = review.rating.map(round)

print('Total Comments: {}'.format(review.comment.count()))

Total Comments: 842
CPU times: user 2.73 s, sys: 562 ms, total: 3.29 s
Wall time: 4.95 s


In [18]:
%%time
tokenize = lambda val: [b.lower() for b in textblob.TextBlob(val).words]
sentences_blob = review.comment.map(tokenize)
review['token'] = sentences_blob
beep('ping')

CPU times: user 627 ms, sys: 19.5 ms, total: 647 ms
Wall time: 1.77 s


In [19]:
%%time
load_model = False

if load_model:
    # # load model
    word_vec = word2vec.Word2Vec.load('full_word2vec_blob.bin')
    vec_size = word_vec.layer1_size
else: 
    vec_size = 50
    word_vec = word2vec.Word2Vec(
        sentences_blob,
        workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
        min_count=5,  # Minimum word count threshold.
        window=6,      # Number of words around target word to consider.
        sg=0,          # Use CBOW because our corpus is small.
        sample=1e-3 ,  # Penalize frequent words.
        size=vec_size,      # Word vector length.
        hs=1           # Use hierarchical softmax.
    )
    
    # save model
    word_vec.save('full_word2vec_blob.bin')

# List of words in model.
vocab = word_vec.wv.vocab.keys()
beep('ping')

CPU times: user 464 ms, sys: 19.7 ms, total: 483 ms
Wall time: 1.35 s


In [20]:
%%time
vec_new = np.array([.5 for i in range(0,vec_size)])
review['vectors'] = review.token.apply(lambda val: [word_vec[w] if w in vocab else vec_new for w in val])
beep('ping')

CPU times: user 209 ms, sys: 5.93 ms, total: 215 ms
Wall time: 1.33 s


### TBD

In [None]:
w1,w2,w3 = 'easy','player','good'
print(word_vec.most_similar(positive=[w1, w2], negative=[w3], topn=1))

w1 = 'easy'
print(word_vec.wv.most_similar(positive=w1,topn=3))

w1 = 'hard'
print(word_vec.wv.most_similar(positive=w1,topn=3))

__Cosine Similarity Function__

$$ cos(\theta) = \frac{A \bullet B} {\Vert A \Vert \Vert B \Vert} =  \frac{\sum_{i=1}^n A_i B_i}{\sqrt{ \sum_{i=1}^n A^2} \sqrt{ \sum_{i=1}^n B^2}} $$

__Version A. Raw Code__

In [None]:
euclidean_norm = lambda m: np.sqrt(np.array([a*a for a in m]).sum())
def similarity_vec(a,b):
    return (np.dot(a,b))/(euclidean_norm(a)*euclidean_norm(b))

hard_easy = similarity_vec(word_vec['hard'],word_vec['easy'])
hard_cat = similarity_vec(word_vec['hard'],word_vec['cat'])
easy_cat = similarity_vec(word_vec['easy'],word_vec['cat'])
easy_simple = similarity_vec(word_vec['easy'],word_vec['simple'])

print('HARD - EASY: {}'.format(hard_easy))
print('HARD - CAT: {}'.format(hard_cat))
print('EASY - CAT: {}'.format(easy_cat))
print('EASY - SIMPLE: {}'.format(easy_simple))

__Version B. SKLearn__

______

******

# 6. Comprehensive Neural Network

_Slow Method_

In [None]:
%%time
padding, max_words = [0 for i in range(0,vec_size)], 100
review.vectors = list(keras.preprocessing.sequence.pad_sequences(review.vectors, 
                                                     maxlen=max_words, 
                                                     padding='post', 
                                                     dtype = 'float',
                                                     truncating='post', 
                                                     value=padding))
beep('ping')

_Fast Method_

In [None]:
%%time

pad, max_words = [0 for i in range(0,vec_size)], 100
def manual_pad(val):
    empty = max_words-len(val)
    for i in range(0,empty):
        val.append(pad)
    
    return [i for i in val[0:max_words+1]]

review.vectors=review.vectors.map(manual_pad)

_Create Train/Test Data_

In [None]:
%%time
y = review['rating']
X = pd.DataFrame([list(i[0]) for i in review.vectors])

X_train, X_test, y_train, y_test = train_test_split(X, y.astype(int).ravel(), test_size=0.33, random_state=42)

#### _C - Keras Sequential NN_

In [None]:
y_test = y_test.ravel()
y_train = y_train.ravel()

y_train = keras.utils.to_categorical(y_train)
y_test = keras.utils.to_categorical(y_test)

In [None]:
X_train.shape

In [None]:
model = Sequential()
model.add(Embedding(1000, 64, input_length=10))
model.add(Dropout(0.2))
model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(11, activation='softmax'))

In [None]:
batch_size = 5
epochs = 10

In [None]:
model.compile(loss='categorical_crossentropy',
              optimizer=RMSprop(),
              metrics=['accuracy'])

In [None]:
model.summary()

In [None]:
history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(X_test, y_test))

beep('ping')

In [None]:
score = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

In [None]:
# Jon's Code
from sklearn.metrics import confusion_matrix

def plot_confusion(y, y_pred, title):
    # rating levels
    ratings = list(range(1,11))

    # generate confusion matrix
    cm = confusion_matrix(y, y_pred)

    # normalize matrix
    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    # plot matrix
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.magma)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(ratings))
    plt.xticks(tick_marks, ratings, rotation=45)
    plt.yticks(tick_marks, ratings)
    plt.grid(False)
    plt.tight_layout()
    plt.ylabel('Actual rating')
    plt.xlabel('Predicted rating');

In [None]:
y_predict = model.predict(X_test)

In [None]:
from keras.backend import argmax as kargmax
y_predict = np.argmax(model.predict(X_test),axis=1)
final_score = np.argmax(y_test,axis=1)

In [None]:
imblearn

In [None]:
plot_confusion(final_score,y_predict,'Check')

## Word Similarity Visualization

## _Root Mean Squared Error_

$$ RMSE_{y} =  \sqrt{\frac{\sum_{i=1}^n {(\hat{y}_{i}- y_{i})}^2}{N}} $$

In [None]:
RMSE = np.sqrt(np.sum(np.square(np.subtract(y_predict,y_actual)))/len(y_actual))
# print('Root Mean Squared Error: {}'.format(RMSE))

### _Notes_

Mathjacks / Tex