# Download and process ratings data

In [75]:
import pandas as pd
import gzip
import numpy as np
from sklearn import metrics, preprocessing
from sklearn.model_selection import cross_validate

In [76]:
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

ratings = getDF('reviews_Beauty_5.json.gz')

In [80]:
ratings.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A1YJEY40YUW4SE,7806397051,Andrea,"[3, 4]",Very oily and creamy. Not at all what I expect...,1.0,Don't waste your money,1391040000,"01 30, 2014"
1,A60XNB876KYML,7806397051,Jessica H.,"[1, 1]",This palette was a decent price and I was look...,3.0,OK Palette!,1397779200,"04 18, 2014"
2,A3G6XNM240RMWA,7806397051,Karen,"[0, 1]",The texture of this concealer pallet is fantas...,4.0,great quality,1378425600,"09 6, 2013"
3,A1PQFP6SAJ6D80,7806397051,Norah,"[2, 2]",I really can't tell what exactly this thing is...,2.0,Do not work on my face,1386460800,"12 8, 2013"
4,A38FVHZTNQ271F,7806397051,Nova Amor,"[0, 0]","It was a little smaller than I expected, but t...",3.0,It's okay.,1382140800,"10 19, 2013"


In [81]:
ratings.rename(columns={'reviewerID': 'user_id', 
                        'asin': 'item_id', 
                        'reviewerName': 'user_name', 
                        'reviewText': 'review_text',
                        'summary': 'review_summary',
                        'overall': 'score'},
               inplace=True)

In [82]:
ratings.user_id = ratings.user_id.astype('category').cat.codes.values
ratings.item_id = ratings.item_id.astype('category').cat.codes.values
# Add IDs for embeddings.
ratings['user_emb_id'] = ratings['user_id']
ratings['item_emb_id'] = ratings['item_id']

In [83]:
ratings.head()

Unnamed: 0,user_id,item_id,user_name,helpful,review_text,score,review_summary,unixReviewTime,reviewTime,user_emb_id,item_emb_id
0,5584,0,Andrea,"[3, 4]",Very oily and creamy. Not at all what I expect...,1.0,Don't waste your money,1391040000,"01 30, 2014",5584,0
1,17504,0,Jessica H.,"[1, 1]",This palette was a decent price and I was look...,3.0,OK Palette!,1397779200,"04 18, 2014",17504,0
2,14499,0,Karen,"[0, 1]",The texture of this concealer pallet is fantas...,4.0,great quality,1378425600,"09 6, 2013",14499,0
3,4157,0,Norah,"[2, 2]",I really can't tell what exactly this thing is...,2.0,Do not work on my face,1386460800,"12 8, 2013",4157,0
4,13219,0,Nova Amor,"[0, 0]","It was a little smaller than I expected, but t...",3.0,It's okay.,1382140800,"10 19, 2013",13219,0


# Text embeddings and classifier

In [84]:
!pip install spacy



In [85]:
# python -m spacy download en

In [96]:
import spacy
nlp_en = spacy.load('en', vectors='en_glove_cc_300_1m')
example_reviews = ['this creme is amazing', 'creme amazing', 'this shoe does not fit; it hurts', 'shoe hurts']
example_embs = np.vstack([nlp_en(e).vector for e in example_reviews])
sklearn.metrics.pairwise.cosine_similarity(example_embs)

array([[1.0000001 , 0.6532452 , 0.5040008 , 0.3325375 ],
       [0.6532452 , 1.        , 0.33373064, 0.58611566],
       [0.5040008 , 0.33373064, 1.        , 0.5160885 ],
       [0.3325375 , 0.58611566, 0.5160885 , 0.99999976]], dtype=float32)

In [87]:
df = ratings[['review_summary']]
summary_embs = np.vstack([nlp_en(e).vector for e in df])
sklearn.metrics.pairwise.cosine_similarity(summary_embs)

array([[1.]], dtype=float32)

In [100]:
summary_embs

array([[-0.13157356, -1.2666644 , -2.8043616 , -2.404192  , -2.4721403 ,
         3.181284  ,  3.8876255 ,  0.47620222, -2.429509  , -1.5870346 ,
         1.0722339 , -1.2931393 , -1.7688907 , -5.966107  , -3.45097   ,
         1.6881995 ,  1.0376529 ,  3.0942085 , -0.16541037,  1.3513762 ,
        -3.4355297 , -3.4842682 ,  0.92572904,  3.117887  , -1.777405  ,
         2.264823  ,  0.6075816 , -0.3395865 ,  3.063789  ,  9.572119  ,
        -1.1162163 ,  0.71857727,  0.5801405 , -1.5756882 ,  0.91073084,
         1.6807712 , -0.6482867 ,  1.0755434 ,  1.0377249 , -2.5246587 ,
        -0.42541802,  2.802749  ,  1.4728252 , -0.1633364 , -0.40625015,
        -0.23787445,  0.36663842, -1.4493179 , -3.9459581 , -0.31721222,
        -5.5970774 ,  1.0920272 , -0.26114047, -3.103499  , -0.1538702 ,
        -2.5996077 ,  3.6767077 ,  2.1184425 , -2.9120955 , -0.83487684,
         0.20317447, -1.0136617 ,  0.04501563,  2.2129858 , -0.3876744 ,
        -0.9729902 ,  2.4910083 , -0.39668816, -0.9

In [101]:
#Classification

In [105]:
from sklearn.model_selection import train_test_split
x = ratings[['score']], 
y= summary_embs
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=.25, random_state= 50)

In [107]:
from sklearn.neighbors import KNeighborsClassifier
import sys

In [115]:
df = KNeighborsClassifier(n_neighbors = )
cv_results = cross_validate(y, X_train, y_train, cv=10, scoring='roc_auc',return_train_score=False)

SyntaxError: invalid syntax (<ipython-input-115-f28a49458714>, line 1)

# Recommender system

In [122]:
import keras as ks
from tensorrec import TensorRec
from tensorrec.representation_graphs import AbstractKerasRepresentationGraph

In [123]:
class DeepRepresentationGraph(AbstractKerasRepresentationGraph):
    def create_layers(self, n_features, n_components):
        return [
            ks.layers.Dense(n_components * 16, activation='relu'),
            ks.layers.Dense(n_components * 8, activation='relu'),
            ks.layers.Dense(n_components * 2, activation='relu'),
            ks.layers.Dense(n_components, activation='tanh'),
        ]

In [124]:
# My new recommender system with two deep nets
model = TensorRec(item_repr_graph=DeepRepresentationGraph(),
                  user_repr_graph=DeepRepresentationGraph())