### Reference
- https://towardsdatascience.com/building-a-collaborative-filtering-recommender-system-with-clickstream-data-dffc86c8c65
- https://pypi.org/project/python-amazon-simple-product-api/
- https://github.com/benfred/implicit
- https://medium.com/@patelneha1495/recommendation-system-in-python-using-als-algorithm-and-apache-spark-27aca08eaab3
- https://towardsdatascience.com/prototyping-a-recommender-system-step-by-step-part-2-alternating-least-square-als-matrix-4a76c58714a1

## Data format
- Format is one-review-per-line in json. See examples below for further help reading the data.

    - reviewerID - ID of the reviewer, e.g. A2SUAM1J3GNN3B
    - asin - ID of the product, e.g. 0000013714
    - reviewerName - name of the reviewer
    - vote - helpful votes of the review
    - style - a disctionary of the product metadata, e.g., "Format" is "Hardcover"
    - reviewText - text of the review
    - overall - rating of the product
    - summary - summary of the review
    - unixReviewTime - time of the review (unix time)
    - reviewTime - time of the review (raw)
    - image - images that users post after they have received the produc

In [1]:
#### To measure all running time
# https://github.com/cpcloud/ipython-autotime

%load_ext autotime

In [2]:
import gc

collected = gc.collect()
print ("Garbage collector: collected %d objects." % collected)

Garbage collector: collected 60 objects.
time: 14.6 ms


In [3]:
import pandas as pd
import scipy.sparse as sparse
import numpy as np
import random
import implicit
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics

time: 517 ms


In [29]:
import os
import time
import tqdm
import codecs

# spark imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import UserDefinedFunction, explode, desc
from pyspark.sql.types import StringType, ArrayType
from pyspark.ml.evaluation import RegressionEvaluator

# data science imports
import math

# visualization imports
import seaborn as sns
import matplotlib.pyplot as plt

import json

%matplotlib inline

time: 184 ms


In [5]:
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

time: 484 µs


In [None]:
number_cores = 8
memory_gb = 32

spark = SparkSession \
    .builder \
    .appName("amazon recommendation") \
    .config("spark.driver.memory", '{}g'.format(memory_gb)) \
    .config("spark.master", 'local[{}]'.format(number_cores)) \
    .getOrCreate()

# get spark context
sc = spark.sparkContext

- Download dataset from: http://deepyeti.ucsd.edu/jianmo/amazon/categoryFiles/Clothing_Shoes_and_Jewelry.json.gz

In [None]:
DATA_PATH = '../../Data_fulldata/Review/ClothingShoesAndJewelry/'
REVIEW_DATA = 'Clothing_Shoes_and_Jewelry.json.gz'

In [None]:
!ls ../../Data_fulldata/Review/ClothingShoesAndJewelry/

1. Please unzip Clothing_Shoes_and_Jewelry.json.gz to Clothing_Shoes_and_Jewelry.json
2. Load Clothing_Shoes_and_Jewelry.json (14.1 GB (14,144,939,923 bytes))

In [None]:
ratings = spark.read.load(DATA_PATH+REVIEW_DATA, format='json', header=True, inferSchema=True)

In [None]:
ratings.show(3)

In [None]:
type(ratings)

In [None]:
# print("Shape of Data", (ratings.count(), len(ratings.columns)))

#### Extract ['asin', 'overall', 'reviewerID'] from dataset

In [None]:
ratings.columns

In [None]:
product_ratings = ratings.drop(
 'image',
 'reviewText',
 'reviewTime',
 'reviewerName',
 'style',
 'summary',
 'unixReviewTime',
 'verified',
 'vote')

In [None]:
product_ratings.show()

In [None]:
type(product_ratings)

#### Convert pyspark.sql.dataframe.DataFrame to Pandas dataframe

In [None]:
# rating_df = product_ratings.toPandas()

- make csv file

In [None]:
product_ratings.write.csv("./data/asin_overall_reviewerID.csv")

In [None]:
!ls -al ./data/

#### Load dataset 

In [6]:
rating_df = pd.read_csv('./data/asin_overall_reviewerID.csv/part-00000-6ef94642-3c25-4f7d-ade9-981f91953b81-c000.csv',
                        names=['asin', 'overall', 'reviewerID'])

time: 18.8 s


In [7]:
rating_df.head(n=10)

Unnamed: 0,asin,overall,reviewerID
0,871167042,5.0,A2IC3NZN488KWK
1,871167042,4.0,A3OT9BYASFGU2X
2,871167042,5.0,A28GK1G2KDXHRP
3,871167042,5.0,A3NFXFEKW8OK0E
4,871167042,5.0,A3I6G5TKBVJEK9
5,871167042,5.0,A1A7Y1M8AJWNZ8
6,871167042,5.0,A30FG02C424EJ5
7,871167042,5.0,ADQQYU1UCDEWB
8,871167042,5.0,A39YL2NXZORK56
9,871167042,5.0,A2PRY50ZESF1MH


time: 8.34 ms


In [8]:
rating_df['overall'].value_counts()

5.0    19525093
4.0     5707951
3.0     2982765
1.0     2271737
2.0     1804553
Name: overall, dtype: int64

time: 241 ms


In [9]:
# rating_df.groupby(['reviewerID', 'asin']).sum()

time: 352 µs


- Drop duplicated records.
- Group overall together with reviwerID and asin.

In [10]:
rating_df = rating_df.drop_duplicates()
grouped_df = rating_df.groupby(['reviewerID', 'asin']).sum().reset_index()
grouped_df.head(10)

Unnamed: 0,reviewerID,asin,overall
0,A0000040I1OM9N4SGBD8,B00NX2IHS4,2.0
1,A0000040I1OM9N4SGBD8,B01136O82A,5.0
2,A0000040I1OM9N4SGBD8,B0183QBP4M,5.0
3,A0000378ZNUHTQUDNNHR,B017M5JE16,5.0
4,A0000448ZD4QU0AQCOH8,B00A21CKO6,5.0
5,A00008882A0PUVHCTDUP,B00AMIQ64E,5.0
6,A0000932YCOC06EWVVQY,B00LGY1904,5.0
7,A0000966VPR3PHG0J8GV,B00H974IAA,3.0
8,A0001080NID4WWYB32VT,B00AFD6N3K,5.0
9,A0001170GCHUTHLVFXBQ,B006OR711Y,4.0


time: 1min 17s


In [11]:
grouped_df.dtypes

reviewerID     object
asin           object
overall       float64
dtype: object

time: 2.03 ms


In [12]:
grouped_df['reviewerID_encode'] = grouped_df['reviewerID'].astype("category")
grouped_df['asin_encode'] = grouped_df['asin'].astype("category")
grouped_df['reviewerID_encode'] = grouped_df['reviewerID_encode'].cat.codes
grouped_df['asin_encode'] = grouped_df['asin_encode'].cat.codes
grouped_df = grouped_df[['reviewerID','reviewerID_encode', 'asin', 'asin_encode', 'overall']]

sparse_content_person = sparse.csr_matrix(
    (grouped_df['overall'].astype(float), 
    (grouped_df['asin_encode'], grouped_df['reviewerID_encode']))
)

sparse_person_content = sparse.csr_matrix(
    (grouped_df['overall'].astype(float), 
    (grouped_df['reviewerID_encode'], grouped_df['asin_encode']))
)

model = implicit.als.AlternatingLeastSquares(
    factors=20, 
    regularization=0.1, 
    iterations=50, 
    use_gpu=True)

alpha = 15
data = (sparse_content_person * alpha).astype('double')

model.fit(data)

GPU training requires factor size to be a multiple of 32. Increasing factors from 20 to 32.


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


time: 1min 19s


In [13]:
grouped_df

Unnamed: 0,reviewerID,reviewerID_encode,asin,asin_encode,overall
0,A0000040I1OM9N4SGBD8,0,B00NX2IHS4,1564263,2.0
1,A0000040I1OM9N4SGBD8,0,B01136O82A,2020823,5.0
2,A0000040I1OM9N4SGBD8,0,B0183QBP4M,2279962,5.0
3,A0000378ZNUHTQUDNNHR,1,B017M5JE16,2255212,5.0
4,A0000448ZD4QU0AQCOH8,2,B00A21CKO6,674036,5.0
...,...,...,...,...,...
31663531,AZZZZJYGA32,12483675,B00FDK84X2,1032039,5.0
31663532,AZZZZS162JNL0,12483676,B001HTQA0W,120539,5.0
31663533,AZZZZS162JNL0,12483676,B00IVVJ1AA,1254215,5.0
31663534,AZZZZS162JNL0,12483676,B01636KA86,2187337,5.0


time: 8.87 ms


- Labling encoding asin

### Recommend ASIN(Products) based on product

In [14]:
asin='B00NX2IHS4'

asin_encode = grouped_df.loc[grouped_df['asin'] == asin].iloc[0].asin_encode
print("Covnert asin: %s to encoded asin: %d" %(asin, asin_encode))

Covnert asin: B00NX2IHS4 to encoded asin: 1564263
time: 3.04 s


In [15]:
n_similar = 20

person_vecs = model.user_factors
content_vecs = model.item_factors

content_norms = np.sqrt((content_vecs * content_vecs).sum(axis=1))

scores = content_vecs.dot(content_vecs[asin_encode]) / content_norms
top_idx = np.argpartition(scores, -n_similar)[-n_similar:]
similar = sorted(zip(top_idx, scores[top_idx] / content_norms[asin_encode]), key=lambda x: -x[1])

for content in similar:
    idx, score = content
    print("Encoded ASIN: %d" %(idx), 
          "| Simility Score: %.5f" %(round(score, 5)), 
          "| https://www.amazon.com/dp/"+grouped_df.asin.loc[grouped_df.asin_encode == idx].iloc[0])
#     print("\n")

Encoded ASIN: 1564263 | Simility Score: 1.00000 | https://www.amazon.com/dp/B00NX2IHS4
Encoded ASIN: 1531965 | Simility Score: 0.98331 | https://www.amazon.com/dp/B00NF12DU6
Encoded ASIN: 924900 | Simility Score: 0.98327 | https://www.amazon.com/dp/B00DQI1SK4
Encoded ASIN: 1832164 | Simility Score: 0.95290 | https://www.amazon.com/dp/B00UXZMQX6
Encoded ASIN: 2531622 | Simility Score: 0.95123 | https://www.amazon.com/dp/B01DYN67DI
Encoded ASIN: 1070131 | Simility Score: 0.95093 | https://www.amazon.com/dp/B00G0IVJAG
Encoded ASIN: 2074993 | Simility Score: 0.94650 | https://www.amazon.com/dp/B013DP3H0S
Encoded ASIN: 2192881 | Simility Score: 0.94549 | https://www.amazon.com/dp/B016886BI2
Encoded ASIN: 991875 | Simility Score: 0.94481 | https://www.amazon.com/dp/B00EOSGPNK
Encoded ASIN: 1635252 | Simility Score: 0.94459 | https://www.amazon.com/dp/B00PFZ05XW
Encoded ASIN: 2563515 | Simility Score: 0.94192 | https://www.amazon.com/dp/B01EOLPJXS
Encoded ASIN: 1321296 | Simility Score: 0.941

In [16]:
# grouped_df.loc[grouped_df['person_id'] == 50].sort_values(by=['eventStrength'], ascending=False)[['title', 'person_id', 'eventStrength']].head(10)

time: 201 µs


In [46]:
grouped_df.asin.loc[grouped_df.asin_encode == 1564263].iloc[0]

'B00NX2IHS4'

time: 22.7 ms


In [None]:
n_similar = 20
output_filename='product_based_recommend.tsv'

person_vecs = model.user_factors
content_vecs = model.item_factors

asin_encode_list = grouped_df['asin_encode'].tolist()

with tqdm.tqdm(total=len(asin_encode_list)) as progress:
    with codecs.open(output_filename, "w", "utf8") as o:
        for asin_encode in asin_encode_list:
        #     print(asin_encode)
            content_norms = np.sqrt((content_vecs * content_vecs).sum(axis=1))

            scores = content_vecs.dot(content_vecs[asin_encode]) / content_norms
            top_idx = np.argpartition(scores, -n_similar)[-n_similar:]
            similar = sorted(zip(top_idx, scores[top_idx] / content_norms[asin_encode]), key=lambda x: -x[1])

            input_asin =""
            for content in similar:
                idx, score = content
                asin = grouped_df.asin.loc[grouped_df.asin_encode == idx].iloc[0]
                
                if round(score, 5)==1.00000:
#                     print(round(score))
                    input_asin = grouped_df.asin.loc[grouped_df.asin_encode == idx].iloc[0]
                o.write("%s\t%s\t%.5f\t%s\n" % (input_asin, asin, round(score, 5), "https://www.amazon.com/dp/"+asin))
#                 print(input_asin)

#                 print("Encoded ASIN: %d" %(idx), 
#                       "| Simility Score: %.5f" %(round(score, 5)), 
#                       "| https://www.amazon.com/dp/"+grouped_df.asin.loc[grouped_df.asin_encode == idx].iloc[0])
#             print("\n")
            progress.update(1)

  0%|          | 3674/31663536 [28:07<4977:30:49,  1.77it/s]

### Recommend ASIN(Products) to Persons
- The following function will return the top 10 recommendations chosen based on the person / content vectors for contents never interacted with for any given person.

In [17]:
def recommend(person_id, sparse_person_content, person_vecs, content_vecs, num_contents=10):
    # Get the interactions scores from the sparse person content matrix
    person_interactions = sparse_person_content[asin_encode,:].toarray()
    # Add 1 to everything, so that articles with no interaction yet become equal to 1
    person_interactions = person_interactions.reshape(-1) + 1
    # Make articles already interacted zero
    person_interactions[person_interactions > 1] = 0
    # Get dot product of person vector and all content vectors
    rec_vector = person_vecs[asin_encode,:].dot(content_vecs.T).toarray()
    
    # Scale this recommendation vector between 0 and 1
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0]
    # Content already interacted have their recommendation multiplied by zero
    recommend_vector = person_interactions * rec_vector_scaled
    # Sort the indices of the content into order of best recommendations
    content_idx = np.argsort(recommend_vector)[::-1][:num_contents]
    
    # Start empty list to store titles and scores
    asin_list = []
    scores = []

    for idx in content_idx:
        # Append titles and scores to the list
        asin_list.append("https://www.amazon.com/dp/"+grouped_df.asin.loc[grouped_df.asin_encode == idx].iloc[0])
        scores.append(recommend_vector[idx])

    recommendations = pd.DataFrame({'ASIN': asin_list, 'SCORE': scores})

    return recommendations
    


time: 1.43 ms


In [18]:
# Create recommendations for person
reviewerID="A0000040I1OM9N4SGBD8"
reviewerID_encode = grouped_df.loc[grouped_df['reviewerID'] == reviewerID].iloc[0].asin_encode
print("Covnert reviewerID: %s to encoded reviewerID: %d" %(reviewerID, reviewerID_encode))

Covnert reviewerID: A0000040I1OM9N4SGBD8 to encoded reviewerID: 1564263
time: 2.43 s


In [19]:
# Get the trained person and content vectors. We convert them to csr matrices
person_vecs = sparse.csr_matrix(model.user_factors)
content_vecs = sparse.csr_matrix(model.item_factors)

person_id = reviewerID_encode

recommendations = recommend(person_id, sparse_person_content, person_vecs, content_vecs)

print("\n** Recommended list for reviewer:", reviewerID)
print()
print(recommendations)


** Recommended list for reviewer: A0000040I1OM9N4SGBD8

                                   ASIN     SCORE
0  https://www.amazon.com/dp/B004QL5K22  1.000000
1  https://www.amazon.com/dp/B00JP6DIW2  0.911267
2  https://www.amazon.com/dp/B000E48DCO  0.911195
3  https://www.amazon.com/dp/B00152XP5E  0.887067
4  https://www.amazon.com/dp/B000MQYJ3Q  0.883714
5  https://www.amazon.com/dp/B005FI28S4  0.870804
6  https://www.amazon.com/dp/B005934WR0  0.867594
7  https://www.amazon.com/dp/B000J4B3TE  0.830848
8  https://www.amazon.com/dp/B0002USCE4  0.815416
9  https://www.amazon.com/dp/B0002USBB8  0.815330
time: 14.6 s


#### Here we have top recommendations for reviewerID="A0000040I1OM9N4SGBD8". 


In [20]:
grouped_df.loc[grouped_df['reviewerID'] == 'A0000040I1OM9N4SGBD8'].sort_values(by=['overall'], ascending=False)[['asin', 'reviewerID', 'overall']]

Unnamed: 0,asin,reviewerID,overall
1,B01136O82A,A0000040I1OM9N4SGBD8,5.0
2,B0183QBP4M,A0000040I1OM9N4SGBD8,5.0
0,B00NX2IHS4,A0000040I1OM9N4SGBD8,2.0


time: 2.54 s


## Evaluation the Recommender System 
- https://nbviewer.jupyter.org/github/jmsteinw/Notebooks/blob/master/RecEngine_NB.ipynb

In [None]:
import random

def make_train(ratings, pct_test = 0.2):
    test_set = ratings.copy() # Make a copy of the original set to be the test set. 
    test_set[test_set != 0] = 1 # Store the test set as a binary preference matrix
    
    training_set = ratings.copy() # Make a copy of the original data we can alter as our training set. 
    
    nonzero_inds = training_set.nonzero() # Find the indices in the ratings data where an interaction exists
    nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1])) # Zip these pairs together of item,user index into list

    
    random.seed(0) # Set the random seed to zero for reproducibility
    
    num_samples = int(np.ceil(pct_test*len(nonzero_pairs))) # Round the number of samples needed to the nearest integer
    samples = random.sample(nonzero_pairs, num_samples) # Sample a random number of item-user pairs without replacement

    content_inds = [index[0] for index in samples] # Get the item row indices

    person_inds = [index[1] for index in samples] # Get the user column indices

    
    training_set[content_inds, person_inds] = 0 # Assign all of the randomly chosen user-item pairs to zero
    training_set.eliminate_zeros() # Get rid of zeros in sparse array storage after update to save space
    
    return training_set, test_set, list(set(person_inds))

In [None]:
content_train, content_test, content_persons_altered = make_train(sparse_content_person, pct_test = 0.2)

In [None]:
def auc_score(predictions, test):
    fpr, tpr, thresholds = metrics.roc_curve(test, predictions)
    return metrics.auc(fpr, tpr)

In [None]:
def calc_mean_auc(training_set, altered_persons, predictions, test_set):
    store_auc = [] # An empty list to store the AUC for each user that had an item removed from the training set
    popularity_auc = [] # To store popular AUC scores
    pop_contents = np.array(test_set.sum(axis = 1)).reshape(-1) # Get sum of item iteractions to find most popular
    content_vecs = predictions[1]
    for person in altered_persons: # Iterate through each user that had an item altered
        training_column = training_set[:,person].toarray().reshape(-1) # Get the training set column
        zero_inds = np.where(training_column == 0) # Find where the interaction had not yet occurred
        
        # Get the predicted values based on our user/item vectors
        person_vec = predictions[0][person,:]
        pred = person_vec.dot(content_vecs).toarray()[0,zero_inds].reshape(-1)
        
        # Get only the items that were originally zero
        # Select all ratings from the MF prediction for this user that originally had no iteraction
        actual = test_set[:,person].toarray()[zero_inds,0].reshape(-1)
        
        # Select the binarized yes/no interaction pairs from the original full data
        # that align with the same pairs in training 
        pop = pop_contents[zero_inds] # Get the item popularity for our chosen items
        
        store_auc.append(auc_score(pred, actual)) # Calculate AUC for the given user and store
        
        popularity_auc.append(auc_score(pop, actual)) # Calculate AUC using most popular and score
    # End users iteration
    
    return float('%.3f'%np.mean(store_auc)), float('%.3f'%np.mean(popularity_auc))

In [None]:
calc_mean_auc(content_train, content_persons_altered,
              [person_vecs, content_vecs.T], content_test)