In [None]:
# Run findspark before any pyspark import so the local Spark installation
# can be located by the interpreter.
import findspark
findspark.init()

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Reuse the active Spark session if one exists, otherwise start a new one
# registered under this notebook's application name.
session_builder = SparkSession.builder.appName('Recommendation_Beauty')
spark = session_builder.getOrCreate()

In [None]:
# Load the review records from the JSON file (path is relative to the
# notebook's working directory).
data = spark.read.json('data/Beauty_5.json')

In [None]:
# Preview the first five reviews; long cell values are truncated for display.
data.show(5, truncate=True)

+----------+-------+-------+--------------------+-----------+--------------+------------+--------------------+--------------+
|      asin|helpful|overall|          reviewText| reviewTime|    reviewerID|reviewerName|             summary|unixReviewTime|
+----------+-------+-------+--------------------+-----------+--------------+------------+--------------------+--------------+
|7806397051| [3, 4]|    1.0|Very oily and cre...|01 30, 2014|A1YJEY40YUW4SE|      Andrea|Don't waste your ...|    1391040000|
|7806397051| [1, 1]|    3.0|This palette was ...|04 18, 2014| A60XNB876KYML|  Jessica H.|         OK Palette!|    1397779200|
|7806397051| [0, 1]|    4.0|The texture of th...| 09 6, 2013|A3G6XNM240RMWA|       Karen|       great quality|    1378425600|
|7806397051| [2, 2]|    2.0|I really can't te...| 12 8, 2013|A1PQFP6SAJ6D80|       Norah|Do not work on my...|    1386460800|
|7806397051| [0, 0]|    3.0|It was a little s...|10 19, 2013|A38FVHZTNQ271F|   Nova Amor|          It's okay.|    1382

In [None]:
# Keep only what the recommender needs: product id, rating and reviewer id.
data_sub = data.select('asin', 'overall', 'reviewerID')

In [None]:
# Total number of ratings (rows) in the trimmed dataset.
data_sub.count()

198502

In [None]:
from pyspark.sql.functions import col, udf
from pyspark.sql.functions import isnan, when, count, col

In [None]:
# Preview the three retained columns.
data_sub.show(5, truncate = True)

+----------+-------+--------------+
|      asin|overall|    reviewerID|
+----------+-------+--------------+
|7806397051|    1.0|A1YJEY40YUW4SE|
|7806397051|    3.0| A60XNB876KYML|
|7806397051|    4.0|A3G6XNM240RMWA|
|7806397051|    2.0|A1PQFP6SAJ6D80|
|7806397051|    3.0|A38FVHZTNQ271F|
+----------+-------+--------------+
only showing top 5 rows



In [None]:
# Count the null entries in every column and display the totals with one
# row per column.
null_count_exprs = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in data_sub.columns
]
data_sub.select(null_count_exprs).toPandas().T

Unnamed: 0,0
asin,0
overall,0
reviewerID,0


In [None]:
# Distinct users (reviewerID) and distinct products (asin), plus the number
# of observed ratings, which is the numerator of the sparsity calculation.
users = data_sub.select('reviewerID').distinct().count()
products = data_sub.select('asin').distinct().count()
numerator = data_sub.count()

In [None]:
# Show the rating count, the distinct-user count and the distinct-product count.
display(numerator, users, products)

198502

22363

12101

In [None]:
# Number of ratings the user x product matrix would hold if every user had
# rated every product (i.e. no empty cells).
denominator = users * products
denominator

270614663

In [None]:
# Sparsity is the share of the user x product matrix that has no rating:
# one minus the fraction of cells that are actually filled.
density = float(numerator) / denominator
sparsity = 1 - density
print('Sparsity: ', sparsity)

Sparsity:  0.9992664772935825


In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [None]:
# Convert the string ids into numeric indices; the ALS model further down is
# configured with the resulting `asin_idx` / `reviewerID_idx` columns.
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import col

# Product ids: learn the asin -> index mapping, then append `asin_idx`.
indexer = StringIndexer(inputCol='asin', outputCol='asin_idx')
indexer_model = indexer.fit(data_sub)
asin_indexed = indexer_model.transform(data_sub)

# Reviewer ids: same procedure on top of the asin-indexed frame, so the
# final frame carries both index columns.
indexer1 = StringIndexer(inputCol='reviewerID', outputCol='reviewerID_idx')
indexer1_model = indexer1.fit(asin_indexed)
data_indexed = indexer1_model.transform(asin_indexed)

In [None]:
# Preview the ratings together with the two added index columns.
data_indexed.show(5, truncate = True)

+----------+-------+--------------+--------+--------------+
|      asin|overall|    reviewerID|asin_idx|reviewerID_idx|
+----------+-------+--------------+--------+--------------+
|7806397051|    1.0|A1YJEY40YUW4SE|  6959.0|       18008.0|
|7806397051|    3.0| A60XNB876KYML|  6959.0|       10825.0|
|7806397051|    4.0|A3G6XNM240RMWA|  6959.0|        5924.0|
|7806397051|    2.0|A1PQFP6SAJ6D80|  6959.0|       12357.0|
|7806397051|    3.0|A38FVHZTNQ271F|  6959.0|        6087.0|
+----------+-------+--------------+--------+--------------+
only showing top 5 rows



In [None]:
# Confirm that indexing introduced no nulls: count null entries per column
# and display the totals with one row per column.
indexed_null_exprs = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in data_indexed.columns
]
data_indexed.select(indexed_null_exprs).toPandas().T

Unnamed: 0,0
asin,0
overall,0
reviewerID,0
asin_idx,0
reviewerID_idx,0


In [None]:
# Smaller dataset so we will use 0.8/0.2.
# A fixed seed makes the train/test split repeatable, so the RMSE and the
# recommendations computed from it do not change on every re-run.
training, test = data_indexed.randomSplit([0.8, 0.2], seed=42)

In [None]:
# Creating ALS model and fitting data.
#   maxIter / regParam / rank : number of ALS iterations, regularisation
#                               strength and number of latent factors
#   userCol / itemCol         : the numeric index columns built by StringIndexer
#   ratingCol                 : the star rating being predicted
#   coldStartStrategy='drop'  : rows whose prediction cannot be computed
#                               (user or item unseen in training) are dropped
#                               so they do not turn the RMSE into NaN
#   nonnegative=True          : constrain the learned factors to be >= 0
#   seed                      : fixed so the random factor initialisation,
#                               and therefore the fitted model, is repeatable
als = ALS(maxIter = 5,
          regParam=0.09,
          rank=25,
          userCol='reviewerID_idx',
          itemCol = 'asin_idx',
          ratingCol = 'overall',
          coldStartStrategy='drop',
          nonnegative=True,
          seed=42)
model = als.fit(training)

In [None]:
# Score the held-out test set; the RMSE itself is computed from these
# predictions in a later cell.
predictions = model.transform(test)

In [None]:
# Peek at a few predictions next to the actual ratings.
preview_cols = ['asin_idx', 'reviewerID_idx', 'overall', 'prediction']
predictions.select(preview_cols).show(5)

+--------+--------------+-------+----------+
|asin_idx|reviewerID_idx|overall|prediction|
+--------+--------------+-------+----------+
|   148.0|         385.0|    5.0| 5.5424514|
|   148.0|         236.0|    5.0| 4.1931453|
|   148.0|         330.0|    1.0| 4.1391478|
|   148.0|       19062.0|    5.0| 3.4681797|
|   148.0|        2422.0|    5.0| 3.9887984|
+--------+--------------+-------+----------+
only showing top 5 rows



In [None]:
# Root-mean-square error between the actual ratings (`overall`) and the ALS
# predictions on the test split.
evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='overall',
    predictionCol='prediction',
)
rmse = evaluator.evaluate(predictions)
print('Root-mean-square error = ' + str(rmse))

Root-mean-square error = 1.3446319135143865


In [None]:
# On average, the model's predicted ratings are off by roughly 1.3 stars (the RMSE above) from the actual ratings

## Providing Recommendations: for all users

In [None]:
# For every user, get the 20 products with the highest predicted rating
user_recs = model.recommendForAllUsers(20)

In [None]:
# Print the recommendation rows of the first five users, each followed by
# blank lines for readability.
for user in user_recs.head(5):
    print(user)
    print('\n')

Row(reviewerID_idx=1580, recommendations=[Row(asin_idx=6183, rating=8.20853042602539), Row(asin_idx=4274, rating=8.187591552734375), Row(asin_idx=11679, rating=8.116562843322754), Row(asin_idx=6604, rating=8.01150894165039), Row(asin_idx=3527, rating=7.951577186584473), Row(asin_idx=10127, rating=7.948854923248291), Row(asin_idx=8893, rating=7.896306037902832), Row(asin_idx=11655, rating=7.814083576202393), Row(asin_idx=11625, rating=7.74952507019043), Row(asin_idx=9900, rating=7.733482360839844), Row(asin_idx=9940, rating=7.711333274841309), Row(asin_idx=6658, rating=7.684535026550293), Row(asin_idx=5177, rating=7.68311071395874), Row(asin_idx=8232, rating=7.661252021789551), Row(asin_idx=4026, rating=7.659734725952148), Row(asin_idx=8046, rating=7.654044151306152), Row(asin_idx=5519, rating=7.653414249420166), Row(asin_idx=8742, rating=7.584081172943115), Row(asin_idx=11342, rating=7.567073822021484), Row(asin_idx=7885, rating=7.507442474365234)])


Row(reviewerID_idx=4900, recommend

## Converting back to string form

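> Map the numeric `reviewerID_idx` / `asin_idx` values in the recommendations back to the original `reviewerID` and `asin` strings.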



In [None]:
# Flatten the per-user recommendation lists into a long pandas table with one
# row per (user, recommended product).
import pandas as pd
# One row per user; `recommendations` holds that user's list of
# (asin_idx, rating) entries. Note this recomputes the top 10 rather than
# reusing the top-20 `user_recs` from the earlier cell.
recs = model.recommendForAllUsers(10).toPandas()
# 1. apply(pd.Series) spreads each list over one column per list position.
# 2. merge on the index re-attaches `reviewerID_idx` to those columns.
# 3. melt turns the position columns into rows (long format); the position
#    label (`variable`) is not needed afterwards and is dropped.
# 4. dropna removes empty slots -- presumably users that received fewer than
#    10 recommendations; verify if that case can occur.
nrecs = recs.recommendations.apply(pd.Series) \
            .merge(recs, right_index = True, left_index = True) \
            .drop(['recommendations'], axis = 1) \
            .melt(id_vars = ['reviewerID_idx'], value_name = 'recommendation') \
            .drop('variable', axis = 1) \
            .dropna()
nrecs = nrecs.sort_values('reviewerID_idx')
# Split each (asin_idx, rating) entry into two columns and put the user index
# next to them; the resulting column order is [asin_idx, rating, user index],
# which is what the renaming below relies on.
nrecs = pd.concat([nrecs['recommendation'].apply(pd.Series),
                   nrecs['reviewerID_idx']], axis = 1)
nrecs.columns = [
    'ProductID_index',
    'Rating', 
    'UserID_index'
]

In [None]:
# Translate the numeric indices in `nrecs` back to the original string ids and
# collect each reviewer's recommendations into a single list.

# Lookup tables index -> original id, built from the indexed ratings.
md = data_indexed.select(['reviewerID', 'reviewerID_idx',
                          'asin', 'asin_idx'])
md = md.toPandas()
dict1 = dict(zip(md['reviewerID_idx'], md['reviewerID']))
dict2 = dict(zip(md['asin_idx'], md['asin']))

# Attach the original ids to every recommendation row.
nrecs['reviewerID'] = nrecs['UserID_index'].map(dict1)
nrecs['asin'] = nrecs['ProductID_index'].map(dict2)
nrecs = nrecs.sort_values('reviewerID')
nrecs.reset_index(drop = True, inplace = True)

# .copy() makes `new` an independent frame; assigning a column to a bare
# slice of `nrecs` is what triggered pandas' SettingWithCopyWarning.
new = nrecs[['reviewerID', 'asin', 'Rating']].copy()
new['recommendations'] = list(zip(new.asin, new.Rating))
res = new[['reviewerID', 'recommendations']]

# One row per reviewer holding the list of (asin, predicted rating) tuples.
res_new = res['recommendations'].groupby([res.reviewerID])\
                                .apply(list).reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [None]:
# One row per reviewer with their list of (asin, predicted rating) pairs.
res_new

Unnamed: 0,reviewerID,recommendations
0,A00414041RD0BXM6WK0GX,"[(B00HFP4JZU, 4.942602634429932), (B000EVGOS2,..."
1,A00473363TJ8YSZ3YAGG9,"[(B003JMYYQW, 4.40961217880249), (B0016KTQ0I, ..."
2,A00700212KB3K0MVESPIY,"[(B007L5P7YQ, 6.5778093338012695), (B00161IKD6..."
3,A0078719IR14X3NNUG0F,"[(B000VCN6KI, 7.923923492431641), (B000VOHH56,..."
4,A01198201H0E3GHV2Z17I,"[(B000TUB4BU, 6.999694347381592), (B007L8QLU2,..."
...,...,...
22355,AZZNK89PXD006,"[(B00464EC1E, 3.4921135902404785), (B00381A7OW..."
22356,AZZQXL8VDCFTV,"[(B000ZECIDS, 5.694683074951172), (B0029OIWNC,..."
22357,AZZT1ERHBSNQ8,"[(B001FO2GW0, 6.32636833190918), (B00381A7OW, ..."
22358,AZZU6NXB8YJN9,"[(B00HHECHLC, 5.361822605133057), (B0055HYT78,..."
