#### Dataset: Beauty_5.json
#### Requirement:
* Read dataset
* Pre-processing data
* Use "asin" (ProductID), "reviewerID" and overall (User's reviews for each product - rating) to build model to predict overalls => Give recommendation for users.

In [1]:
from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [2]:
import findspark
findspark.init()

In [3]:
import pandas as pd

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, isnull, when, count

from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [5]:
spark = SparkSession.builder.appName('Recommendation_system').getOrCreate()

In [6]:
# Load data
data = spark.read.json("../../Data/Beauty_5.json")

In [7]:
data.show(5, truncate=True)

+----------+-------+-------+--------------------+-----------+--------------+------------+--------------------+--------------+
|      asin|helpful|overall|          reviewText| reviewTime|    reviewerID|reviewerName|             summary|unixReviewTime|
+----------+-------+-------+--------------------+-----------+--------------+------------+--------------------+--------------+
|7806397051| [3, 4]|    1.0|Very oily and cre...|01 30, 2014|A1YJEY40YUW4SE|      Andrea|Don't waste your ...|    1391040000|
|7806397051| [1, 1]|    3.0|This palette was ...|04 18, 2014| A60XNB876KYML|  Jessica H.|         OK Palette!|    1397779200|
|7806397051| [0, 1]|    4.0|The texture of th...| 09 6, 2013|A3G6XNM240RMWA|       Karen|       great quality|    1378425600|
|7806397051| [2, 2]|    2.0|I really can't te...| 12 8, 2013|A1PQFP6SAJ6D80|       Norah|Do not work on my...|    1386460800|
|7806397051| [0, 0]|    3.0|It was a little s...|10 19, 2013|A38FVHZTNQ271F|   Nova Amor|          It's okay.|    1382

In [8]:
data_sub = data.select(['asin', 'overall', 'reviewerID'])

In [9]:
data_sub.count()

198502

In [10]:
data_sub.show(5, truncate=True)

+----------+-------+--------------+
|      asin|overall|    reviewerID|
+----------+-------+--------------+
|7806397051|    1.0|A1YJEY40YUW4SE|
|7806397051|    3.0| A60XNB876KYML|
|7806397051|    4.0|A3G6XNM240RMWA|
|7806397051|    2.0|A1PQFP6SAJ6D80|
|7806397051|    3.0|A38FVHZTNQ271F|
+----------+-------+--------------+
only showing top 5 rows



In [11]:
data_sub.select([count(when(isnull(c), c)).alias(c) for c in data_sub.columns]).toPandas().T

Unnamed: 0,0
asin,0
overall,0
reviewerID,0


In [12]:
# District users and movies

In [13]:
users = data_sub.select("reviewerID").distinct().count()
products = data_sub.select("asin").distinct().count()
numberator = data_sub.count()

In [14]:
display(numberator, users, products)

198502

22363

12101

In [15]:
# Number of rating matrix could contain if no empy cells
denominator = users * products
denominator

270614663

In [16]:
# Calculating sparsity
sparsity = 1 - (numberator * 1.0 / denominator)
print("Sparsity:", sparsity)

Sparsity: 0.9992664772935825


In [17]:
# Create an indexer
indexer = StringIndexer(inputCol='asin', outputCol='asin_idx')
# Indexer identifies categories in the data
indexer_model = indexer.fit(data_sub)
# Indexer create a new column with numeric index values
data_indexed = indexer_model.transform(data_sub)

In [18]:
# Repeat the process for the other categorical feature
indexer1 = StringIndexer(inputCol='reviewerID', outputCol='reviewerID_idx')
indexer1_model = indexer1.fit(data_sub)
data_indexed = indexer1_model.transform(data_indexed)

In [19]:
data_indexed.show(5, truncate=True)

+----------+-------+--------------+--------+--------------+
|      asin|overall|    reviewerID|asin_idx|reviewerID_idx|
+----------+-------+--------------+--------+--------------+
|7806397051|    1.0|A1YJEY40YUW4SE|  6194.0|       16983.0|
|7806397051|    3.0| A60XNB876KYML|  6194.0|       10399.0|
|7806397051|    4.0|A3G6XNM240RMWA|  6194.0|        5985.0|
|7806397051|    2.0|A1PQFP6SAJ6D80|  6194.0|       11765.0|
|7806397051|    3.0|A38FVHZTNQ271F|  6194.0|        5910.0|
+----------+-------+--------------+--------+--------------+
only showing top 5 rows



In [20]:
data_sub.select([count(when(isnull(c), c)).alias(c) for c in data_sub.columns]).toPandas().T

Unnamed: 0,0
asin,0
overall,0
reviewerID,0


In [21]:
# Smaller dataset so we will use 0.8/0.2
training, test = data_indexed.randomSplit([0.8, 0.2])

In [22]:
als = ALS(maxIter=5, regParam=0.09, rank=25,
          userCol="reviewerID_idx", itemCol="asin_idx", ratingCol="overall",
          coldStartStrategy="drop", nonnegative=True)
model = als.fit(training)

In [23]:
# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)

In [24]:
predictions.select(["asin_idx", "reviewerID_idx", "overall", "prediction"]).show(5)

+--------+--------------+-------+----------+
|asin_idx|reviewerID_idx|overall|prediction|
+--------+--------------+-------+----------+
|   148.0|          31.0|    2.0|  3.669877|
|   148.0|          34.0|    5.0| 3.3353481|
|   148.0|        6288.0|    3.0| 1.6924319|
|   148.0|          27.0|    5.0| 4.3108854|
|   148.0|           1.0|    5.0| 3.6594493|
+--------+--------------+-------+----------+
only showing top 5 rows



In [25]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="overall", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root mean squared error =", str(rmse))

Root mean squared error = 1.3395595305029961


### Providing Recommendations: for all users

In [26]:
# Get 20 recommendations which have highest rating
user_recs = model.recommendForAllUsers(20)

In [27]:
for user in user_recs.head(5):
    print(user)
    print("\n")

Row(reviewerID_idx=1580, recommendations=[Row(asin_idx=11293, rating=5.867204666137695), Row(asin_idx=5561, rating=5.8324689865112305), Row(asin_idx=4606, rating=5.80586051940918), Row(asin_idx=10231, rating=5.701236248016357), Row(asin_idx=4896, rating=5.68094539642334), Row(asin_idx=9900, rating=5.666428565979004), Row(asin_idx=7372, rating=5.644848346710205), Row(asin_idx=8498, rating=5.616497039794922), Row(asin_idx=8172, rating=5.5590996742248535), Row(asin_idx=9657, rating=5.503898620605469), Row(asin_idx=7098, rating=5.494077205657959), Row(asin_idx=11166, rating=5.484093189239502), Row(asin_idx=8408, rating=5.4612860679626465), Row(asin_idx=5604, rating=5.460039138793945), Row(asin_idx=2810, rating=5.449482440948486), Row(asin_idx=6104, rating=5.441318511962891), Row(asin_idx=4073, rating=5.438269138336182), Row(asin_idx=11233, rating=5.4266676902771), Row(asin_idx=8962, rating=5.420121192932129), Row(asin_idx=4649, rating=5.410305976867676)])


Row(reviewerID_idx=4900, recomme

### Converting back to string form

In [28]:
recs = model.recommendForAllUsers(10).toPandas()
nrecs = recs['recommendations'].apply(pd.Series).merge(recs, right_index=True, left_index=True).drop(["recommendations"], axis=1).melt(id_vars=["reviewerID_idx"], value_name="recommendation").drop(["variable"], axis=1).dropna()
nrecs = nrecs.sort_values('reviewerID_idx')
nrecs = pd.concat([nrecs["recommendation"].apply(pd.Series),
                   nrecs["reviewerID_idx"]], axis=1)
nrecs.columns = ['ProductID_index', 'Rating', 'UserID_index']

In [29]:
nrecs

Unnamed: 0,ProductID_index,Rating,UserID_index
88494,11407.0,6.288699,0
155580,6077.0,6.166509,0
66132,11166.0,6.325033,0
133218,9981.0,6.197639,0
222666,7780.0,6.043042,0
...,...,...,...
152129,10564.0,5.987124,22362
129767,12069.0,6.164086,22362
219215,9900.0,5.398891,22362
62681,12067.0,6.164173,22362


In [30]:
md = data_indexed.select(['reviewerID', 'reviewerID_idx', 'asin', 'asin_idx'])
md = md.toPandas()
dict1 = dict(zip(md['reviewerID_idx'], md['reviewerID']))
dict2 = dict(zip(md['asin_idx'], md['asin']))
nrecs['reviewerID'] = nrecs['UserID_index'].map(dict1)
nrecs['asin'] = nrecs['ProductID_index'].map(dict2)
nrecs = nrecs.sort_values('reviewerID')
nrecs.reset_index(drop=True, inplace=True)
new = nrecs[['reviewerID', 'asin', 'Rating']]
new['recommendations'] = list(zip(new.asin, new.Rating))
res = new[['reviewerID', 'recommendations']]
res_new = res['recommendations'].groupby(res['reviewerID']).apply(list).reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new['recommendations'] = list(zip(new.asin, new.Rating))


In [31]:
res_new

Unnamed: 0,reviewerID,recommendations
0,A00414041RD0BXM6WK0GX,"(B00HFP4JZU, 4.88042688369751)"
1,A00414041RD0BXM6WK0GX,"(B00GRTQBTM, 4.794581890106201)"
2,A00414041RD0BXM6WK0GX,"(B00319V9BG, 4.843811988830566)"
3,A00414041RD0BXM6WK0GX,"(B0009MMK5M, 4.4037885665893555)"
4,A00414041RD0BXM6WK0GX,"(B0002X4F0Q, 4.534732818603516)"
...,...,...
223615,AZZZLM1E5JJ8C,"(B003Y69DJG, 4.763320446014404)"
223616,AZZZLM1E5JJ8C,"(B00ANTO6YO, 4.809251308441162)"
223617,AZZZLM1E5JJ8C,"(B000X1LOJM, 5.338888645172119)"
223618,AZZZLM1E5JJ8C,"(B003WG3AQ0, 5.041476249694824)"
