#### Dataset: Musical_Instruments_5.json
#### Requirement:
* Read dataset
* Pre-processing data
* Use "asin" (ProductID), "reviewerID" and overall (User's reviews for each product - rating) to build model to predict overalls => Give recommendation for users.

In [1]:
from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [2]:
import findspark
findspark.init()

In [3]:
import pandas as pd

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, isnull, when, count

from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [5]:
spark = SparkSession.builder.appName('Recommendation_system').getOrCreate()

In [6]:
# Load data
data = spark.read.json("../../Data/Musical_Instruments_5.json")

In [7]:
data.show(5, truncate=True)

+----------+--------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
|      asin| helpful|overall|          reviewText| reviewTime|    reviewerID|        reviewerName|             summary|unixReviewTime|
+----------+--------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
|1384719342|  [0, 0]|    5.0|Not much to write...|02 28, 2014|A2IBPI20UZIR0U|cassandra tu "Yea...|                good|    1393545600|
|1384719342|[13, 14]|    5.0|The product does ...|03 16, 2013|A14VAT5EAX3D9S|                Jake|                Jake|    1363392000|
|1384719342|  [1, 1]|    5.0|The primary job o...|08 28, 2013|A195EZSQDW3E21|Rick Bennette "Ri...|It Does The Job Well|    1377648000|
|1384719342|  [0, 0]|    5.0|Nice windscreen p...|02 14, 2014|A2C00NNG1ZQQG2|RustyBill "Sunday...|GOOD WINDSCREEN F...|    1392336000|
|1384719342|  [0, 0]|    5.0|This pop filter i...|02 21

In [8]:
data_sub = data.select(['asin', 'overall', 'reviewerID'])

In [9]:
data_sub.count()

10261

In [10]:
data_sub.show(5, truncate=True)

+----------+-------+--------------+
|      asin|overall|    reviewerID|
+----------+-------+--------------+
|1384719342|    5.0|A2IBPI20UZIR0U|
|1384719342|    5.0|A14VAT5EAX3D9S|
|1384719342|    5.0|A195EZSQDW3E21|
|1384719342|    5.0|A2C00NNG1ZQQG2|
|1384719342|    5.0| A94QU4C90B1AX|
+----------+-------+--------------+
only showing top 5 rows



In [11]:
data_sub.select([count(when(isnull(c), c)).alias(c) for c in data_sub.columns]).toPandas().T

Unnamed: 0,0
asin,0
overall,0
reviewerID,0


In [12]:
# District users and movies
users = data_sub.select("reviewerID").distinct().count()
products = data_sub.select("asin").distinct().count()
numberator = data_sub.count()

In [13]:
display(numberator, users, products)

10261

1429

900

In [14]:
# Number of rating matrix could contain if no empy cells
denominator = users * products
denominator

1286100

In [15]:
# Calculating sparsity
sparsity = 1 - (numberator * 1.0 / denominator)
print("Sparsity:", sparsity)

Sparsity: 0.992021615737501


In [16]:
# Create an indexer
indexer = StringIndexer(inputCol='asin', outputCol='asin_idx')
# Indexer identifies categories in the data
indexer_model = indexer.fit(data_sub)
# Indexer create a new column with numeric index values
data_indexed = indexer_model.transform(data_sub)

In [17]:
# Repeat the process for the other categorical feature
indexer1 = StringIndexer(inputCol='reviewerID', outputCol='reviewerID_idx')
indexer1_model = indexer1.fit(data_sub)
data_indexed = indexer1_model.transform(data_indexed)

In [18]:
data_indexed.show(5, truncate=True)

+----------+-------+--------------+--------+--------------+
|      asin|overall|    reviewerID|asin_idx|reviewerID_idx|
+----------+-------+--------------+--------+--------------+
|1384719342|    5.0|A2IBPI20UZIR0U|   703.0|          66.0|
|1384719342|    5.0|A14VAT5EAX3D9S|   703.0|         266.0|
|1384719342|    5.0|A195EZSQDW3E21|   703.0|         395.0|
|1384719342|    5.0|A2C00NNG1ZQQG2|   703.0|        1048.0|
|1384719342|    5.0| A94QU4C90B1AX|   703.0|        1311.0|
+----------+-------+--------------+--------+--------------+
only showing top 5 rows



In [19]:
data_sub.select([count(when(isnull(c), c)).alias(c) for c in data_sub.columns]).toPandas().T

Unnamed: 0,0
asin,0
overall,0
reviewerID,0


In [20]:
# Smaller dataset so we will use 0.8/0.2
training, test = data_indexed.randomSplit([0.8, 0.2])

In [21]:
als = ALS(maxIter=5, regParam=0.09, rank=25,
          userCol="reviewerID_idx", itemCol="asin_idx", ratingCol="overall",
          coldStartStrategy="drop", nonnegative=True)
model = als.fit(training)

In [22]:
# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)

In [23]:
predictions.select(["asin_idx", "reviewerID_idx", "overall", "prediction"]).show(5)

+--------+--------------+-------+----------+
|asin_idx|reviewerID_idx|overall|prediction|
+--------+--------------+-------+----------+
|   148.0|         258.0|    5.0| 4.8357267|
|   148.0|        1181.0|    5.0| 3.9716198|
|   148.0|        1117.0|    5.0| 4.1916695|
|   496.0|        1208.0|    4.0| 4.1645975|
|   496.0|           3.0|    5.0| 4.1181383|
+--------+--------------+-------+----------+
only showing top 5 rows



In [24]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="overall", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root mean squared error =", str(rmse))

Root mean squared error = 1.2170430053970474


### Providing Recommendations: for all users

In [25]:
# Get 20 recommendations which have highest rating
user_recs = model.recommendForAllUsers(20)

In [26]:
for user in user_recs.head(5):
    print(user)
    print("\n")

Row(reviewerID_idx=471, recommendations=[Row(asin_idx=634, rating=5.561305522918701), Row(asin_idx=870, rating=5.550746917724609), Row(asin_idx=620, rating=5.461918354034424), Row(asin_idx=838, rating=5.454563140869141), Row(asin_idx=48, rating=5.441943168640137), Row(asin_idx=881, rating=5.432235240936279), Row(asin_idx=610, rating=5.429994583129883), Row(asin_idx=886, rating=5.428886413574219), Row(asin_idx=733, rating=5.380276203155518), Row(asin_idx=669, rating=5.374698162078857), Row(asin_idx=748, rating=5.291914939880371), Row(asin_idx=829, rating=5.263034820556641), Row(asin_idx=503, rating=5.250359058380127), Row(asin_idx=202, rating=5.247001647949219), Row(asin_idx=703, rating=5.223143100738525), Row(asin_idx=218, rating=5.212152481079102), Row(asin_idx=878, rating=5.203555107116699), Row(asin_idx=410, rating=5.1761884689331055), Row(asin_idx=485, rating=5.174846649169922), Row(asin_idx=584, rating=5.167128086090088)])


Row(reviewerID_idx=1342, recommendations=[Row(asin_idx=5

### Converting back to string form

In [27]:
recs = model.recommendForAllUsers(10).toPandas()
nrecs = recs['recommendations'].apply(pd.Series).merge(recs, right_index=True, left_index=True).drop(["recommendations"], axis=1).melt(id_vars=["reviewerID_idx"], value_name="recommendation").drop(["variable"], axis=1).dropna()
nrecs = nrecs.sort_values('reviewerID_idx')
nrecs = pd.concat([nrecs["recommendation"].apply(pd.Series),
                   nrecs["reviewerID_idx"]], axis=1)
nrecs.columns = ['ProductID_index', 'Rating', 'UserID_index']

In [30]:
md = data_indexed.select(['reviewerID', 'reviewerID_idx', 'asin', 'asin_idx'])
md = md.toPandas()
dict1 = dict(zip(md['reviewerID_idx'], md['reviewerID']))
dict2 = dict(zip(md['asin_idx'], md['asin']))
nrecs['reviewerID'] = nrecs['UserID_index'].map(dict1)
nrecs['asin'] = nrecs['ProductID_index'].map(dict2)
nrecs = nrecs.sort_values('reviewerID')
nrecs.reset_index(drop=True, inplace=True)
new = nrecs[['reviewerID', 'asin', 'Rating']]
new['recommendations'] = list(zip(new.asin, new.Rating))
res = new[['reviewerID', 'recommendations']]
res_new = res['recommendations'].groupby(res['reviewerID']).apply(list).reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new['recommendations'] = list(zip(new.asin, new.Rating))


In [31]:
res_new

Unnamed: 0,reviewerID,recommendations
0,A00625243BI8W1SSZNLMD,"[(B0009EU01G, 5.520554542541504), (B000RYPN38,..."
1,A10044ECXDUVKS,"[(B000SZVYLQ, 5.274088382720947), (B0002GXRF2,..."
2,A102MU6ZC9H1N6,"[(B000RYPN38, 5.831742286682129), (B004FODY6E,..."
3,A109JTUZXO61UY,"[(B0002GJ3ZA, 6.912607192993164), (B0002GXRF2,..."
4,A109ME7C09HM2M,"[(B0002GXRF2, 5.714378356933594), (B003S3S0DU,..."
...,...,...
1423,AZJPNK73JF3XP,"[(B001C9R5P6, 6.2771501541137695), (B0002M728Y..."
1424,AZMHABTPXVLG3,"[(B000W00X1Y, 4.890178680419922), (B000XPRSTI,..."
1425,AZMIKIG4BB6BZ,"[(B0002GJ3ZA, 6.146444797515869), (B001C9R5P6,..."
1426,AZPDO6FLSMLFP,"[(B001NXDSK2, 5.151345252990723), (B0002D0CA8,..."
