#### Dataset: Musical_Instruments_5.json
#### Requirement:
* Read dataset
* Pre-processing data
* Use "asin" (ProductID), "reviewerID" and "overall" (the rating each user gave a product) to build a model that predicts ratings, then use it to give recommendations to users.

In [None]:
from IPython.core.display import HTML
# Inject a CSS rule so that <pre> output in the notebook keeps its whitespace
# instead of wrapping (keeps wide tables on a single line per row).
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [None]:
import findspark
# Called before the pyspark imports below — presumably to locate the local
# Spark installation and make pyspark importable (confirm for your setup).
findspark.init()

In [None]:
import pandas as pd

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, isnull, when, count

from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# Start the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('Recommendation_system')
    .getOrCreate()
)

In [None]:
# Load the review data into a Spark DataFrame.
# The file name matches the dataset named in the notebook header
# (Musical_Instruments_5.json); the previous path dropped the "s".
data = spark.read.json("../../Data/Musical_Instruments_5.json")

In [None]:
# Preview the first five raw rows (long cell values are truncated).
data.show(n=5, truncate=True)

In [None]:
# Keep only the three columns the recommender uses: product, rating, user.
data_sub = data.select('asin', 'overall', 'reviewerID')

In [None]:
# Number of rows in the three-column subset.
data_sub.count()

In [None]:
# Preview the first five rows of the selected columns.
data_sub.show(n=5, truncate=True)

In [None]:
# Count the null values in every column of data_sub.
# `when(isnull(c), c)` yields a non-null literal only for null rows, so
# `count` (which skips nulls) ends up counting exactly the null rows.
null_counts = [
    count(when(isnull(c), c)).alias(c)
    for c in data_sub.columns
]
# Transpose so each column name becomes a row in the displayed table.
data_sub.select(null_counts).toPandas().T

In [None]:
# Distinct users and products

In [None]:
# Number of distinct reviewers and distinct products in the data.
# (Fixed: the DataFrame method is `distinct()`, not `distict()`.)
users = data_sub.select("reviewerID").distinct().count()
products = data_sub.select("asin").distinct().count()
# Total number of ratings, i.e. the filled cells of the user x product matrix.
# The name `numberator` is kept because later cells refer to it.
numberator = data_sub.count()

In [None]:
# Show the total number of ratings, then the distinct user and product counts.
display(numberator, users, products)

In [None]:
# Number of ratings the user x product matrix would hold with no empty cells.
# (Fixed: the count of distinct reviewers is stored in `users`, not `user`.)
denominator = users * products
denominator

In [None]:
# Sparsity = share of user/product pairs that have no rating.
filled_fraction = float(numberator) / denominator
sparsity = 1 - filled_fraction
print("Sparsity:", sparsity)

In [None]:
# ALS needs numeric item IDs, so encode the string product ID 'asin'
# as a numeric index in a new column 'asin_idx'.
indexer = StringIndexer(
    inputCol='asin',
    outputCol='asin_idx',
)
# Fitting learns the set of distinct product IDs present in the data.
indexer_model = indexer.fit(data_sub)
# Transforming appends the 'asin_idx' column to the original columns.
data_indexed = indexer_model.transform(data_sub)

In [None]:
# Repeat the process for the other categorical feature: encode the string
# 'reviewerID' as a numeric index in 'reviewerID_idx'.
indexer1 = StringIndexer(inputCol='reviewerID', outputCol='reviewerID_idx')
indexer1_model = indexer1.fit(data_sub)
# Transform the already-indexed frame, not data_sub. Transforming data_sub
# would overwrite data_indexed and drop the 'asin_idx' column that ALS and
# the later ID look-up both require.
data_indexed = indexer1_model.transform(data_indexed)

In [None]:
# Preview the first five rows, now including the numeric index column(s).
data_indexed.show(n=5, truncate=True)

In [None]:
# Re-run the per-column null count on the indexed frame that is about to be
# split and fed to ALS. data_sub was already checked and has not changed,
# so checking it again would tell us nothing new.
data_indexed.select(
    [count(when(isnull(c), c)).alias(c) for c in data_indexed.columns]
).toPandas().T

In [None]:
# The dataset is small, so use an 80% / 20% train/test split.
# A fixed seed keeps the split (and therefore the reported RMSE) reproducible
# across notebook runs.
training, test = data_indexed.randomSplit([0.8, 0.2], seed=42)

In [None]:
# Configure the ALS collaborative-filtering estimator on the indexed columns.
als = ALS(
    userCol="reviewerID_idx",
    itemCol="asin_idx",
    ratingCol="overall",
    rank=25,
    maxIter=5,
    regParam=0.09,
    nonnegative=True,
    # Drop rows whose prediction is NaN so evaluation metrics stay defined.
    coldStartStrategy="drop",
)
# Fit the factorization on the training split only.
model = als.fit(training)

In [None]:
# Generate predicted ratings for the held-out test split.
# The RMSE itself is computed from these predictions in a later cell.
predictions = model.transform(test)

In [None]:
# Compare actual ratings ("overall") with predicted ones for a few rows.
predictions.select("asin_idx", "reviewerID_idx", "overall", "prediction").show(n=5)

In [None]:
# Root-mean-squared error between the true rating and the ALS prediction.
evaluator = RegressionEvaluator(
    labelCol="overall",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print("Root mean squared error =", str(rmse))

### Providing Recommendations: for all users

In [None]:
# For every user, get the 20 products with the highest predicted rating.
user_recs = model.recommendForAllUsers(20)

In [None]:
# Print the recommendation rows of the first five users, each followed by
# two blank lines.
for user in user_recs.head(5):
    print(user, end="\n\n\n")

### Converting back to string form

In [None]:
# Top-10 recommendations per user, collected into pandas.
# Each row holds a user index and a list of (asin_idx, rating) entries.
recs = model.recommendForAllUsers(10).toPandas()

# Flatten to one row per (user, recommendation).
# The chain is wrapped in parentheses so the multi-line form is valid Python.
nrecs = (
    # Spread the list into one column per recommendation slot.
    recs.recommendations.apply(pd.Series)
    # Re-attach the user index by row position.
    .merge(recs, right_index=True, left_index=True)
    # The list column must be dropped BEFORE melting; afterwards it no
    # longer exists as a column.
    .drop(["recommendations"], axis=1)
    # Stack the slot columns into a single "recommendation" column.
    .melt(id_vars=["reviewerID_idx"], value_name="recommendation")
    # The slot number produced by melt is not needed.
    .drop(["variable"], axis=1)
    # Users with fewer than 10 recommendations leave empty slots.
    .dropna()
)
nrecs = nrecs.sort_values('reviewerID_idx')

# Split each (asin_idx, rating) entry into two columns and keep the user index.
# The column is named "recommendation" (singular), matching value_name above.
nrecs = pd.concat([nrecs["recommendation"].apply(pd.Series),
                   nrecs["reviewerID_idx"]], axis=1)
nrecs.columns = ['ProductID_index', 'Rating', 'UserID_index']

In [None]:
# Build look-up tables from the numeric indices back to the original IDs.
md = data_indexed.select(['reviewerID', 'reviewerID_idx', 'asin', 'asin_idx'])
md = md.toPandas()
dict1 = dict(zip(md['reviewerID_idx'], md['reviewerID']))  # user index -> reviewerID
dict2 = dict(zip(md['asin_idx'], md['asin']))              # product index -> asin

# Translate the indexed recommendations back to string IDs.
nrecs['reviewerID'] = nrecs['UserID_index'].map(dict1)
nrecs['asin'] = nrecs['ProductID_index'].map(dict2)
# Fixed: the pandas method is `sort_values`, not `sortValues`.
nrecs = nrecs.sort_values('reviewerID')
nrecs.reset_index(drop=True, inplace=True)

# Fixed: the column is 'reviewerID' (capital "ID").
# `.copy()` makes the column assignment below act on an independent frame
# instead of a slice of nrecs.
new = nrecs[['reviewerID', 'asin', 'Rating']].copy()
new['recommendations'] = list(zip(new.asin, new.Rating))
res = new[['reviewerID', 'recommendations']]

# One row per reviewer with the list of (asin, predicted rating) pairs.
# Fixed: group by the actual 'reviewerID' column rather than the literal
# string 'res.reviewerId'.
res_new = res.groupby('reviewerID')['recommendations'].apply(list).reset_index()

In [None]:
# Display the final table: one row per reviewerID with its list of
# (asin, Rating) recommendations.
res_new