In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col 
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

Load dataset

In [3]:
# Start (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName('recommender')
    .getOrCreate()
)

# Load the raw review records and preview them.
df = spark.read.json('movies.json')
df.show()

+-----------+----------+--------------------+--------------------+-----+--------------------+----------+--------------+
|helpfulness|product_id|        profile_name|              review|score|             summary|      time|       user_id|
+-----------+----------+--------------------+--------------------+-----+--------------------+----------+--------------+
|        7/7|B003AI2VGA|Brian E. Erland "...|Synopsis: On the ...|  3.0|"There Is So Much...|1182729600|A141HP4LYPWMSR|
|        4/4|B003AI2VGA|          Grady Harp|THE VIRGIN OF JUA...|  3.0|Worthwhile and Im...|1181952000|A328S9RN3U5M68|
|       8/10|B003AI2VGA|Chrissy K. McVay ...|The scenes in thi...|  5.0|This movie needed...|1164844800|A1I7QGUDP043DG|
|        1/1|B003AI2VGA|        golgotha.gov|THE VIRGIN OF JUA...|  3.0|distantly based o...|1197158400|A1M5405JH9THP9|
|        1/1|B003AI2VGA|KerrLines "&#34;M...|Informationally, ...|  3.0|"What's going on ...|1188345600| ATXL536YX71TR|
|        0/0|B003AI2VGA|abra "a devoted 

In [4]:
# Keep only the fields the recommender uses: the user, the product, and the score given.
df_ratings = df.select(col('user_id'), col('product_id'), col('score'))
df_ratings.show(n=5)

+--------------+----------+-----+
|       user_id|product_id|score|
+--------------+----------+-----+
|A141HP4LYPWMSR|B003AI2VGA|  3.0|
|A328S9RN3U5M68|B003AI2VGA|  3.0|
|A1I7QGUDP043DG|B003AI2VGA|  5.0|
|A1M5405JH9THP9|B003AI2VGA|  3.0|
| ATXL536YX71TR|B003AI2VGA|  3.0|
+--------------+----------+-----+
only showing top 5 rows



In [13]:
# Fixed seed so the train/test split, the ALS factor initialisation and hence
# the reported RMSE are reproducible across Restart & Run All.
SEED = 42

# The ids in df_ratings are strings; map each id column to a numeric
# '<name>_indexed' column that ALS can consume.
id_columns = ['user_id', 'product_id']
pipeline = Pipeline(stages=[
    StringIndexer(inputCol=col_name, outputCol=col_name + '_indexed')
    for col_name in id_columns
])

# Fit every indexer exactly once through the pipeline, then expose the fitted
# per-column models under the original `indexers` name.
indexer_model = pipeline.fit(df_ratings)
indexers = indexer_model.stages
df_indexed = indexer_model.transform(df_ratings)
df_indexed.show(5)

# 90/10 hold-out split.
train, test = df_indexed.randomSplit([0.9, 0.1], seed=SEED)

# NOTE(review): with these settings predictions can fall outside the range of
# the observed scores (e.g. negative values); consider tuning regParam or
# setting nonnegative=True -- left unchanged here to keep the model as is.
als = ALS(userCol='user_id_indexed',
          itemCol='product_id_indexed',
          ratingCol='score',
          rank=10,
          regParam=0.01,
          # Drop rows whose prediction is NaN so the RMSE stays finite.
          coldStartStrategy='drop',
          seed=SEED)
model = als.fit(train)

# Score the held-out rows and report the root-mean-square error.
preds = model.transform(test)
preds.show(5)
evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='score', metricName='rmse')
rmse = evaluator.evaluate(preds)
print(f"RMSE: {rmse}")

+--------------+----------+-----+---------------+------------------+
|       user_id|product_id|score|user_id_indexed|product_id_indexed|
+--------------+----------+-----+---------------+------------------+
|A141HP4LYPWMSR|B003AI2VGA|  3.0|           32.0|             731.0|
|A328S9RN3U5M68|B003AI2VGA|  3.0|            3.0|             731.0|
|A1I7QGUDP043DG|B003AI2VGA|  5.0|          312.0|             731.0|
|A1M5405JH9THP9|B003AI2VGA|  3.0|        10917.0|             731.0|
| ATXL536YX71TR|B003AI2VGA|  3.0|          173.0|             731.0|
+--------------+----------+-----+---------------+------------------+
only showing top 5 rows

+--------------+----------+-----+---------------+------------------+----------+
|       user_id|product_id|score|user_id_indexed|product_id_indexed|prediction|
+--------------+----------+-----+---------------+------------------+----------+
|A1OS2HX2CWQIC7|6303257933|  4.0|          833.0|              83.0|-0.8053398|
|A1V3TRGWOMA8LC|B002OHDRF2|  1.0| 

In [14]:
# Top 5 recommended items for every user known to the model.
# Ids in the result are the indexed ids the model was trained on, not the original strings.
userRecs = model.recommendForAllUsers(numItems=5)
userRecs.show(n=20)

+---------------+--------------------+
|user_id_indexed|     recommendations|
+---------------+--------------------+
|             31|[{513, 27.609352}...|
|             53|[{853, 32.8179}, ...|
|             65|[{700, 40.973354}...|
|             78|[{739, 24.476349}...|
|             85|[{822, 22.404781}...|
|            133|[{776, 18.434097}...|
|            137|[{586, 52.78732},...|
|            148|[{447, 18.512798}...|
|            243|[{340, 23.251925}...|
|            251|[{413, 25.723671}...|
|            255|[{739, 30.160769}...|
|            296|[{739, 40.93587},...|
|            321|[{580, 14.818803}...|
|            322|[{1205, 14.878865...|
|            362|[{547, 20.989742}...|
|            392|[{693, 28.349457}...|
|            451|[{1205, 26.252876...|
|            458|[{652, 15.460415}...|
|            463|[{678, 14.186287}...|
|            471|[{855, 19.744804}...|
+---------------+--------------------+
only showing top 20 rows



In [15]:
# Top 5 recommended users for every item known to the model.
# Ids in the result are the indexed ids the model was trained on, not the original strings.
itemRecs = model.recommendForAllItems(numUsers=5)
itemRecs.show(n=20)

+------------------+--------------------+
|product_id_indexed|     recommendations|
+------------------+--------------------+
|                 1|[{195, 8.66188}, ...|
|                 3|[{318, 6.4674683}...|
|                 5|[{218, 8.274359},...|
|                 6|[{304, 4.9963703}...|
|                 9|[{218, 8.599127},...|
|                12|[{137, 14.832996}...|
|                13|[{116, 8.715103},...|
|                15|[{318, 7.869201},...|
|                16|[{183, 8.336919},...|
|                17|[{261, 8.857319},...|
|                19|[{183, 8.354104},...|
|                20|[{43, 6.2594366},...|
|                22|[{366, 8.1045685}...|
|                26|[{42, 7.89459}, {...|
|                27|[{196, 12.635218}...|
|                28|[{265, 6.9876404}...|
|                31|[{137, 8.740199},...|
|                34|[{65, 10.110375},...|
|                37|[{318, 8.333307},...|
|                40|[{257, 11.876069}...|
+------------------+--------------