In [10]:
from pyspark.sql import SparkSession 
from pyspark.ml.evaluation import RegressionEvaluator 
from pyspark.ml.recommendation import ALS 
# Reuse the active Spark session if there is one; otherwise start a new one
# under the application name 'Recommender'.
spark = (
    SparkSession.builder
    .appName('Recommender')
    .getOrCreate()
)

In [11]:
# Load the review records from 'movies.json' into a Spark DataFrame. No schema
# is passed, so column names and types are whatever the JSON reader infers.
# NOTE(review): the path is relative - confirm where the session resolves it.
movies_df = spark.read.json('movies.json') 

In [12]:
# Preview the loaded reviews: first 20 rows, long cell values truncated
# (these are the defaults of DataFrame.show, spelled out).
movies_df.show(n=20, truncate=True)

+-----------+----------+--------------------+--------------------+-----+--------------------+----------+--------------+
|helpfulness|product_id|        profile_name|              review|score|             summary|      time|       user_id|
+-----------+----------+--------------------+--------------------+-----+--------------------+----------+--------------+
|        7/7|B003AI2VGA|Brian E. Erland "...|Synopsis: On the ...|  3.0|"There Is So Much...|1182729600|A141HP4LYPWMSR|
|        4/4|B003AI2VGA|          Grady Harp|THE VIRGIN OF JUA...|  3.0|Worthwhile and Im...|1181952000|A328S9RN3U5M68|
|       8/10|B003AI2VGA|Chrissy K. McVay ...|The scenes in thi...|  5.0|This movie needed...|1164844800|A1I7QGUDP043DG|
|        1/1|B003AI2VGA|        golgotha.gov|THE VIRGIN OF JUA...|  3.0|distantly based o...|1197158400|A1M5405JH9THP9|
|        1/1|B003AI2VGA|KerrLines "&#34;M...|Informationally, ...|  3.0|"What's going on ...|1188345600| ATXL536YX71TR|
|        0/0|B003AI2VGA|abra "a devoted 

In [13]:
# Display the schema of the loaded DataFrame (column names, types, nullability)
# to check what the JSON reader produced before building features on it.
movies_df.schema

StructType([StructField('helpfulness', StringType(), True), StructField('product_id', StringType(), True), StructField('profile_name', StringType(), True), StructField('review', StringType(), True), StructField('score', DoubleType(), True), StructField('summary', StringType(), True), StructField('time', LongType(), True), StructField('user_id', StringType(), True)])

In [14]:
from pyspark.ml.feature import StringIndexer

# ALS is configured with numeric user/item columns, while the raw ids are
# strings, so map each distinct id to a numeric index.
#
# This cell reassigns movies_df, so make it safe to re-run: StringIndexer
# raises if its output column already exists, which is exactly the state a
# previous run of this cell leaves behind. drop() ignores columns that are
# not present, so on a fresh kernel this line is a no-op.
movies_df = movies_df.drop("users", "products")

# user_id -> users
indexer1 = StringIndexer(inputCol="user_id", outputCol="users")
indexerModel1 = indexer1.fit(movies_df)
movies_df = indexerModel1.transform(movies_df)

# product_id -> products
indexer2 = StringIndexer(inputCol="product_id", outputCol="products")
indexerModel2 = indexer2.fit(movies_df)
movies_df = indexerModel2.transform(movies_df)

In [18]:
# Hold out roughly 20% of the indexed reviews for evaluation. randomSplit
# assigns rows at random according to the given weights; the fixed seed pins
# that assignment so the model and its RMSE can be reproduced on a re-run.
train_data, test_data = movies_df.randomSplit([0.8, 0.2], seed=42)

In [19]:
# Alternating Least Squares recommender over the indexed user/item columns,
# trained on the review score as the rating.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="users",
    itemCol="products",
    ratingCol="score",
    # Rows whose user or item was never seen in training get a NaN
    # prediction; "drop" removes them so the evaluation metric is not NaN.
    coldStartStrategy="drop",
    # The factor matrices are randomly initialised; fix the seed so repeated
    # fits on the same training data give the same model.
    seed=42,
)

model = als.fit(train_data)

In [21]:
# Score the held-out split with the fitted ALS model; the result is test_data
# plus a `prediction` column. The estimator was built with
# coldStartStrategy="drop", so rows it cannot score are absent from the result.
predictions = model.transform(test_data) 

In [22]:
# Eyeball predicted vs. actual scores: first 20 rows, long values truncated
# (these are the defaults of DataFrame.show, spelled out).
predictions.show(n=20, truncate=True)

+-----------+----------+--------------------+--------------------+-----+--------------------+----------+--------------+------+--------+-----------+
|helpfulness|product_id|        profile_name|              review|score|             summary|      time|       user_id| users|products| prediction|
+-----------+----------+--------------------+--------------------+-----+--------------------+----------+--------------+------+--------+-----------+
|        0/0|B000UGBOT0|              T-Rexx|If there were thr...|  5.0|How to learn abou...|1090454400|A11BZ39QSLERNI|1959.0|    78.0|  0.1397437|
|        7/7|B0001G6PZC|      David Anderson|"The Last Samarai...|  5.0|Tom Cruise Triump...|1080086400|A1YQ6QB2127AJ4| 471.0|     7.0|  2.4011946|
|        1/1|B002OHDRF2|Peter J. Miller "...|I did not really ...|  2.0|I hope this is no...|1320624000| AU25GMX57XBVO| 243.0|    21.0|  2.0860286|
|        1/4|B002OHDRF2|Romeraux Allen "T...|Im am a huge Term...|  5.0|The Best Terminat...|1259539200|A294UQY2

In [24]:
# Compare the predicted rating with the actual review score on the held-out
# rows and report the root-mean-square error.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="score",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")

Root-mean-square error = 4.449856582309332
