In [1]:
# Locate the local Spark installation and make the `pyspark` package importable.
# This has to run before any `pyspark` import in the cells that follow.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Start the Spark session used by every DataFrame in this notebook, or reuse
# the one that is already running in this process.
spark = (
    SparkSession.builder
    .appName("Python Spark Recommendation Systems")
    .getOrCreate()
)

# Print the session object as a quick check that start-up succeeded.
print(spark)

<pyspark.sql.session.SparkSession object at 0x0000013B3D403470>


In [3]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql import types

## Praproses

In [7]:
# Location of the MovieLens ratings file. Edit this one constant to point the
# notebook at a different copy of the data.
RATINGS_PATH = "D:/Spark/Tugas4/ml-1m/ratings.csv"

# Explicit schema matching the CSV header (userId, movieId, rating, timestamp).
# Declaring it avoids the extra pass over the file that inferSchema=True needs
# and pins the column types: integer ids and a numeric rating, as ALS expects.
RATINGS_SCHEMA = types.StructType([
    types.StructField("userId", types.IntegerType(), True),
    types.StructField("movieId", types.IntegerType(), True),
    types.StructField("rating", types.DoubleType(), True),
    # Epoch seconds; LongType leaves headroom beyond the 32-bit range.
    types.StructField("timestamp", types.LongType(), True),
])

# header=True skips the first line of the file, which holds the column names.
ratings = spark.read.csv(RATINGS_PATH, header=True, schema=RATINGS_SCHEMA)

In [17]:
# Preview the first 20 rows (long cell values are truncated for display).
ratings.show(n=20, truncate=True)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|   1193|   5.0|978300760|
|     1|    661|   3.0|978302109|
|     1|    914|   3.0|978301968|
|     1|   3408|   4.0|978300275|
|     1|   2355|   5.0|978824291|
|     1|   1197|   3.0|978302268|
|     1|   1287|   5.0|978302039|
|     1|   2804|   5.0|978300719|
|     1|    594|   4.0|978302268|
|     1|    919|   4.0|978301368|
|     1|    595|   5.0|978824268|
|     1|    938|   4.0|978301752|
|     1|   2398|   4.0|978302281|
|     1|   2918|   4.0|978302124|
|     1|   1035|   5.0|978301753|
|     1|   2791|   4.0|978302188|
|     1|   2687|   3.0|978824268|
|     1|   2018|   4.0|978301777|
|     1|   3105|   5.0|978301713|
|     1|   2797|   4.0|978302039|
+------+-------+------+---------+
only showing top 20 rows



## Buat Model

In [9]:
# Hold out roughly 20% of the ratings for evaluation.
# A fixed seed keeps the train/test partition, and therefore the reported RMSE,
# stable across re-runs on the same input data.
training, test = ratings.randomSplit([0.8, 0.2], seed=42)

In [11]:
# Build the recommendation model using ALS on the training data.
# coldStartStrategy="drop" discards prediction rows for users or movies that
# were not seen during training, so the evaluation metric does not become NaN.
# seed fixes the random initialisation of the factor matrices, making the
# fitted model reproducible across re-runs.
# NOTE(review): the recommendation scores printed further down exceed the
# 5-star rating scale, which suggests regParam=0.01 may be too weak -- consider
# tuning it (and maxIter) with cross-validation.
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop", seed=42)
model = als.fit(training)

In [12]:
# Score the held-out ratings with the fitted model, then compare the true
# "rating" column against the model's "prediction" column using RMSE.
predictions = model.transform(test)
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.8985696539020004


In [13]:
# For every user: the 10 movies with the highest predicted rating.
userRecs = model.recommendForAllUsers(numItems=10)
# For every movie: the 10 users with the highest predicted rating for it.
movieRecs = model.recommendForAllItems(numUsers=10)

In [14]:
# Preview the per-user recommendations: first 20 users, long cells truncated.
userRecs.show(n=20, truncate=True)

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|  1580|[[1310, 10.385049...|
|  4900|[[1725, 11.129892...|
|  5300|[[2964, 6.7413874...|
|   471|[[2192, 7.088834]...|
|  1591|[[2931, 6.7828236...|
|  4101|[[2847, 7.731414]...|
|  1342|[[1349, 6.6957526...|
|  2122|[[2545, 6.4478855...|
|  2142|[[939, 8.143927],...|
|   463|[[751, 6.943112],...|
|   833|[[2964, 14.549758...|
|  5803|[[718, 10.501891]...|
|  3794|[[769, 7.9112906]...|
|  1645|[[1312, 8.693449]...|
|  3175|[[2192, 8.199085]...|
|  4935|[[2493, 10.201505...|
|   496|[[864, 7.842164],...|
|  2366|[[1930, 6.430979]...|
|  2866|[[2545, 9.700056]...|
|  5156|[[2964, 7.2584934...|
+------+--------------------+
only showing top 20 rows



In [15]:
# Preview the per-movie recommendations: first 20 movies, long cells truncated.
movieRecs.show(n=20, truncate=True)

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|   1580|[[283, 5.306515],...|
|    471|[[4419, 6.8512573...|
|   1591|[[5863, 5.7814198...|
|   1342|[[3915, 6.3527994...|
|   2122|[[1070, 6.9072304...|
|   2142|[[2364, 5.4755573...|
|    463|[[1596, 8.50619],...|
|    833|[[5274, 6.641812]...|
|   3794|[[4383, 10.221345...|
|   1645|[[5863, 5.820272]...|
|   3175|[[1213, 6.221288]...|
|    496|[[1083, 12.426367...|
|   2366|[[1412, 5.9890537...|
|   2866|[[1989, 5.7216725...|
|    148|[[1989, 11.069554...|
|   1088|[[2138, 6.371765]...|
|   1238|[[5810, 6.2747455...|
|   3918|[[4996, 6.8078327...|
|   1829|[[491, 10.91073],...|
|   1959|[[440, 7.313812],...|
+-------+--------------------+
only showing top 20 rows

