## Spark Initialization 

In [1]:
# Use findspark to locate the local Spark installation; per the original note
# it reads SPARK_HOME and HADOOP_HOME. Intended to run before the `pyspark`
# imports in the cells below.
import findspark
findspark.init()

In [2]:
# SparkSession is the entry point to the DataFrame API used in this notebook
from pyspark.sql import SparkSession

# Reuse an existing session if one is already running; otherwise start a new
# one under the application name "Recommendation System".
spark = (
    SparkSession.builder
    .appName("Recommendation System")
    .getOrCreate()
)

In [3]:
# Sanity check that the session exists: printing it shows the SparkSession
# object's default representation (class name and memory address).
print(spark)

<pyspark.sql.session.SparkSession object at 0x000001E3D78EA7B8>


In [4]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql import types

## Preprocessing

In [8]:
import os

# The ratings file location was hard-coded to one machine's F: drive. It can
# now be overridden through the RATINGS_CSV environment variable; when the
# variable is unset the original path is used, so existing runs are unchanged.
ratings_path = os.environ.get(
    "RATINGS_CSV",
    "F:/Semester 6/BigData/Tugas Recommendation System/ml-10M100K/ratings.csv",
)

# header=True takes the column names from the first row of the file;
# inferSchema=True lets Spark derive the column types from the data.
lines = spark.read.csv(ratings_path, header=True, inferSchema=True)

In [9]:
# Target Spark type for each column the model consumes. The casts are applied
# one column at a time, in this order, replacing the column of the same name.
column_types = (
    ("userId", "int"),
    ("movieId", "int"),
    ("rating", "float"),
)
for column_name, spark_type in column_types:
    lines = lines.withColumn(column_name, lines[column_name].cast(spark_type))

# Preview the first rows of the typed DataFrame
lines.show()

+------+-------+------+----------+
|userId|movieId|rating|timestamps|
+------+-------+------+----------+
|     1|    122|   5.0| 838985046|
|     1|    185|   5.0| 838983525|
|     1|    231|   5.0| 838983392|
|     1|    292|   5.0| 838983421|
|     1|    316|   5.0| 838983392|
|     1|    329|   5.0| 838983392|
|     1|    355|   5.0| 838984474|
|     1|    356|   5.0| 838983653|
|     1|    362|   5.0| 838984885|
|     1|    364|   5.0| 838983707|
|     1|    370|   5.0| 838984596|
|     1|    377|   5.0| 838983834|
|     1|    420|   5.0| 838983834|
|     1|    466|   5.0| 838984679|
|     1|    480|   5.0| 838983653|
|     1|    520|   5.0| 838984679|
|     1|    539|   5.0| 838984068|
|     1|    586|   5.0| 838984068|
|     1|    588|   5.0| 838983339|
|     1|    589|   5.0| 838983778|
+------+-------+------+----------+
only showing top 20 rows



## Create Model

In [11]:
# Randomly split the ratings with weights 0.8 (training) and 0.2 (test).
# A fixed seed makes the split repeatable, so the RMSE reported further down
# can be reproduced when the notebook is re-run on the same data.
(training, test) = lines.randomSplit([0.8, 0.2], seed=42)

In [12]:
# Build the recommendation model using ALS on the training data.
# Note we set cold start strategy to 'drop' to ensure we don't get NaN
# evaluation metrics.
# seed pins ALS's random initialisation so repeated fits are comparable.
# NOTE(review): the stored recommendation outputs show predicted scores far
# above the 5.0 ratings seen in the data preview (e.g. 15.4 and 18.8), which
# may mean regParam=0.01 is too weak -- consider tuning it; left unchanged here.
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop", seed=42)
model = als.fit(training)

In [13]:
# RMSE evaluator comparing the true "rating" column with the model's
# "prediction" column
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)

# Score the held-out test ratings with the fitted model, then measure the error
predictions = model.transform(test)
rmse = evaluator.evaluate(predictions)

print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.8217652411054485


In [14]:
# For every user: the 10 highest-scoring movies according to the model
userRecs = model.recommendForAllUsers(numItems=10)

# For every movie: the 10 users with the highest predicted score for it
movieRecs = model.recommendForAllItems(numUsers=10)

In [15]:
# Preview the per-user recommendations (first 20 rows, long cells truncated)
userRecs.show(n=20, truncate=True)

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|   148|[[8660, 7.8393764...|
|   463|[[8682, 7.3974705...|
|   471|[[32444, 15.41407...|
|   496|[[32444, 11.85937...|
|   833|[[26326, 7.766531...|
|  1088|[[26326, 7.767759...|
|  1238|[[33264, 6.582766...|
|  1342|[[26326, 9.452552...|
|  1580|[[26326, 10.80178...|
|  1591|[[36545, 18.76777...|
|  1645|[[60654, 7.470299...|
|  1829|[[7411, 12.046809...|
|  1959|[[2343, 10.261526...|
|  2122|[[26326, 6.42405]...|
|  2142|[[52413, 7.428848...|
|  2366|[[26241, 10.38987...|
|  2659|[[52413, 9.54597]...|
|  2866|[[5778, 12.574363...|
|  3175|[[4379, 10.918962...|
|  3749|[[26326, 7.555282...|
+------+--------------------+
only showing top 20 rows



In [16]:
# Preview the per-movie recommendations (first 20 rows, long cells truncated)
movieRecs.show(n=20, truncate=True)

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|   1580|[[30451, 5.769893...|
|   4900|[[13639, 7.525706...|
|   5300|[[56506, 8.086524...|
|   6620|[[58487, 5.906659...|
|   7240|[[10459, 7.862756...|
|   7340|[[21652, 6.777546...|
|   7880|[[8552, 10.787275...|
|  30970|[[48987, 15.29576...|
|  32460|[[64282, 11.08503...|
|  54190|[[13639, 9.541878...|
|  57370|[[22236, 15.02211...|
|    471|[[6454, 6.4372754...|
|   1591|[[62350, 6.254569...|
|   4101|[[22236, 15.56514...|
|  63271|[[35184, 3.099855...|
|   1342|[[29910, 7.364708...|
|   2122|[[36629, 8.484212...|
|   2142|[[2671, 5.953084]...|
|   7982|[[9506, 8.291946]...|
|   8592|[[57354, 6.939225...|
+-------+--------------------+
only showing top 20 rows

