In [0]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [0]:
# Create (or reuse) the Spark session used by the rest of the notebook
sparkSession = (
    SparkSession.builder
    .appName("Veri Maratonu ALS Tavsiye Sistemi")
    .getOrCreate()
)

In [0]:
# Show the session object as this cell's output
sparkSession

In [0]:
# Load the ratings CSV: the first line is the header, column types are inferred
rawDF = sparkSession.read.csv(
    '/FileStore/tables/ratings.csv',
    header=True,
    inferSchema=True,
)

In [0]:
# Print the column names and inferred types of the loaded data
rawDF.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [0]:
# Drop the 'timestamp' column because it is not used by the model
mlDF = rawDF.drop('timestamp')

In [0]:
# Preview the first rows of the modelling data
mlDF.show()

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|     31|   2.5|
|     1|   1029|   3.0|
|     1|   1061|   3.0|
|     1|   1129|   2.0|
|     1|   1172|   4.0|
|     1|   1263|   2.0|
|     1|   1287|   2.0|
|     1|   1293|   2.0|
|     1|   1339|   3.5|
|     1|   1343|   2.0|
|     1|   1371|   2.5|
|     1|   1405|   1.0|
|     1|   1953|   4.0|
|     1|   2105|   4.0|
|     1|   2150|   3.0|
|     1|   2193|   2.0|
|     1|   2294|   2.0|
|     1|   2455|   2.5|
|     1|   2968|   1.0|
|     1|   3671|   3.0|
+------+-------+------+
only showing top 20 rows



In [0]:
# Check for nulls in the data: list every row where any of the three
# columns used by ALS (userId, movieId, rating) is null, not only userId.
mlDF.filter("userId is null or movieId is null or rating is null").show()

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
+------+-------+------+



In [0]:
# Check whether any row has a rating below 0
mlDF.where("rating < 0").show()

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
+------+-------+------+



In [0]:
# Configure the ALS estimator.
# - coldStartStrategy="drop": rows whose prediction would be NaN are dropped
#   from the transform output, so the RMSE computed later stays defined.
# - nonnegative=True: constrain the factor matrices to non-negative values.
# - seed: fix the random initialisation so re-running the notebook trains
#   the same model instead of a different one on every run.
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    nonnegative=True,
    seed=42,
)

In [0]:
# Train/test split with weights 0.8 / 0.2.
# A fixed seed makes the split reproducible; without it each run samples
# different rows and reports a different RMSE.
(trainDF, testDF) = mlDF.randomSplit([0.8, 0.2], seed=42)

In [0]:
# Train the model: fit the ALS estimator on the training split
model = als.fit(trainDF)

In [0]:
# Run predictions on the test split
predictDF = model.transform(testDF)

In [0]:
# Display the prediction results
predictDF.show()

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|   148|    185|   3.0| 3.1870916|
|   148|    480|   4.5| 3.8627403|
|   148|    585|   2.5|  2.776199|
|   148|    589|   3.5|   4.16424|
|   148|    596|   4.5| 4.2447176|
|   148|    648|   4.0|  3.562644|
|   148|    780|   3.5|  3.659402|
|   148|    924|   5.0| 3.6088214|
|   148|   1249|   3.5| 4.1993036|
|   148|   1261|   3.0| 4.0345793|
|   148|   1617|   4.5|  4.190187|
|   148|   1625|   4.0|  3.746406|
|   148|   1690|   3.0| 3.3343732|
|   148|   1923|   4.0| 3.5083473|
|   148|   2628|   2.5| 3.3511045|
|   148|   3175|   4.0| 3.6548128|
|   148|   3176|   3.5| 3.6532955|
|   148|   3362|   4.0|  4.346328|
|   148|   3476|   4.5| 3.7819812|
|   148|   3671|   4.0| 4.1165185|
+------+-------+------+----------+
only showing top 20 rows



In [0]:
# Model quality is measured with RMSE between the 'rating' label and the
# 'prediction' column
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)

In [0]:
# Compute the RMSE over the test-split predictions
rmse = evaluator.evaluate(predictDF)

In [0]:
# Print the RMSE value
print(rmse)

0.9119571130078931


In [0]:
# Ask the model for 3 movie recommendations for each user
# (the result is only assigned here; this cell does not display it)
userRecommendDF = model.recommendForAllUsers(numItems=3)

In [0]:
# Recommend 3 users for each movie
# (the result is only assigned here; this cell does not display it)
itemRecommendDF = model.recommendForAllItems(numUsers=3)