In [1]:
from pyspark.sql import SparkSession

In [2]:
spark=SparkSession.builder.appName('recommender').getOrCreate()

In [3]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [4]:
data=spark.read.csv('ratings.csv', header=True, inferSchema=True)

In [5]:
data.head()

Row(user id=196, movie id=242, rating=3, timestamp=881250949)

In [6]:
data.describe().show()

+-------+------------------+------------------+------------------+-----------------+
|summary|           user id|          movie id|            rating|        timestamp|
+-------+------------------+------------------+------------------+-----------------+
|  count|            100000|            100000|            100000|           100000|
|   mean|         462.48475|         425.53013|           3.52986|8.8352885148862E8|
| stddev|266.61442012750905|330.79835632558473|1.1256735991443214|5343856.189502848|
|    min|                 1|                 1|                 1|        874724710|
|    max|               943|              1682|                 5|        893286638|
+-------+------------------+------------------+------------------+-----------------+



In [20]:
(training,test)=data.randomSplit([0.8,0.2])

In [21]:
als=ALS(maxIter=5, regParam=1, userCol="user id", itemCol="movie id", ratingCol="rating")

In [22]:
model=als.fit(training)

In [23]:
predictions=model.transform(test)

In [24]:
predictions.show()

+-------+--------+------+---------+----------+
|user id|movie id|rating|timestamp|prediction|
+-------+--------+------+---------+----------+
|    580|     148|     4|884125773| 2.3669012|
|     27|     148|     3|891543129|  2.168142|
|    297|     148|     3|875239619| 2.2305255|
|    430|     148|     2|877226047| 2.2234983|
|    347|     148|     3|881652888| 2.3934572|
|    447|     148|     4|878854729| 2.3412983|
|    486|     148|     2|879874903| 2.1439075|
|    586|     148|     3|884065745| 2.1741352|
|    423|     148|     3|891395417|  2.309734|
|    761|     148|     5|876189829| 1.8038291|
|    880|     148|     2|880167030| 2.3259535|
|     59|     148|     3|888203175| 2.5290585|
|    717|     148|     3|884642958| 2.4688659|
|    130|     148|     4|876251127| 2.6808667|
|    438|     148|     5|879868443| 2.6435263|
|    532|     148|     5|888817717| 2.7315097|
|    821|     148|     3|874792650| 2.7685602|
|    533|     148|     3|882902641|   2.20773|
|    459|    

In [25]:
evaluator=RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse=evaluator.evaluate(predictions)


In [26]:
print ("Root-Mean-Square-error", str(rmse))

Root-Mean-Square-error nan


In [27]:
test.head()

Row(user id=1, movie id=2, rating=3, timestamp=876893171)

In [30]:
single_user=test.filter(test["user id"]==1).select(["movie id", 'user id','rating'])

In [31]:
single_user.show()

+--------+-------+------+
|movie id|user id|rating|
+--------+-------+------+
|       2|      1|     3|
|       9|      1|     5|
|      10|      1|     3|
|      23|      1|     4|
|      30|      1|     3|
|      33|      1|     4|
|      34|      1|     2|
|      37|      1|     2|
|      43|      1|     4|
|      46|      1|     4|
|      48|      1|     5|
|      53|      1|     3|
|      58|      1|     4|
|      62|      1|     3|
|      78|      1|     1|
|      82|      1|     5|
|      85|      1|     3|
|      88|      1|     4|
|      95|      1|     4|
|      96|      1|     5|
+--------+-------+------+
only showing top 20 rows



In [32]:
recommendations=model.transform(single_user)

In [33]:
recommendations.orderBy('prediction', ascending=False).show()

+--------+-------+------+----------+
|movie id|user id|rating|prediction|
+--------+-------+------+----------+
|     251|      1|     4| 3.2917657|
|     114|      1|     5| 3.2439075|
|     127|      1|     5| 3.1910944|
|      48|      1|     5|  3.108524|
|     100|      1|     5| 3.0855145|
|     191|      1|     5| 3.0717812|
|      23|      1|     4| 3.0496716|
|      96|      1|     5| 2.9978228|
|     242|      1|     5| 2.9899044|
|      30|      1|     3| 2.9528842|
|     135|      1|     4|  2.952226|
|     171|      1|     5| 2.9359472|
|     216|      1|     5| 2.9275725|
|     195|      1|     5| 2.9255419|
|     137|      1|     5| 2.8924944|
|     238|      1|     4| 2.8901763|
|       9|      1|     5| 2.8705568|
|     265|      1|     4|  2.861402|
|     200|      1|     3| 2.8536363|
|      95|      1|     4|  2.850968|
+--------+-------+------+----------+
only showing top 20 rows



In [42]:
als=ALS(maxIter=2, regParam=0.01, rank=20,
        userCol="user id", itemCol="movie id", ratingCol="rating",
       coldStartStrategy='drop', implicitPrefs=False)

In [43]:
model=als.fit(training)

In [44]:
predictions=model.transform(test)

In [45]:
predictions.show()

+-------+--------+------+---------+----------+
|user id|movie id|rating|timestamp|prediction|
+-------+--------+------+---------+----------+
|    580|     148|     4|884125773|  2.773016|
|     27|     148|     3|891543129| 4.6647367|
|    297|     148|     3|875239619| 3.1634011|
|    430|     148|     2|877226047| 3.7757752|
|    347|     148|     3|881652888| 4.3568373|
|    447|     148|     4|878854729| 3.7360222|
|    486|     148|     2|879874903| 2.6352508|
|    586|     148|     3|884065745| 3.6740649|
|    423|     148|     3|891395417| 3.1206653|
|    761|     148|     5|876189829| 5.9695206|
|    880|     148|     2|880167030|  2.753314|
|     59|     148|     3|888203175| 2.8383899|
|    717|     148|     3|884642958| 3.1923501|
|    130|     148|     4|876251127| 3.9324439|
|    438|     148|     5|879868443| 5.0349298|
|    532|     148|     5|888817717| 4.3282824|
|    821|     148|     3|874792650| 3.3169317|
|    533|     148|     3|882902641| 2.7805774|
|    459|    

In [46]:
evaluator=RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse=evaluator.evaluate(predictions)


In [47]:
print ("Root-Mean-Square-error", str(rmse))

Root-Mean-Square-error 1.1544256269205355


In [48]:
userRecs=model.recommendForAllUsers(10)

In [49]:
userRecs.count()

943

In [50]:
movieRecs=model.recommendForAllItems(10)

In [51]:
movieRecs.count()

1653

In [53]:
userRecs_df=userRecs.toPandas()
print (userRecs_df.shape)

movieRecs_df=movieRecs.toPandas()
print (movieRecs_df.shape)

(943, 2)
(1653, 2)


In [54]:
userRecs_df.head()

Unnamed: 0,user id,recommendations
0,471,"[(904, 17.67246437072754), (1446, 16.667100906..."
1,463,"[(1262, 6.873754501342773), (865, 6.3376326560..."
2,833,"[(1512, 5.641695022583008), (320, 5.2123570442..."
3,496,"[(1166, 6.600992679595947), (348, 5.9584903717..."
4,148,"[(1166, 9.941758155822754), (1084, 9.068203926..."


In [55]:
movieRecs_df.head()

Unnamed: 0,movie id,recommendations
0,1580,"[(726, 2.368070602416992), (310, 2.25766754150..."
1,471,"[(688, 10.665386199951172), (351, 6.8670892715..."
2,1591,"[(68, 9.890932083129883), (765, 9.773470878601..."
3,1342,"[(471, 5.385213851928711), (845, 4.82445144653..."
4,463,"[(153, 8.564023971557617), (635, 8.50160408020..."


In [69]:
als=ALS(userCol="user id", itemCol="movie id", ratingCol="rating",
       coldStartStrategy='drop', nonnegative=True)
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
param_grid=ParamGridBuilder().addGrid(als.rank,[8,10,14]).addGrid(als.maxIter, [18,19,20]).addGrid(als.regParam, [.10,.15,.20]).build()

In [74]:
evaluator=RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")



In [78]:
tvs=TrainValidationSplit(estimator=als, estimatorParamMaps=param_grid,
                    evaluator=evaluator)

In [79]:
model=tvs.fit(training)

In [81]:
best_model=model.bestModel

In [82]:
predictions=best_model.transform(test)
rmse=evaluator.evaluate(predictions)
print ("Root-Mean-Square-error", str(rmse))

Root-Mean-Square-error 0.9193004092797963
