### Collaborative Filtering (ALS) 

- numBlocks (-1 imply auto-config)
- rank
- iterations
- lambda: regularization
- implicitPref
- alpha 

In [46]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [47]:
spark = SparkSession.builder \
    .appName('movielens').getOrCreate()

In [48]:
base_path = '/Users/hyunseokjung/data/ml-latest-small/'

df = spark.read.csv(base_path + 'ratings.csv', inferSchema=True, header=True).repartition(10)

In [49]:
df.describe().show()

+-------+------------------+----------------+------------------+--------------------+
|summary|            userId|         movieId|            rating|           timestamp|
+-------+------------------+----------------+------------------+--------------------+
|  count|            100836|          100836|            100836|              100836|
|   mean|326.12756356856676|19435.2957177992| 3.501556983616962|1.2059460873684695E9|
| stddev| 182.6184914634999|35530.9871987002|1.0425292390606344| 2.162610359951315E8|
|    min|                 1|               1|               0.5|           828124615|
|    max|               610|          193609|               5.0|          1537799250|
+-------+------------------+----------------+------------------+--------------------+



In [50]:
df = df.select('movieId', 'userId', 'rating')

In [51]:
(train, test) = df.randomSplit([.7, .3], seed=42)

In [52]:
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating'
)

In [53]:
model = als.fit(train)

In [54]:
pred = model.transform(test)

In [55]:
pred.show()

+-------+------+------+----------+
|movieId|userId|rating|prediction|
+-------+------+------+----------+
|      1|    46|   5.0| 4.9376774|
|      1|   103|   4.0| 3.6570575|
|      1|   160|   4.0|  2.740988|
|      1|   166|   5.0| 4.3405704|
|      1|   385|   4.0| 3.7219534|
|      1|   470|   4.0|  4.288341|
|      1|   471|   5.0| 3.4242098|
|      1|   579|   4.0|  4.706253|
|      1|   597|   4.0| 3.8673086|
|      2|    20|   3.0| 3.6279657|
|      2|    27|   4.0|  4.062544|
|      2|   144|   3.0| 3.9259186|
|      2|   249|   4.0|  3.887812|
|      2|   276|   4.0| 4.4928684|
|      2|   330|   1.5| 3.2290287|
|      2|   446|   3.0|  3.523339|
|      2|   534|   4.5|  3.241715|
|      3|    44|   3.0| 2.6066937|
|      3|   217|   1.0| 2.4475992|
|      3|   477|   3.0| 3.0328598|
+-------+------+------+----------+
only showing top 20 rows



In [56]:
eval = RegressionEvaluator(
    metricName='rmse', 
    labelCol='rating',
    predictionCol='prediction')

In [57]:
rmse = eval.evaluate(pred)
print(f'RMSE: {rmse}')

RMSE: nan


In [58]:
user_1 = test.filter(test['userId'] == 1).select(['movieId', 'userId'])

In [59]:
user_1.show()

+-------+------+
|movieId|userId|
+-------+------+
|    673|     1|
|   1042|     1|
|   1219|     1|
|   1282|     1|
|   2028|     1|
|   2137|     1|
|   2648|     1|
|   2654|     1|
|   3527|     1|
|    457|     1|
|    500|     1|
|   1073|     1|
|   1092|     1|
|   1127|     1|
|   1240|     1|
|   1258|     1|
|   1298|     1|
|   1587|     1|
|   2406|     1|
|   2470|     1|
+-------+------+
only showing top 20 rows



In [60]:
rec = model.transform(user_1)

In [61]:
rec.orderBy('prediction', ascending=False).show()

+-------+------+----------+
|movieId|userId|prediction|
+-------+------+----------+
|   2427|     1| 5.8329196|
|   1073|     1| 5.6475887|
|   1732|     1|  5.564292|
|   1196|     1|  5.473176|
|   1136|     1|  5.467792|
|   1298|     1| 5.4560285|
|    954|     1| 5.4171743|
|    260|     1|  5.373238|
|   2858|     1| 5.3519425|
|   2648|     1| 5.1542683|
|   2985|     1| 5.0574665|
|   2470|     1|   5.02183|
|   1258|     1| 4.9590063|
|   2797|     1|  4.951475|
|   1208|     1|  4.929792|
|   1240|     1| 4.8720603|
|   2137|     1| 4.8472567|
|   1219|     1|  4.842569|
|    151|     1|  4.660032|
|    457|     1| 4.6316085|
+-------+------+----------+
only showing top 20 rows



In [62]:
from pyspark.sql.types import StructType, IntegerType, StringType

movies = spark.read.csv(base_path + 'movies.csv', header=True).repartition(5)
movies.show(5)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|  52462|Aqua Teen Hunger ...|Action|Adventure|...|
| 180031|The Shape of Wate...|Adventure|Drama|F...|
|    458|Geronimo: An Amer...|       Drama|Western|
|   2900|Monkey Shines (1988)|       Horror|Sci-Fi|
|   7481|   Enemy Mine (1985)|Adventure|Drama|S...|
+-------+--------------------+--------------------+
only showing top 5 rows



In [63]:
movies_ratings = movies.join(df, movies.movieId==df.movieId).drop(df.movieId)
movies_ratings.show()

+-------+--------------------+--------------------+------+------+
|movieId|               title|              genres|userId|rating|
+-------+--------------------+--------------------+------+------+
|    356| Forrest Gump (1994)|Comedy|Drama|Roma...|    84|   5.0|
|    502|Next Karate Kid, ...|Action|Children|R...|    43|   5.0|
|   1500|Grosse Pointe Bla...|Comedy|Crime|Romance|   522|   4.5|
|    589|Terminator 2: Jud...|       Action|Sci-Fi|   489|   3.5|
|  52319|Inglorious Bastar...|Action|Adventure|...|   298|   3.0|
|  37380|         Doom (2005)|Action|Horror|Sci-Fi|   305|   4.0|
| 134130|  The Martian (2015)|Adventure|Drama|S...|   586|   2.5|
|    327|    Tank Girl (1995)|Action|Comedy|Sci-Fi|   505|   2.5|
|  49649|       Eragon (2006)|Action|Adventure|...|   249|   2.5|
|  88356|  Smurfs, The (2011)|Animation|Childre...|   382|   2.0|
|   1921|           Pi (1998)|Drama|Sci-Fi|Thri...|    50|   3.0|
|   4995|Beautiful Mind, A...|       Drama|Romance|    20|   5.0|
|  36401|B

In [64]:
model.recommendForAllUsers(3) \
    .selectExpr("userId", "explode(recommendations)") \
    .show(6)
model.recommendForAllItems(3) \
    .selectExpr("movieId", "explode(recommendations)") \
    .show(6)

                                                                                

+------+------------------+
|userId|               col|
+------+------------------+
|     1|  {3468, 6.435892}|
|     1| {3347, 6.3699517}|
|     1|  {951, 6.2661266}|
|     2|{102123, 7.113074}|
|     2| {112623, 6.70667}|
|     2|{167746, 6.634947}|
+------+------------------+
only showing top 6 rows





+-------+----------------+
|movieId|             col|
+-------+----------------+
|     26|{329, 7.8223634}|
|     26|{224, 6.7963676}|
|     26| {99, 6.6198044}|
|     27| {147, 7.941797}|
|     27|{393, 7.8409085}|
|     27| {529, 7.449197}|
+-------+----------------+
only showing top 6 rows



                                                                                