In [2]:
from surprise.prediction_algorithms import knns
from surprise.similarities import cosine, msd, pearson
from surprise import accuracy
from surprise.model_selection import cross_validate, GridSearchCV
from surprise.prediction_algorithms import SVD, KNNWithMeans, KNNBasic, KNNBaseline

from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS, ALSModel

In [3]:
spark = SparkSession.builder.master('local').getOrCreate() 

In [15]:
ratings = spark.read.json('data/ratings.json')#, schema=schema)

In [16]:
ratings.persist()
ratings.dtypes

[('movie_id', 'bigint'),
 ('rating', 'bigint'),
 ('timestamp', 'double'),
 ('user_id', 'bigint')]

In [17]:
als = ALS(maxIter=10,
          rank=10,
          userCol="user_id", 
          itemCol="movie_id", 
          ratingCol="rating")

model = als.fit(ratings)

In [25]:
predictions = model.transform(ratings)
predictions.persist()
evaluator = RegressionEvaluator(metricName='rmse',
                               labelCol='rating',
                               predictionCol='prediction')
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.81684808831793


In [40]:
predictions.persist()
predictions.show(5)

+--------+------+------------+-------+----------+
|movie_id|rating|   timestamp|user_id|prediction|
+--------+------+------------+-------+----------+
|     148|     5|9.75592024E8|    673| 4.0313516|
|     148|     2|9.65634524E8|   4227| 2.0603485|
|     148|     4|9.68683753E8|   3184| 3.4506245|
|     148|     3| 9.6997537E8|   4784| 2.8815558|
|     148|     2|9.74388854E8|   2383|  2.319465|
+--------+------+------------+-------+----------+
only showing top 5 rows



[Row(_c0='1', _c1=None, _c2='Toy Story (1995)', _c3=None, _c4="Animation|Children's|Comedy"),
 Row(_c0='2', _c1=None, _c2='Jumanji (1995)', _c3=None, _c4="Adventure|Children's|Fantasy"),
 Row(_c0='3', _c1=None, _c2='Grumpier Old Men (1995)', _c3=None, _c4='Comedy|Romance'),
 Row(_c0='4', _c1=None, _c2='Waiting to Exhale (1995)', _c3=None, _c4='Comedy|Drama'),
 Row(_c0='5', _c1=None, _c2='Father of the Bride Part II (1995)', _c3=None, _c4='Comedy')]

In [27]:
requests = spark.read.json('data/requests.json')
requests.dtypes

[('movie_id', 'bigint'),
 ('rating', 'double'),
 ('timestamp', 'double'),
 ('user_id', 'bigint')]

In [28]:
requests.show(5)

+--------+------+------------+-------+
|movie_id|rating|   timestamp|user_id|
+--------+------+------------+-------+
|    2019|   NaN|9.56678777E8|   6040|
|     759|   NaN|9.56679248E8|   6040|
|    2858|   NaN|9.56679275E8|   6040|
|     246|   NaN|9.56679413E8|   6040|
|    1617|   NaN|9.56679473E8|   6040|
+--------+------+------------+-------+
only showing top 5 rows



In [46]:
req_predict = model.transform(requests)

In [47]:
req_predict.show(5)

+--------+------+------------+-------+----------+
|movie_id|rating|   timestamp|user_id|prediction|
+--------+------+------------+-------+----------+
|     148|   NaN|9.77959026E8|     53|       NaN|
|     148|   NaN|9.76559602E8|   4169| 3.1198802|
|     148|   NaN|9.89024856E8|   5333| 2.3704112|
|     148|   NaN|9.77005381E8|   4387| 1.8007883|
|     148|   NaN|9.66907208E8|   3539| 2.6021094|
+--------+------+------------+-------+----------+
only showing top 5 rows



In [54]:
final_requests = req_predict.drop('rating')
final_requests = final_requests.withColumnRenamed('prediction','rating')
final_requests.show(5)

+--------+------------+-------+---------+
|movie_id|   timestamp|user_id|   rating|
+--------+------------+-------+---------+
|     148|9.77959026E8|     53|      NaN|
|     148|9.76559602E8|   4169|3.1198802|
|     148|9.89024856E8|   5333|2.3704112|
|     148|9.77005381E8|   4387|1.8007883|
|     148|9.66907208E8|   3539|2.6021094|
+--------+------------+-------+---------+
only showing top 5 rows



In [67]:
movie_titles = spark.read.csv('data/movies.dat', sep=":")
movie_titles = movie_titles.drop('_c1','_c3')
movie_titles.persist()
movie_titles.head(5)

[Row(_c0='1', _c2='Toy Story (1995)', _c4="Animation|Children's|Comedy"),
 Row(_c0='2', _c2='Jumanji (1995)', _c4="Adventure|Children's|Fantasy"),
 Row(_c0='3', _c2='Grumpier Old Men (1995)', _c4='Comedy|Romance'),
 Row(_c0='4', _c2='Waiting to Exhale (1995)', _c4='Comedy|Drama'),
 Row(_c0='5', _c2='Father of the Bride Part II (1995)', _c4='Comedy')]

In [73]:
users = spark.read.csv('data/users.dat', sep=':')
users = users.drop('_c1', '_c3', '_c5', '_c7')
users.persist()
users.show(5)

+---+---+---+---+-----+
|_c0|_c2|_c4|_c6|  _c8|
+---+---+---+---+-----+
|  1|  F|  1| 10|48067|
|  2|  M| 56| 16|70072|
|  3|  M| 25| 15|55117|
|  4|  M| 45|  7|02460|
|  5|  M| 25| 20|55455|
+---+---+---+---+-----+
only showing top 5 rows



In [76]:
old_cols = ['_c0','_c2','_c4', '_c6', '_c8']
new_cols = ['id','gender','age_group','occupation','zipcode']

def rename_cols(new_cols,old_cols,data):
    for i in range(len(old_cols)):
        data = data.withColumnRenamed(old_cols[i],new_cols[i])
    return data

users = rename_cols(new_cols,old_cols,users)
users.show(5)

+---+------+---------+----------+-------+
| id|gender|age_group|occupation|zipcode|
+---+------+---------+----------+-------+
|  1|     F|        1|        10|  48067|
|  2|     M|       56|        16|  70072|
|  3|     M|       25|        15|  55117|
|  4|     M|       45|         7|  02460|
|  5|     M|       25|        20|  55455|
+---+------+---------+----------+-------+
only showing top 5 rows



In [78]:
new = ['id','title','genre']
old = ['_c0', '_c2','_c4']

movie_titles = rename_cols(new,old,movie_titles)
movie_titles.show(5)

+---+--------------------+--------------------+
| id|               title|               genre|
+---+--------------------+--------------------+
|  1|    Toy Story (1995)|Animation|Childre...|
|  2|      Jumanji (1995)|Adventure|Childre...|
|  3|Grumpier Old Men ...|      Comedy|Romance|
|  4|Waiting to Exhale...|        Comedy|Drama|
|  5|Father of the Bri...|              Comedy|
+---+--------------------+--------------------+
only showing top 5 rows



In [80]:
fulldata = ratings.join(movie_titles, ratings.movie_id == movie_titles.id)
fulldata = fulldata.join(users, fulldata.user_id == users.id)
fulldata.show(5)

+--------+------+------------+-------+----+--------------------+------------------+----+------+---------+----------+-------+
|movie_id|rating|   timestamp|user_id|  id|               title|             genre|  id|gender|age_group|occupation|zipcode|
+--------+------+------------+-------+----+--------------------+------------------+----+------+---------+----------+-------+
|     858|     4|9.56678732E8|   6040| 858|Godfather, The (1...|Action|Crime|Drama|6040|     M|       25|         6|  11106|
|    2384|     4|9.56678754E8|   6040|2384|                Babe|              null|6040|     M|       25|         6|  11106|
|     593|     5|9.56678754E8|   6040| 593|Silence of the La...|    Drama|Thriller|6040|     M|       25|         6|  11106|
|    1961|     4|9.56678777E8|   6040|1961|     Rain Man (1988)|             Drama|6040|     M|       25|         6|  11106|
|    1419|     3|9.56678856E8|   6040|1419|    Walkabout (1971)|             Drama|6040|     M|       25|         6|  11106|


In [82]:
fulldata = fulldata.drop('id')
fulldata.show(5)

+--------+------+------------+-------+--------------------+------------------+------+---------+----------+-------+
|movie_id|rating|   timestamp|user_id|               title|             genre|gender|age_group|occupation|zipcode|
+--------+------+------------+-------+--------------------+------------------+------+---------+----------+-------+
|     858|     4|9.56678732E8|   6040|Godfather, The (1...|Action|Crime|Drama|     M|       25|         6|  11106|
|    2384|     4|9.56678754E8|   6040|                Babe|              null|     M|       25|         6|  11106|
|     593|     5|9.56678754E8|   6040|Silence of the La...|    Drama|Thriller|     M|       25|         6|  11106|
|    1961|     4|9.56678777E8|   6040|     Rain Man (1988)|             Drama|     M|       25|         6|  11106|
|    1419|     3|9.56678856E8|   6040|    Walkabout (1971)|             Drama|     M|       25|         6|  11106|
+--------+------+------------+-------+--------------------+------------------+--

In [84]:
fulldata.dtypes

[('movie_id', 'bigint'),
 ('rating', 'bigint'),
 ('timestamp', 'double'),
 ('user_id', 'bigint'),
 ('title', 'string'),
 ('genre', 'string'),
 ('gender', 'string'),
 ('age_group', 'string'),
 ('occupation', 'string'),
 ('zipcode', 'string')]

In [87]:
fulldata.toPandas().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 719949 entries, 0 to 719948
Data columns (total 10 columns):
movie_id      719949 non-null int64
rating        719949 non-null int64
timestamp     719949 non-null float64
user_id       719949 non-null int64
title         719949 non-null object
genre         673236 non-null object
gender        719949 non-null object
age_group     719949 non-null object
occupation    719949 non-null object
zipcode       719949 non-null object
dtypes: float64(1), int64(3), object(6)
memory usage: 54.9+ MB


In [88]:
fulldata = fulldata.dropna()
fulldata.show(5)

+--------+------+------------+-------+--------------------+------------------+------+---------+----------+-------+
|movie_id|rating|   timestamp|user_id|               title|             genre|gender|age_group|occupation|zipcode|
+--------+------+------------+-------+--------------------+------------------+------+---------+----------+-------+
|     858|     4|9.56678732E8|   6040|Godfather, The (1...|Action|Crime|Drama|     M|       25|         6|  11106|
|     593|     5|9.56678754E8|   6040|Silence of the La...|    Drama|Thriller|     M|       25|         6|  11106|
|    1961|     4|9.56678777E8|   6040|     Rain Man (1988)|             Drama|     M|       25|         6|  11106|
|    1419|     3|9.56678856E8|   6040|    Walkabout (1971)|             Drama|     M|       25|         6|  11106|
|     213|     5|9.56678856E8|   6040|Burnt By the Sun ...|             Drama|     M|       25|         6|  11106|
+--------+------+------------+-------+--------------------+------------------+--