In [1]:
from pyspark.sql import SparkSession

In [2]:
MAX_MEMORY = "5g"
spark = SparkSession.builder.appName("movie-rec")\
    .config("spark.executor.memory", MAX_MEMORY)\
    .config("spark.driver.memory", MAX_MEMORY)\
    .getOrCreate()

In [3]:
ratings_file = "/Users/keon/fastcampus/data-engineering/01-spark/data/ml-25m/ratings.csv" 
ratings_df = spark.read.csv(f"file:///{ratings_file}", inferSchema = True, header = True)

In [4]:
ratings_df.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    296|   5.0|1147880044|
|     1|    306|   3.5|1147868817|
|     1|    307|   5.0|1147868828|
|     1|    665|   5.0|1147878820|
|     1|    899|   3.5|1147868510|
|     1|   1088|   4.0|1147868495|
|     1|   1175|   3.5|1147868826|
|     1|   1217|   3.5|1147878326|
|     1|   1237|   5.0|1147868839|
|     1|   1250|   4.0|1147868414|
|     1|   1260|   3.5|1147877857|
|     1|   1653|   4.0|1147868097|
|     1|   2011|   2.5|1147868079|
|     1|   2012|   2.5|1147868068|
|     1|   2068|   2.5|1147869044|
|     1|   2161|   3.5|1147868609|
|     1|   2351|   4.5|1147877957|
|     1|   2573|   4.0|1147878923|
|     1|   2632|   5.0|1147878248|
|     1|   2692|   5.0|1147869100|
+------+-------+------+----------+
only showing top 20 rows



In [5]:
ratings_df = ratings_df.select(["userId", "movieId", "rating"])
ratings_df.show()

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|    296|   5.0|
|     1|    306|   3.5|
|     1|    307|   5.0|
|     1|    665|   5.0|
|     1|    899|   3.5|
|     1|   1088|   4.0|
|     1|   1175|   3.5|
|     1|   1217|   3.5|
|     1|   1237|   5.0|
|     1|   1250|   4.0|
|     1|   1260|   3.5|
|     1|   1653|   4.0|
|     1|   2011|   2.5|
|     1|   2012|   2.5|
|     1|   2068|   2.5|
|     1|   2161|   3.5|
|     1|   2351|   4.5|
|     1|   2573|   4.0|
|     1|   2632|   5.0|
|     1|   2692|   5.0|
+------+-------+------+
only showing top 20 rows



In [6]:
ratings_df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)



In [7]:
ratings_df.select("rating").describe().show()

+-------+------------------+
|summary|            rating|
+-------+------------------+
|  count|          25000095|
|   mean| 3.533854451353085|
| stddev|1.0607439611423535|
|    min|               0.5|
|    max|               5.0|
+-------+------------------+



In [8]:
(train_df, test_df) = ratings_df.randomSplit([0.8, 0.2])

In [9]:
# maxIter: 반복 수 
# regParam: regularization prarmeter (default 1.0)
# coldStartStrategy: 학습하지 못한 데이터에 어떻게 대처할지 선택 (drop vs nan)
from pyspark.ml.recommendation import ALS

als = ALS(maxIter=5, 
          regParam=0.1, 
          userCol='userId', 
          itemCol='movieId', 
          ratingCol='rating',
          coldStartStrategy='drop')

In [10]:
model = als.fit(train_df)

In [12]:
predictions = model.transform(test_df)

In [13]:
predictions.show()

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|    31|   1580|   3.0| 2.3412633|
|    31|   3175|   1.5| 2.4575193|
|   101|   8638|   5.0| 3.5509882|
|   137|   1645|   3.0|  3.078713|
|   155|   1580|   2.5| 3.4677393|
|   243|   1580|   3.0|  2.746449|
|   243|  44022|   3.0| 2.4469478|
|   322|    463|   3.0| 3.3249109|
|   368|   3175|   5.0| 3.6138074|
|   385|   1088|   3.0| 3.2864473|
|   588|   1645|   2.5| 2.7411766|
|   596|   1580|   3.0| 3.5674548|
|   597|   1645|   5.0| 3.5412421|
|   597|   3175|   5.0| 3.8528943|
|   613|   1580|   3.0| 3.3705337|
|   633|   1591|   5.0|  3.303601|
|   642|   1580|   3.5| 3.4872642|
|   756|   1580|   4.0| 3.5168695|
|   756|  44022|   3.5| 3.1852522|
|   772|    471|   4.0| 3.4680028|
+------+-------+------+----------+
only showing top 20 rows



In [15]:
predictions.select('rating', 'prediction').describe().show()

+-------+------------------+------------------+
|summary|            rating|        prediction|
+-------+------------------+------------------+
|  count|           5000395|           5000395|
|   mean|3.5343617854189517| 3.435012802644237|
| stddev| 1.060628011981919|0.6466347894252432|
|    min|               0.5|        -1.5015955|
|    max|               5.0|         6.5376873|
+-------+------------------+------------------+



In [17]:
# Root Mean Square Error
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(metricName='rmse', 
                                labelCol='rating',
                                predictionCol='prediction')

rmse = evaluator.evaluate(predictions)
print(rmse)

0.8113079596776658


In [18]:
userRecsAll = model.recommendForAllUsers(3)
userRecsAll.show()



+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    28|[{194434, 7.51302...|
|    31|[{203882, 3.77619...|
|    34|[{139036, 6.48936...|
|    53|[{194334, 6.63180...|
|    65|[{144202, 6.29712...|
|    78|[{139036, 6.97625...|
|    81|[{173871, 4.68658...|
|    85|[{201821, 5.79802...|
|   101|[{203086, 5.04414...|
|   108|[{194434, 5.66323...|
|   115|[{203882, 6.04178...|
|   126|[{203882, 6.18815...|
|   133|[{194434, 5.66756...|
|   137|[{203086, 5.47062...|
|   148|[{194434, 5.94313...|
|   155|[{194434, 6.07637...|
|   183|[{194434, 5.92454...|
|   193|[{194434, 5.53216...|
|   210|[{139036, 9.06887...|
|   211|[{194434, 6.29485...|
+------+--------------------+
only showing top 20 rows



In [19]:
movieRecsAll = model.recommendForAllItems(3)
movieRecsAll.show()

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     28|[{115651, 5.99797...|
|     31|[{87426, 5.399622...|
|     34|[{115651, 5.65836...|
|     53|[{115651, 6.28357...|
|     65|[{87426, 5.520707...|
|     78|[{87426, 4.701583...|
|     81|[{87426, 4.836845...|
|     85|[{115651, 5.37445...|
|    101|[{115651, 5.24595...|
|    108|[{81029, 4.957504...|
|    115|[{115651, 5.93916...|
|    126|[{87426, 4.905772...|
|    133|[{108346, 5.35546...|
|    137|[{115651, 5.20377...|
|    148|[{115651, 4.30622...|
|    155|[{105801, 5.03672...|
|    183|[{87426, 5.213244...|
|    193|[{87426, 5.025292...|
|    210|[{87426, 4.675188...|
|    211|[{115651, 5.24861...|
+-------+--------------------+
only showing top 20 rows



In [21]:
from pyspark.sql.types import IntegerType

# Generally we dont need to look for all user
# Create a list of users, whom we want to recommend
user_list = [148, 463, 267]
users_df = spark.createDataFrame(user_list, IntegerType()).toDF('userId')

users_df.take(3)

[Row(userId=148), Row(userId=463), Row(userId=267)]

In [24]:
# top 5 rec for subsets
user_recs = model.recommendForUserSubset(users_df, 5)
user_recs.toPandas()



Unnamed: 0,userId,recommendations
0,148,"[(194434, 5.943130970001221), (183947, 5.56757..."
1,463,"[(194434, 6.498035430908203), (201821, 6.13922..."
2,267,"[(194434, 5.766074180603027), (151989, 5.49247..."


In [30]:
# extract movies
movies_list = user_recs.collect()[0].recommendations
movies_list

[Row(movieId=194434, rating=5.943130970001221),
 Row(movieId=183947, rating=5.567570209503174),
 Row(movieId=203882, rating=5.5637102127075195),
 Row(movieId=166790, rating=5.414093017578125),
 Row(movieId=203086, rating=5.393131256103516)]

In [33]:
recommendations_df = spark.createDataFrame(movies_list)
recommendations_df.show()

+-------+------------------+
|movieId|            rating|
+-------+------------------+
| 194434| 5.943130970001221|
| 183947| 5.567570209503174|
| 203882|5.5637102127075195|
| 166790| 5.414093017578125|
| 203086| 5.393131256103516|
+-------+------------------+



In [32]:
# attach movie names
movies_file = "/Users/keon/fastcampus/data-engineering/01-spark/data/ml-25m/movies.csv" 
movies_df = spark.read.csv(movies_file,
                           header=True,
                           ignoreLeadingWhiteSpace= True)
movies_df.show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

In [34]:
recommendations_df.createOrReplaceTempView("recommendations")
movies_df.createOrReplaceTempView("movies")

In [43]:
query = """
SELECT
    *
FROM
    movies JOIN recommendations
    ON movies.movieId = recommendations.movieId
ORDER BY rating desc
"""
recommended_movies = spark.sql(query)
recommended_movies.show()

+-------+--------------------+------------------+-------+------------------+
|movieId|               title|            genres|movieId|            rating|
+-------+--------------------+------------------+-------+------------------+
| 194434|   Adrenaline (1990)|(no genres listed)| 194434| 5.943130970001221|
| 183947|NOFX Backstage Pa...|(no genres listed)| 183947| 5.567570209503174|
| 203882|Dead in the Water...|            Horror| 203882|5.5637102127075195|
| 166790|My Future Love (2...|(no genres listed)| 166790| 5.414093017578125|
| 203086|Truth and Justice...|             Drama| 203086| 5.393131256103516|
+-------+--------------------+------------------+-------+------------------+



In [44]:
from pyspark.sql.types import IntegerType

def get_recommendation(user_id, num_recs):
    users_df = spark.createDataFrame([user_id], IntegerType()).toDF('userId')
    user_rec_df = model.recommendForUserSubset(users_df, num_recs)
    
    recs_list = userRecs.collect()[0].recommendations
    recs_df = spark.createDataFrame(recs_list)
    
    recommended_movies = spark.sql(query)
    return recommended_movies

In [45]:
recs = get_recommendation(456, 10)
recs.toPandas()



Unnamed: 0,movieId,title,genres,movieId.1,rating
0,194434,Adrenaline (1990),(no genres listed),194434,5.943131
1,183947,NOFX Backstage Passport 2,(no genres listed),183947,5.56757
2,203882,Dead in the Water (2006),Horror,203882,5.56371
3,166790,My Future Love (2016),(no genres listed),166790,5.414093
4,203086,Truth and Justice (2019),Drama,203086,5.393131


In [46]:
spark.stop()