In [177]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
import pyspark.sql.functions as f
from pyspark.sql.functions import split, explode
from pyspark.sql.functions import lit

In [7]:
movies = spark.read.load(
  "D:/school/Masters Data Science/Spring 2020/DSC650_Big Data/dsc650-master/dsc650-master/data/movielens/movies.csv",
  format="csv",
  sep=",",
  inferSchema=True,
  header=True)
    
ratings = spark.read.load(
  "D:/school/Masters Data Science/Spring 2020/DSC650_Big Data/dsc650-master/dsc650-master/data/movielens/ratings.csv",
  format="csv",
  sep=",",
  inferSchema=True,
  header=True)

In [8]:
movies.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [18]:
ratings.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [19]:
movies.show(10)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
+-------+--------------------+--------------------+
only showing top 10 rows



In [20]:
ratings.show(10)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
|     1|     70|   3.0|964982400|
|     1|    101|   5.0|964980868|
|     1|    110|   4.0|964982176|
|     1|    151|   5.0|964984041|
|     1|    157|   5.0|964984100|
+------+-------+------+---------+
only showing top 10 rows



In [25]:
left_join = ratings.join(movies, ratings.movieId == movies.movieId,how='left').drop(movies.movieId)
left_join.show()

+------+-------+------+---------+--------------------+--------------------+
|userId|movieId|rating|timestamp|               title|              genres|
+------+-------+------+---------+--------------------+--------------------+
|     1|      1|   4.0|964982703|    Toy Story (1995)|Adventure|Animati...|
|     1|      3|   4.0|964981247|Grumpier Old Men ...|      Comedy|Romance|
|     1|      6|   4.0|964982224|         Heat (1995)|Action|Crime|Thri...|
|     1|     47|   5.0|964983815|Seven (a.k.a. Se7...|    Mystery|Thriller|
|     1|     50|   5.0|964982931|Usual Suspects, T...|Crime|Mystery|Thr...|
|     1|     70|   3.0|964982400|From Dusk Till Da...|Action|Comedy|Hor...|
|     1|    101|   5.0|964980868|Bottle Rocket (1996)|Adventure|Comedy|...|
|     1|    110|   4.0|964982176|   Braveheart (1995)|    Action|Drama|War|
|     1|    151|   5.0|964984041|      Rob Roy (1995)|Action|Drama|Roma...|
|     1|    157|   5.0|964984100|Canadian Bacon (1...|          Comedy|War|
|     1|    

In [103]:
left_join.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



Split data into a train and test (80%, 20%)

In [155]:
(training, test) = left_join.randomSplit([0.8, 0.2])

In [157]:
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)

Evaluate the model by computing the RMSE on the test data

In [43]:
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 3.228243711476414


In [96]:
predictions.show()

+------+-------+------+----------+------------+
|userId|movieId|rating| timestamp|  prediction|
+------+-------+------+----------+------------+
|   436|    471|   3.0| 833530187|    0.414125|
|   602|    471|   4.0| 840876085|  0.48642504|
|   599|    471|   2.5|1498518822|   1.0633253|
|   474|    471|   3.0| 974668858|  0.81399006|
|   610|    471|   4.0|1479544381|   0.6199909|
|   520|    471|   5.0|1326609921|  0.13701607|
|   171|    471|   3.0| 866905683|  0.34739882|
|   287|    471|   4.5|1110231536|  0.13957405|
|   132|   1088|   4.0|1329984080|   0.7038103|
|   169|   1088|   4.5|1059427717|  0.76083577|
|   594|   1088|   4.5|1109035643|   0.4103617|
|   555|   1088|   4.0| 978822670|0.0028916895|
|   509|   1088|   3.0|1435992808|  0.49467292|
|    10|   1088|   3.0|1455619275|   0.2956672|
|   226|   1088|   1.0|1096420160|  0.69981956|
|    68|   1088|   3.5|1158534614|   0.8226894|
|   525|   1088|   4.5|1476478367|    0.701008|
|   600|   1088|   3.5|1237851304|   0.9

In [99]:
predictions.count()

19517

In [181]:
# Generate top 10 movie recommendations for each user
user_Recs = model.recommendForAllUsers(10)
user_Recs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|   471|[[125, 7.7852674]...|
|   463|[[58301, 8.079834...|
|   496|[[3040, 8.632759]...|
|   148|[[3706, 6.3504624...|
|   540|[[6380, 6.9957333...|
|   392|[[3676, 11.172484...|
|   243|[[1194, 11.535083...|
|    31|[[3706, 9.906838]...|
|   516|[[932, 7.6115174]...|
|   580|[[74946, 6.927839...|
|   251|[[3030, 8.3793745...|
|   451|[[3676, 9.67052],...|
|    85|[[1023, 6.7239294...|
|   137|[[2290, 7.179303]...|
|    65|[[3676, 7.5541143...|
|   458|[[1227, 10.629131...|
|   481|[[8372, 7.0657144...|
|    53|[[58301, 7.958368...|
|   255|[[34338, 10.44445...|
|   588|[[34338, 8.944455...|
+------+--------------------+
only showing top 20 rows



In [111]:
# Recs = user_Recs.show(10, False)

In [225]:
Recs = user_Recs.select('userID', 'recommendations')
Recs.show()

+------+--------------------+
|userID|     recommendations|
+------+--------------------+
|   471|[[125, 7.7852674]...|
|   463|[[58301, 8.079834...|
|   496|[[3040, 8.632759]...|
|   148|[[3706, 6.3504624...|
|   540|[[6380, 6.9957333...|
|   392|[[3676, 11.172484...|
|   243|[[1194, 11.535083...|
|    31|[[3706, 9.906838]...|
|   516|[[932, 7.6115174]...|
|   580|[[74946, 6.927839...|
|   251|[[3030, 8.3793745...|
|   451|[[3676, 9.67052],...|
|    85|[[1023, 6.7239294...|
|   137|[[2290, 7.179303]...|
|    65|[[3676, 7.5541143...|
|   458|[[1227, 10.629131...|
|   481|[[8372, 7.0657144...|
|    53|[[58301, 7.958368...|
|   255|[[34338, 10.44445...|
|   588|[[34338, 8.944455...|
+------+--------------------+
only showing top 20 rows



In [226]:
user_recommendations = model.recommendForAllUsers(10)
user_recommendations = Recs.withColumn("recommendation", explode("recommendations"))
user_recommendations = user_recommendations.drop('recommendations')
user_recommendations.show()

+------+------------------+
|userID|    recommendation|
+------+------------------+
|   471|  [125, 7.7852674]|
|   471|[103228, 7.425807]|
|   471|  [3200, 7.328513]|
|   471|   [799, 7.327564]|
|   471| [7169, 7.2728186]|
|   471|  [2606, 7.185232]|
|   471|[91976, 7.1404986]|
|   471|[171763, 6.820062]|
|   471|    [909, 6.81581]|
|   471|[57274, 6.8148074]|
|   463| [58301, 8.079834]|
|   463| [51931, 7.510384]|
|   463|  [3040, 7.056519]|
|   463|[72167, 6.9219775]|
|   463|  [4102, 6.764217]|
|   463| [2148, 6.7388506]|
|   463|  [417, 6.6251793]|
|   463| [2517, 6.5438347]|
|   463|[53123, 6.5398726]|
|   463| [34338, 6.513316]|
+------+------------------+
only showing top 20 rows



In [227]:
user_recommendations = user_recommendations.select(
'userId',
'recommendation.movieId',
'recommendation.rating'
)

user_recommendations.show()


+------+-------+---------+
|userId|movieId|   rating|
+------+-------+---------+
|   471|    125|7.7852674|
|   471| 103228| 7.425807|
|   471|   3200| 7.328513|
|   471|    799| 7.327564|
|   471|   7169|7.2728186|
|   471|   2606| 7.185232|
|   471|  91976|7.1404986|
|   471| 171763| 6.820062|
|   471|    909|  6.81581|
|   471|  57274|6.8148074|
|   463|  58301| 8.079834|
|   463|  51931| 7.510384|
|   463|   3040| 7.056519|
|   463|  72167|6.9219775|
|   463|   4102| 6.764217|
|   463|   2148|6.7388506|
|   463|    417|6.6251793|
|   463|   2517|6.5438347|
|   463|  53123|6.5398726|
|   463|  34338| 6.513316|
+------+-------+---------+
only showing top 20 rows



In [243]:
new_join = left_join.drop('rating')
new_join.show()

+------+-------+---------+--------------------+--------------------+
|userId|movieId|timestamp|               title|              genres|
+------+-------+---------+--------------------+--------------------+
|     1|      1|964982703|    Toy Story (1995)|Adventure|Animati...|
|     1|      3|964981247|Grumpier Old Men ...|      Comedy|Romance|
|     1|      6|964982224|         Heat (1995)|Action|Crime|Thri...|
|     1|     47|964983815|Seven (a.k.a. Se7...|    Mystery|Thriller|
|     1|     50|964982931|Usual Suspects, T...|Crime|Mystery|Thr...|
|     1|     70|964982400|From Dusk Till Da...|Action|Comedy|Hor...|
|     1|    101|964980868|Bottle Rocket (1996)|Adventure|Comedy|...|
|     1|    110|964982176|   Braveheart (1995)|    Action|Drama|War|
|     1|    151|964984041|      Rob Roy (1995)|Action|Drama|Roma...|
|     1|    157|964984100|Canadian Bacon (1...|          Comedy|War|
|     1|    163|964983650|    Desperado (1995)|Action|Romance|We...|
|     1|    216|964981208|Billy Ma

In [244]:
Recs_Join = user_recommendations.join(new_join, on=['userID'], how='left').drop(new_join.movieId)
Recs_Join.show()

+------+-------+---------+----------+--------------------+--------------------+
|userId|movieId|   rating| timestamp|               title|              genres|
+------+-------+---------+----------+--------------------+--------------------+
|   148|   3706|6.3504624|1482548476| Forrest Gump (1994)|Comedy|Drama|Roma...|
|   148|   3706|6.3504624|1482548478|Princess Bride, T...|Action|Adventure|...|
|   148|   3706|6.3504624|1482548613| Moulin Rouge (2001)|Drama|Musical|Rom...|
|   148|   3706|6.3504624|1482548505|Monsters, Inc. (2...|Adventure|Animati...|
|   148|   3706|6.3504624|1482548717|Harry Potter and ...|Adventure|Childre...|
|   148|   3706|6.3504624|1482548771|Lord of the Rings...|   Adventure|Fantasy|
|   148|   3706|6.3504624|1482548682|Spirited Away (Se...|Adventure|Animati...|
|   148|   3706|6.3504624|1482548755|Harry Potter and ...|   Adventure|Fantasy|
|   148|   3706|6.3504624|1482548769|Lord of the Rings...|   Adventure|Fantasy|
|   148|   3706|6.3504624|1482548514| Fi

In [245]:
columns_to_drop = ['timestamp', 'genres']
Final_Recs = Recs_Join.drop(*columns_to_drop)
Final_Recs.show()

+------+-------+---------+--------------------+
|userId|movieId|   rating|               title|
+------+-------+---------+--------------------+
|   148|   3706|6.3504624| Forrest Gump (1994)|
|   148|   3706|6.3504624|Princess Bride, T...|
|   148|   3706|6.3504624| Moulin Rouge (2001)|
|   148|   3706|6.3504624|Monsters, Inc. (2...|
|   148|   3706|6.3504624|Harry Potter and ...|
|   148|   3706|6.3504624|Lord of the Rings...|
|   148|   3706|6.3504624|Spirited Away (Se...|
|   148|   3706|6.3504624|Harry Potter and ...|
|   148|   3706|6.3504624|Lord of the Rings...|
|   148|   3706|6.3504624| Finding Nemo (2003)|
|   148|   3706|6.3504624|Lord of the Rings...|
|   148|   3706|6.3504624|Harry Potter and ...|
|   148|   3706|6.3504624|Phantom of the Op...|
|   148|   3706|6.3504624|Howl's Moving Cas...|
|   148|   3706|6.3504624|Pride & Prejudice...|
|   148|   3706|6.3504624|Harry Potter and ...|
|   148|   3706|6.3504624|V for Vendetta (2...|
|   148|   3706|6.3504624|  Ratatouille 

In [246]:
Final_Recs[Final_Recs['userId'] == 127].show(truncate=False)
Final_Recs[Final_Recs['userId'] == 151].show(truncate=False)
Final_Recs[Final_Recs['userId'] == 300].show(truncate=False)

+------+-------+---------+----------------------------------+
|userId|movieId|rating   |title                             |
+------+-------+---------+----------------------------------+
|127   |535    |11.879617|Madness of King George, The (1994)|
|127   |535    |11.879617|Vertigo (1958)                    |
|127   |535    |11.879617|Some Like It Hot (1959)           |
|127   |535    |11.879617|Maltese Falcon, The (1941)        |
|127   |535    |11.879617|Cool Hand Luke (1967)             |
|127   |535    |11.879617|Field of Dreams (1989)            |
|127   |535    |11.879617|Wag the Dog (1997)                |
|127   |535    |11.879617|West Side Story (1961)            |
|127   |535    |11.879617|Driving Miss Daisy (1989)         |
|127   |535    |11.879617|Splash (1984)                     |
|127   |535    |11.879617|Sixteen Candles (1984)            |
|127   |535    |11.879617|Fly, The (1986)                   |
|127   |535    |11.879617|Thomas Crown Affair, The (1999)   |
|127   |