# AWS Glue Studio Notebook
##### You are now running a AWS Glue Studio notebook; To start using your notebook you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [None]:
%help

####  Run this cell to set up and start your interactive session.


In [None]:
%idle_timeout 2880
%glue_version 4.0
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

In [63]:
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import udf,col,when
import numpy as np




In [65]:
spark = SparkSession.builder.appName('Recommendation').getOrCreate()




In [66]:
sc = spark.sparkContext
sqlContext=SQLContext(sc)




In [67]:
ratings_df=spark.read.csv('s3://collinson-task-bucket/ratings.csv',inferSchema=True,header=True)
ratings_df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)


In [68]:
ratings_df.show()

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
|     1|     70|   3.0|964982400|
|     1|    101|   5.0|964980868|
|     1|    110|   4.0|964982176|
|     1|    151|   5.0|964984041|
|     1|    157|   5.0|964984100|
|     1|    163|   5.0|964983650|
|     1|    216|   5.0|964981208|
|     1|    223|   3.0|964980985|
|     1|    231|   5.0|964981179|
|     1|    235|   4.0|964980908|
|     1|    260|   5.0|964981680|
|     1|    296|   3.0|964982967|
|     1|    316|   3.0|964982310|
|     1|    333|   5.0|964981179|
|     1|    349|   4.0|964982563|
+------+-------+------+---------+
only showing top 20 rows


In [69]:
movies_df=spark.read.csv('s3://collinson-task-bucket/movies.csv',inferSchema=True,header=True)
movies_df.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)


In [70]:
movies_df.show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

In [71]:
links_df=spark.read.csv('s3://collinson-task-bucket/links.csv',inferSchema=True,header=True)
links_df.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- imdbId: integer (nullable = true)
 |-- tmdbId: integer (nullable = true)


In [72]:
training_df,validation_df=ratings_df.randomSplit([0.8,0.2])




In [73]:
iterations=10
regularization_parameter=0.1
rank=4
error=[]
err=0




In [74]:
als = ALS(maxIter=iterations,regParam=regularization_parameter,rank=5,userCol="userId",itemCol="movieId",ratingCol="rating")
model=als.fit(training_df)
predictions=model.transform(validation_df)
new_predictions=predictions.filter(col('prediction')!=np.nan)
evaluator=RegressionEvaluator(metricName="rmse",labelCol="rating",predictionCol="prediction")
rmse=evaluator.evaluate(new_predictions)
print("Root Mean Square Error="+str(rmse))

Root Mean Square Error=0.8883878490081075


In [75]:
for rank in range(4,10):
    als = ALS(maxIter=iterations,regParam=regularization_parameter,rank=rank,userCol="userId",itemCol="movieId",ratingCol="rating")
    model=als.fit(training_df)
    predictions=model.transform(validation_df)
    new_predictions=predictions.filter(col('prediction')!=np.nan)
    evaluator=RegressionEvaluator(metricName="rmse",labelCol="rating",predictionCol="prediction")
    rmse=evaluator.evaluate(new_predictions)
    print("Root Mean Square Error="+str(rmse))
    

Root Mean Square Error=0.8853513503062906
Root Mean Square Error=0.8883878490081075
Root Mean Square Error=0.8809132293698405
Root Mean Square Error=0.8863081614713044
Root Mean Square Error=0.8868264891075929
Root Mean Square Error=0.8833562944459635


In [78]:
predictions.show(n=10)

+------+-------+------+---------+----------+
|userId|movieId|rating|timestamp|prediction|
+------+-------+------+---------+----------+
|     1|      3|   4.0|964981247| 3.8932889|
|     1|    101|   5.0|964980868| 4.5461535|
|     1|    157|   5.0|964984100| 2.9828806|
|     1|    223|   3.0|964980985|   4.75011|
|     1|    590|   4.0|964982546|  4.266899|
|     1|    673|   3.0|964981775|  3.081884|
|     1|    943|   4.0|964983614| 4.3108025|
|     1|   1030|   3.0|964982903| 3.4207942|
|     1|   1032|   5.0|964982791| 4.1412992|
|     1|   1042|   4.0|964981179| 3.4126763|
+------+-------+------+---------+----------+
only showing top 10 rows


In [80]:
predictions.join(movies_df,"movieId").select("userId","title","genres","prediction").show(10)

+------+--------------------+--------------------+----------+
|userId|               title|              genres|prediction|
+------+--------------------+--------------------+----------+
|    12| Little Women (1994)|               Drama|  4.358953|
|    12|Groundhog Day (1993)|Comedy|Fantasy|Ro...|  4.321611|
|    12| Billy Elliot (2000)|               Drama| 5.2798014|
|    18|         Heat (1995)|Action|Crime|Thri...| 3.9423351|
|    18|       Casino (1995)|         Crime|Drama|  4.074382|
|    18|  Taxi Driver (1976)|Crime|Drama|Thriller| 4.0844536|
|    18|Die Hard: With a ...|Action|Crime|Thri...| 3.4808984|
|    18|Lion King, The (1...|Adventure|Animati...| 3.9378774|
|    18|Naked Gun 33 1/3:...|       Action|Comedy| 2.7522535|
|    18|Carlito's Way (1993)|         Crime|Drama| 3.8539643|
+------+--------------------+--------------------+----------+
only showing top 10 rows


In [85]:
for_one_user = predictions.filter(col("userId")==12).join(movies_df,"movieId").join(links_df,"movieId").select("userId","title","genres","tmdbId","prediction")
for_one_user.show(10)

+------+--------------------+--------------------+------+----------+
|userId|               title|              genres|tmdbId|prediction|
+------+--------------------+--------------------+------+----------+
|    12| Little Women (1994)|               Drama|  9587|  4.358953|
|    12|Groundhog Day (1993)|Comedy|Fantasy|Ro...|   137|  4.321611|
|    12| Billy Elliot (2000)|               Drama|    71| 5.2798014|
+------+--------------------+--------------------+------+----------+


In [86]:
userRecommends=model.recommendForAllUsers(5)
movieRecommends=model.recommendForAllItems(5)




In [87]:
userRecommends.printSchema()

root
 |-- userId: integer (nullable = false)
 |-- recommendations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- movieId: integer (nullable = true)
 |    |    |-- rating: float (nullable = true)


In [88]:
userRecommends.select("userId","recommendations.movieId").show(10,False)

+------+------------------------------------+
|userId|movieId                             |
+------+------------------------------------+
|1     |[720, 51931, 5915, 132333, 5490]    |
|2     |[131724, 78836, 1241, 86377, 171495]|
|3     |[1194, 70946, 5746, 6835, 5181]     |
|5     |[51931, 89904, 7096, 8477, 6818]    |
|10    |[32892, 542, 42730, 90439, 74946]   |
|12    |[32892, 27611, 68073, 945, 3022]    |
|16    |[51931, 3022, 3379, 7767, 28]       |
|18    |[51931, 177593, 3379, 3022, 171495] |
|20    |[7096, 51931, 3088, 26258, 7815]    |
|22    |[27611, 1014, 7116, 61350, 27251]   |
+------+------------------------------------+
only showing top 10 rows


In [89]:
movieRecommends.printSchema()

root
 |-- movieId: integer (nullable = false)
 |-- recommendations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- userId: integer (nullable = true)
 |    |    |-- rating: float (nullable = true)


In [90]:
movieRecommends.select("movieId","recommendations.userId").show(10,False)

+-------+-------------------------+
|movieId|userId                   |
+-------+-------------------------+
|12     |[258, 53, 558, 327, 413] |
|18     |[53, 12, 35, 258, 43]    |
|38     |[543, 594, 584, 544, 192]|
|70     |[53, 452, 276, 441, 171] |
|93     |[543, 53, 594, 43, 554]  |
|161    |[543, 53, 584, 594, 544] |
|186    |[543, 584, 35, 594, 43]  |
|190    |[224, 275, 603, 138, 98] |
|218    |[485, 236, 96, 164, 267] |
|225    |[543, 53, 243, 544, 584] |
+-------+-------------------------+
only showing top 10 rows


In [91]:
users=ratings_df.select("userId").distinct().limit(5)
users.show()

+------+
|userId|
+------+
|    12|
|    18|
|    38|
|    67|
|    70|
+------+


In [92]:
userSubsetRecs = model.recommendForUserSubset(users,10)
userSubsetRecs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    12|[{32892, 6.124181...|
|    70|[{51931, 5.329582...|
|    67|[{26171, 5.450884...|
|    18|[{51931, 4.992954...|
|    38|[{26171, 4.870558...|
+------+--------------------+


In [93]:
userSubsetRecs.select("userId","recommendations.movieId").show(10,False)

+------+---------------------------------------------------------------------+
|userId|movieId                                                              |
+------+---------------------------------------------------------------------+
|12    |[32892, 27611, 68073, 945, 3022, 157296, 177593, 7121, 45503, 166568]|
|70    |[51931, 720, 3022, 7096, 28, 8477, 26258, 27523, 6442, 8235]         |
|67    |[26171, 32892, 86347, 177593, 92535, 103984, 1366, 40148, 3379, 3022]|
|18    |[51931, 177593, 3379, 3022, 171495, 28, 7767, 78836, 27523, 6442]    |
|38    |[26171, 1949, 1262, 1250, 25850, 6650, 5915, 5490, 132333, 3089]     |
+------+---------------------------------------------------------------------+


In [94]:
movies=ratings_df.select("movieId").distinct().limit(5)
movies.show()

+-------+
|movieId|
+-------+
|     70|
|    673|
|   1030|
|   1092|
|   1256|
+-------+


In [95]:
movieSubsetRecs = model.recommendForItemSubset(movies,10)
movieSubsetRecs.select("movieId","recommendations.userId").show(10,False)

+-------+------------------------------------------------+
|movieId|userId                                          |
+-------+------------------------------------------------+
|1092   |[53, 543, 375, 276, 236, 171, 243, 371, 99, 122]|
|1256   |[502, 99, 499, 371, 53, 393, 505, 69, 40, 1]    |
|673    |[53, 243, 543, 276, 492, 43, 35, 337, 236, 548] |
|70     |[53, 452, 276, 441, 171, 375, 37, 533, 549, 258]|
|1030   |[276, 502, 43, 543, 53, 236, 35, 243, 505, 99]  |
+-------+------------------------------------------------+


In [98]:
movie_ids=[1580,3175,2366,1590]
user_ids=[543,543,543,543]
new_user_preds=sqlContext.createDataFrame(zip(movie_ids,user_ids),schema=['movieId','userId'])
new_predictions=model.transform(new_user_preds)
new_predictions.show()

+-------+------+----------+
|movieId|userId|prediction|
+-------+------+----------+
|   1580|   543| 4.5465755|
|   3175|   543|  4.225012|
|   2366|   543| 4.3932304|
|   1590|   543|  2.048086|
+-------+------+----------+
