In [12]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
# Creating session
spark = SparkSession.builder.appName("Movie Recommendation App").getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/16 15:48:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:

# Reading the ratings_small csv file

df_ratings_small  = spark.read.csv("the-movies-dataset/ratings_small.csv", header=True, inferSchema=True)

df_ratings_small.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|     31|   2.5|1260759144|
|     1|   1029|   3.0|1260759179|
|     1|   1061|   3.0|1260759182|
|     1|   1129|   2.0|1260759185|
|     1|   1172|   4.0|1260759205|
|     1|   1263|   2.0|1260759151|
|     1|   1287|   2.0|1260759187|
|     1|   1293|   2.0|1260759148|
|     1|   1339|   3.5|1260759125|
|     1|   1343|   2.0|1260759131|
|     1|   1371|   2.5|1260759135|
|     1|   1405|   1.0|1260759203|
|     1|   1953|   4.0|1260759191|
|     1|   2105|   4.0|1260759139|
|     1|   2150|   3.0|1260759194|
|     1|   2193|   2.0|1260759198|
|     1|   2294|   2.0|1260759108|
|     1|   2455|   2.5|1260759113|
|     1|   2968|   1.0|1260759200|
|     1|   3671|   3.0|1260759117|
+------+-------+------+----------+
only showing top 20 rows


In [6]:
#counting data
ratings_counts = df_ratings_small.select("rating").count()
users_count = df_ratings_small.select("userId").distinct().count()
movies_count = df_ratings_small.select("movieId").distinct().count()

print(f"ratings counts: {ratings_counts}\n user counts: {users_count}\n movies count: {movies_count}")

ratings counts: 100004
 user counts: 671
 movies count: 9066


In [8]:
df_ratings_small.groupBy("userId").count().show()

+------+-----+
|userId|count|
+------+-----+
|   148|  132|
|   463|  483|
|   471|  216|
|   496|  126|
|   243|  307|
|   392|   25|
|   540|   20|
|   623|  103|
|    31|   69|
|   516|  149|
|    85|  107|
|   137|   80|
|   251|  119|
|   451|   52|
|   580|  922|
|    65|   27|
|   458|   76|
|    53|   46|
|   255|  145|
|   481|  436|
+------+-----+
only showing top 20 rows


In [9]:
(train, test) = df_ratings_small.randomSplit([0.8, 0.2], seed=42)

In [13]:
als = ALS(
    maxIter=10,
    regParam=0.1,
    rank=15,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop", 
    implicitPrefs=False, 

    )


In [14]:
# Train model
model = als.fit(train)

In [15]:
# Test model
predictions = model.transform(test)

evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction"
)

rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse:.4f}")

Root-mean-square error = 0.9163


In [18]:
# Saving model for later use
model.save("models/ratings_small_model-latent-features-15")

25/09/16 16:26:34 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
25/09/16 16:26:34 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
25/09/16 16:26:34 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers
25/09/16 16:26:34 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
25/09/16 16:26:34 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
25/09/16 16:26:34 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
25/09/16 16:26:34 WARN MemoryManager: Total allocation exceeds 95.00%

In [19]:
## Get recommendations for users
userRecs = model.recommendForAllUsers(5)
print("User Recommendations:")
userRecs.show(5, truncate=False)

User Recommendations:
+------+---------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                    |
+------+---------------------------------------------------------------------------------------------------+
|1     |[{106920, 3.6661716}, {73290, 3.5237124}, {7087, 3.4215543}, {246, 3.3955903}, {3730, 3.3685875}]  |
|2     |[{83411, 4.965969}, {83359, 4.965969}, {83318, 4.965969}, {67504, 4.965969}, {7087, 4.6772537}]    |
|3     |[{83411, 4.6553082}, {83359, 4.6553082}, {83318, 4.6553082}, {67504, 4.6553082}, {8132, 4.4711556}]|
|4     |[{83411, 6.013468}, {83359, 6.013468}, {83318, 6.013468}, {67504, 6.013468}, {1192, 5.7722087}]    |
|5     |[{8535, 4.774306}, {1948, 4.7083797}, {65037, 4.6307316}, {59684, 4.6307316}, {2819, 4.612341}]    |
+------+--------------------------------------------------------------------------------------------------

                                                                                

In [20]:
## Getting movie metadata
df_movies_metadata  = spark.read.csv("the-movies-dataset/movies_metadata.csv", header=True, inferSchema=True)
df_movies_metadata.show()


+-----+---------------------+--------+--------------------+--------------------+-----+---------+-----------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------+--------------------+--------------------+--------+--------------------+-----------------+
|adult|belongs_to_collection|  budget|              genres|            homepage|   id|  imdb_id|original_language|      original_title|            overview|popularity|         poster_path|production_companies|production_countries|        release_date|             revenue|             runtime|    spoken_languages|  status|             tagline|               title|   video|        vote_average|       vote_count|
+-----+---------------------+--------+--------------------+--------------------+-----+---------+-----------------+--------------------+--------------------+----------+-----

In [None]:
## Selecting only relevant metadata
