In [0]:
from pyspark.sql import functions as F

# Load the `gold_fact_ratings` table; `df` is the base DataFrame for the cells below.
df = spark.table(tableName="gold_fact_ratings")

# Preview the first five rows.
df.show(n=5)


+------+-------+------+---------+-------------------+-----------+
|userId|movieId|rating|timestamp|       timestamp_ts|rating_date|
+------+-------+------+---------+-------------------+-----------+
|   236|   2718|   3.0|943015211|1999-11-19 12:40:11| 1999-11-19|
|   236|   2542|   5.0|943015062|1999-11-19 12:37:42| 1999-11-19|
|   236|   2459|   4.0|943013896|1999-11-19 12:18:16| 1999-11-19|
|   236|   2580|   5.0|943015701|1999-11-19 12:48:21| 1999-11-19|
|   236|   1350|   4.0|943013896|1999-11-19 12:18:16| 1999-11-19|
+------+-------+------+---------+-------------------+-----------+
only showing top 5 rows


In [0]:
# Print the column names and types of the ratings DataFrame.
df.printSchema()


root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: float (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- timestamp_ts: timestamp (nullable = true)
 |-- rating_date: date (nullable = true)



In [0]:
# 80/20 train/test split with a fixed seed.
train, test = df.randomSplit(weights=[0.8, 0.2], seed=42)


In [0]:
from pyspark.ml.recommendation import ALS


In [0]:
# Explicit-feedback ALS estimator over (userId, movieId, rating).
als = ALS(
    # Column wiring
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    # Model hyper-parameters
    rank=10,
    maxIter=10,
    regParam=0.1,
    # Behaviour switches
    implicitPrefs=False,
    nonnegative=True,
    coldStartStrategy="drop",
)


In [0]:
# Fit the ALS estimator on the training split.
model = als.fit(dataset=train)


In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

# RMSE between the true `rating` and the model's `prediction` column.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)

# Score the held-out split and evaluate it.
predictions = model.transform(dataset=test)
rmse = evaluator.evaluate(dataset=predictions)

print("RMSE:", rmse)


RMSE: 0.7752334453818759


In [0]:
# Distinct user ids present in the ratings table.
users = (
    df
    .select("userId")
    .distinct()
)


In [0]:
from pyspark.sql.functions import col

# Second handle on the same ratings table (used for the anti-join below).
ratings = spark.table(tableName="gold_fact_ratings")


In [0]:
# Distinct ids on each side of the user x movie candidate grid.
users = (
    df
    .select("userId")
    .distinct()
)
movies = (
    df
    .select("movieId")
    .distinct()
)


In [0]:
# Every (userId, movieId) combination.
user_movie_pairs = users.crossJoin(other=movies)


In [0]:
# Keep only pairs with no existing row in `ratings` (left anti join on both keys).
candidates = user_movie_pairs.join(
    ratings,
    on=["userId", "movieId"],
    how="left_anti",
)


In [0]:
# Score the unrated candidate pairs; this rebinds `predictions`.
predictions = model.transform(dataset=candidates)


In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

# Per-user ordering by predicted rating, highest first.
window = Window.partitionBy("userId").orderBy(col("prediction").desc())

# Number the rows inside each user partition, keep the first ten, then drop the helper column.
ranked_predictions = predictions.withColumn("rank", row_number().over(window))
top_recs = ranked_predictions.filter(col("rank") <= 10).drop("rank")


In [0]:
# Persist the per-user top recommendations, replacing any existing table contents.
top_recs.write.saveAsTable("gold_user_recommendations", mode="overwrite")


In [0]:
# Print the fitted model's summary string.
print(model)


ALSModel: uid=ALS_a6e201edc16d, rank=10


In [0]:
# Movie dimension table; only movieId and title are needed for the lookup.
df_movies = spark.table(tableName="silver_dim_movies_enriched")
movie_titles = df_movies.select("movieId", "title")

# Left join so that recommendations without a matching movie row are kept.
recs_with_titles = top_recs.join(movie_titles, on="movieId", how="left")

recs_with_titles.show(n=20, truncate=False)


+-------+------+----------+-----------------------------------------------------------+
|movieId|userId|prediction|title                                                      |
+-------+------+----------+-----------------------------------------------------------+
|3379   |1     |5.8383117 |On the Beach (1959)                                        |
|170355 |1     |5.8383117 |Mulholland Dr. (1999)                                      |
|33649  |1     |5.816472  |Saving Face (2004)                                         |
|132333 |1     |5.691397  |Seve (2014)                                                |
|5490   |1     |5.691397  |The Big Bus (1976)                                         |
|3653   |1     |5.611389  |Endless Summer, The (1966)                                 |
|27523  |1     |5.5818534 |My Sassy Girl (Yeopgijeogin geunyeo) (2001)                |
|84847  |1     |5.568922  |Emma (2009)                                                |
|171495 |1     |5.558476  |Cosmo

In [0]:
# Print name, type and temporary flag for every catalog table whose name starts with "gold_".
tables = spark.catalog.listTables()
for t in tables:
    if not t.name.startswith("gold_"):
        continue
    print(f"{t.name}  |  {t.tableType}  |  {t.isTemporary}")


gold_fact_ratings  |  MANAGED  |  False
gold_power_users  |  MANAGED  |  False
gold_user_recommendations  |  MANAGED  |  False


In [0]:
# Sanity-check the two gold tables: preview a few rows and report the row count of each.
# Counts are printed explicitly because a notebook cell only displays its LAST bare
# expression; a bare `.count()` in the middle of the cell is computed and thrown away.

# Fact ratings
fact_ratings_check = spark.table("gold_fact_ratings")
fact_ratings_check.show(5)
print("gold_fact_ratings rows:", fact_ratings_check.count())

# Recommendations
user_recs_check = spark.table("gold_user_recommendations")
user_recs_check.show(5, truncate=False)
print("gold_user_recommendations rows:", user_recs_check.count())


+------+-------+------+----------+-------------------+-----------+
|userId|movieId|rating| timestamp|       timestamp_ts|rating_date|
+------+-------+------+----------+-------------------+-----------+
|   249| 114074|   4.0|1425894202|2015-03-09 09:43:22| 2015-03-09|
|   249| 129354|   4.5|1425894209|2015-03-09 09:43:29| 2015-03-09|
|   236|   2718|   3.0| 943015211|1999-11-19 12:40:11| 1999-11-19|
|   236|   2542|   5.0| 943015062|1999-11-19 12:37:42| 1999-11-19|
|   236|   2459|   4.0| 943013896|1999-11-19 12:18:16| 1999-11-19|
+------+-------+------+----------+-------------------+-----------+
only showing top 5 rows
+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|1     |3379   |5.8383117 |
|1     |170355 |5.8383117 |
|1     |33649  |5.816472  |
|1     |132333 |5.691397  |
|1     |5490   |5.691397  |
+------+-------+----------+
only showing top 5 rows


6100

In [0]:
from pyspark.sql import functions as F

# Average rating and number of ratings per movie, best-rated first
# (ties on the average are broken by the number of ratings).
movie_rating_stats = (
    spark.table("gold_fact_ratings")
    .groupBy("movieId")
    .agg(
        F.avg("rating").alias("avg_rating"),
        F.count("rating").alias("num_ratings"),
    )
    .orderBy(F.desc("avg_rating"), F.desc("num_ratings"))
)
movie_rating_stats.show(10)


+-------+----------+-----------+
|movieId|avg_rating|num_ratings|
+-------+----------+-----------+
|   6818|       5.0|          2|
|   1151|       5.0|          2|
|   6442|       5.0|          2|
|  78836|       5.0|          2|
|     53|       5.0|          2|
|     99|       5.0|          2|
|   3473|       5.0|          2|
|  96608|       5.0|          1|
|   3303|       5.0|          1|
| 126921|       5.0|          1|
+-------+----------+-----------+
only showing top 10 rows


In [0]:
import mlflow

# Record the ALS run in MLflow.
# - `regParam` is a parameter of the ALS *estimator* (`als`), so it is read from there;
#   reading it from the fitted model via getattr() fell back to the string 'not available'.
# - The metric is taken from the `rmse` variable computed by the evaluation cell instead of
#   a hand-typed literal, so the logged value always matches the evaluated model.
with mlflow.start_run():
    mlflow.log_param("rank", model.rank)
    mlflow.log_param("regParam", als.getRegParam())
    mlflow.log_metric("rmse", rmse)


In [0]:
# Target directory for the persisted ALS model.
model_path = "/FileStore/tables/movie-recommender-data-raw/gold_als_model"

# Write the model, replacing anything already stored at that path.
model_writer = model.write().overwrite()
model_writer.save(model_path)

print("Model saved successfully")


Model saved successfully


In [0]:
# List the contents of the saved model directory (displayed as the cell result).
dbutils.fs.ls("/FileStore/tables/movie-recommender-data-raw/gold_als_model")


[FileInfo(path='dbfs:/FileStore/tables/movie-recommender-data-raw/gold_als_model/itemFactors/', name='itemFactors/', size=0, modificationTime=1771413193000),
 FileInfo(path='dbfs:/FileStore/tables/movie-recommender-data-raw/gold_als_model/metadata/', name='metadata/', size=0, modificationTime=1771413191000),
 FileInfo(path='dbfs:/FileStore/tables/movie-recommender-data-raw/gold_als_model/userFactors/', name='userFactors/', size=0, modificationTime=1771413192000)]

In [0]:
from pyspark.ml.recommendation import ALSModel

# Reload the persisted model to confirm that it can be read back.
loaded_model = ALSModel.load(
    path="/FileStore/tables/movie-recommender-data-raw/gold_als_model"
)

print(loaded_model)


ALSModel: uid=ALS_a6e201edc16d, rank=10


In [0]:
from pyspark.sql.functions import count, desc

ratings = spark.table(tableName="gold_fact_ratings")

# Number of ratings per user, most active users first.
ratings_per_user = ratings.groupBy("userId").agg(count("*").alias("rating_count"))
top_users = ratings_per_user.orderBy(desc("rating_count"))

top_users.show(n=10)


+------+------------+
|userId|rating_count|
+------+------------+
|   414|        2698|
|   599|        2478|
|   474|        2108|
|   448|        1864|
|   274|        1346|
|   610|        1302|
|    68|        1260|
|   380|        1218|
|   606|        1115|
|   288|        1055|
+------+------------+
only showing top 10 rows


In [0]:
# Persist the per-user rating counts, replacing any existing table contents.
top_users.write.saveAsTable("gold_power_users", mode="overwrite")


In [0]:
# Print name, type and temporary flag for every catalog table whose name starts with "gold_".
tables = spark.catalog.listTables()
for t in tables:
    if not t.name.startswith("gold_"):
        continue
    print(f"{t.name}  |  {t.tableType}  |  {t.isTemporary}")


gold_fact_ratings  |  MANAGED  |  False
gold_power_users  |  MANAGED  |  False
gold_user_recommendations  |  MANAGED  |  False


In [0]:
from pyspark.sql.functions import count, desc

recs = spark.table(tableName="gold_user_recommendations")

# How many users each movie was recommended to, most recommended first.
recs_per_movie = recs.groupBy("movieId").agg(count("*").alias("recommendation_count"))
top_movies = recs_per_movie.orderBy(desc("recommendation_count"))

top_movies.show(n=20)


+-------+--------------------+
|movieId|recommendation_count|
+-------+--------------------+
| 170355|                 312|
|   3379|                 304|
|   7842|                 276|
|  33649|                 255|
|   1066|                 158|
|  89904|                 140|
|  72171|                 136|
|  78836|                 132|
|  60943|                 106|
|   7096|                 106|
|  84847|                 104|
| 138966|                 104|
| 179135|                 103|
|   7841|                  99|
|  59018|                  93|
|   7748|                  89|
|   4256|                  86|
| 171495|                  85|
|   6650|                  84|
|   4495|                  76|
+-------+--------------------+
only showing top 20 rows


In [0]:
# Note: `movies` is rebound here from the distinct-id frame to the full movie dimension table.
movies = spark.table(tableName="silver_dim_movies_enriched")

# Attach every movie attribute to the recommendation counts (left join keeps all counted movies).
top_movies_with_titles = top_movies.join(movies, on="movieId", how="left")

top_movies_with_titles.show(n=20)


+-------+--------------------+--------------------+--------------------+--------------------+-------+------+------+--------+------+
|movieId|recommendation_count|               title|              genres|        genres_array| imdbId|tmdbId|actors|director|poster|
+-------+--------------------+--------------------+--------------------+--------------------+-------+------+------+--------+------+
|   5222|                  14|Kissing Jessica S...|      Comedy|Romance|   [Comedy, Romance]|0264761| 15647|  NULL|    NULL|  NULL|
|    611|                   1|Hellraiser: Blood...|Action|Horror|Sci-Fi|[Action, Horror, ...|0116514|  8766|  NULL|    NULL|  NULL|
|   2936|                  15|Sullivan's Travel...|Adventure|Comedy|...|[Adventure, Comed...|0034240| 16305|  NULL|    NULL|  NULL|
|  84273|                  27|Zeitgeist: Moving...|         Documentary|       [Documentary]|1781069| 54293|  NULL|    NULL|  NULL|
|  51931|                  63|Reign Over Me (2007)|               Drama|    

In [0]:
# Quick look at the joined frame: a few rows, then its schema.
top_movies_with_titles.show(n=5)
top_movies_with_titles.printSchema()


+-------+--------------------+--------------------+--------------------+--------------------+-------+------+------+--------+------+
|movieId|recommendation_count|               title|              genres|        genres_array| imdbId|tmdbId|actors|director|poster|
+-------+--------------------+--------------------+--------------------+--------------------+-------+------+------+--------+------+
|   5222|                  14|Kissing Jessica S...|      Comedy|Romance|   [Comedy, Romance]|0264761| 15647|  NULL|    NULL|  NULL|
|    611|                   1|Hellraiser: Blood...|Action|Horror|Sci-Fi|[Action, Horror, ...|0116514|  8766|  NULL|    NULL|  NULL|
|   2936|                  15|Sullivan's Travel...|Adventure|Comedy|...|[Adventure, Comed...|0034240| 16305|  NULL|    NULL|  NULL|
|  84273|                  27|Zeitgeist: Moving...|         Documentary|       [Documentary]|1781069| 54293|  NULL|    NULL|  NULL|
|  51931|                  63|Reign Over Me (2007)|               Drama|    

In [0]:
# Export the id/title/tmdbId columns as CSV with a header row, replacing earlier output.
(
    top_movies_with_titles
    .select("movieId", "title", "tmdbId")
    .write
    .csv("/FileStore/tmp/top_movies_scrape.csv", mode="overwrite", header=True)
)

In [0]:
# Read the exported CSV back (first line is the header) and render the first 10 rows.
df_check = spark.read.csv("/FileStore/tmp/top_movies_scrape.csv", header=True)
display(df_check.limit(10))


movieId,title,tmdbId
5222,Kissing Jessica Stein (2001),15647
611,Hellraiser: Bloodline (1996),8766
2936,Sullivan's Travels (1941),16305
84273,Zeitgeist: Moving Forward (2011),54293
51931,Reign Over Me (2007),2355
107565,"Fuck You, Goethe (Fack Ju Göhte) (2013)",233063
7841,Children of Dune (2003),192936
26133,"Charlie Brown Christmas, A (1965)",13187
3096,My Man Godfrey (1957),52470
5992,"Hours, The (2002)",590


In [0]:
from pyspark.sql.functions import desc

# The 500 most frequently recommended movies, reduced to the columns needed downstream.
top_500_movies = (
    top_movies_with_titles
    .orderBy(desc("recommendation_count"))
    .limit(500)
    .select("movieId", "title", "tmdbId")
)


In [0]:
# Collapse to a single partition before writing the CSV (with header, overwriting old output).
single_partition = top_500_movies.coalesce(1)
single_partition.write.csv(
    "/FileStore/tmp/top_500_movies.csv",
    mode="overwrite",
    header=True,
)

# Direct download link for the created file:
# https://adb-7405616516337746.6.azuredatabricks.net/files/tmp/top_500_movies.csv

In [0]:
# Locate the CSV part file inside the Spark output directory and copy it to a flat,
# easy-to-download path under /FileStore.
files = dbutils.fs.ls("/FileStore/tmp/top_500_movies.csv")
csv_parts = [f.path for f in files if f.name.endswith(".csv")]

# Fail with a clear message instead of a bare IndexError when the export produced no part file.
if not csv_parts:
    raise FileNotFoundError(
        "No .csv part file found under /FileStore/tmp/top_500_movies.csv; "
        "re-run the export cell first."
    )
csv_file = csv_parts[0]  # pick the part-00000 file

# Move to root for easy download (the boolean result is displayed as the cell output).
dbutils.fs.cp(csv_file, "/FileStore/top_500_movies_final.csv")


True

In [0]:
from pyspark.sql import SparkSession

# Re-obtain the active session (getOrCreate returns the existing one if present).
spark = SparkSession.builder.getOrCreate()

# Scraped movie metadata; note that `df_movies` is rebound to this CSV-backed frame.
df_movies = spark.read.csv(
    "/FileStore/tables/movie-recommender-data-raw/top_500_movies_scraped.csv",
    header=True,
)
df_movies.show(n=5)


+------+--------------+--------------------+------------+--------------------+-------+------------+--------------------+
|tmdbId|         title|              genres|release_date|            overview|runtime|vote_average|         poster_path|
+------+--------------+--------------------+------------+--------------------+-------+------------+--------------------+
| 35412|  On the Beach|['Science Fiction...|  1959-12-17|In 1964, atomic w...|    134|       6.659|/lTDuj5zalrI0fpQe...|
| 19316|   Saving Face|['Comedy', 'Roman...|  2004-09-12|A Chinese-America...|     97|       7.271|/7XbbgkKi4nEMOu9i...|
| 31530|Shall We Dance|['Comedy', 'Roman...|  1937-05-07|Ballet star Petro...|    109|       7.264|/6anwn2NxTw7NwVEE...|
| 74643|    The Artist|['Drama', 'Comedy...|  2011-10-12|Hollywood, 1927: ...|    100|       7.443|/A7zDHjlYPi1peLyB...|
| 24804|Black Dynamite|['Comedy', 'Action']|  2009-10-16|This is the story...|     85|       7.097|/u3oWQDz0JggzzsVl...|
+------+--------------+---------

In [0]:
# Print the schema of the scraped-movies frame (all columns are read as strings from the CSV).
df_movies.printSchema()


root
 |-- tmdbId: string (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- vote_average: string (nullable = true)
 |-- poster_path: string (nullable = true)



In [0]:
from pyspark.sql.functions import col

# Cast the join key from string to int; re-running this cell is harmless.
df_movies = df_movies.withColumn(
    "tmdbId",
    col("tmdbId").cast("int"),
)


In [0]:
# NOTE(review): `gold_dim_movies` does not appear in any of the `gold_` catalog listings
# printed in this notebook, and `df_ratings` / `df_movies_joined` are not used by later
# visible cells. Confirm whether this cell is still needed.
df_ratings = spark.table(tableName="gold_fact_ratings")
df_links = spark.table(tableName="gold_dim_movies")  # must contain movieId + tmdbId

# Keep only movies present on both sides of the tmdbId key.
df_movies_joined = df_links.join(df_movies, "tmdbId", "inner")


In [0]:
# movieId / imdbId / tmdbId mapping, read from the raw links CSV (header row present).
df_links = spark.read.csv(
    "/FileStore/tables/movie-recommender-data-raw/links.csv",
    header=True,
)

df_links.printSchema()


root
 |-- movieId: string (nullable = true)
 |-- imdbId: string (nullable = true)
 |-- tmdbId: string (nullable = true)



In [0]:
from pyspark.sql.functions import col

# Cast both id columns from string to int so they can be joined on.
df_links = (
    df_links
    .withColumn("movieId", col("movieId").cast("int"))
    .withColumn("tmdbId", col("tmdbId").cast("int"))
)


In [0]:
# Same cast as applied to `df_movies` earlier; repeating it does not change the column.
df_movies = df_movies.withColumn(
    "tmdbId",
    col("tmdbId").cast("int"),
)


In [0]:
# Attach movieId/imdbId to the scraped metadata; the inner join drops unmatched rows.
df_movies_enriched = df_links.join(df_movies, "tmdbId", "inner")


In [0]:
# Persist the enriched movie dimension, replacing any existing table contents.
df_movies_enriched.write.saveAsTable("gold_dim_movies_enriched", mode="overwrite")


In [0]:
from pyspark.sql.functions import col

# This cell repeats the casts and the join from the preceding cells, then previews the result.
df_links = (
    df_links
    .withColumn("movieId", col("movieId").cast("int"))
    .withColumn("tmdbId", col("tmdbId").cast("int"))
)
df_movies = df_movies.withColumn("tmdbId", col("tmdbId").cast("int"))

# Inner join on tmdbId: only movies found in both frames remain.
df_movies_enriched = df_links.join(df_movies, "tmdbId", "inner")

df_movies_enriched.show(n=5)


+------+-------+-------+--------------+--------------------+------------+--------------------+-------+------------+--------------------+
|tmdbId|movieId| imdbId|         title|              genres|release_date|            overview|runtime|vote_average|         poster_path|
+------+-------+-------+--------------+--------------------+------------+--------------------+-------+------------+--------------------+
| 35412|   3379|0053137|  On the Beach|['Science Fiction...|  1959-12-17|In 1964, atomic w...|    134|       6.659|/lTDuj5zalrI0fpQe...|
| 19316|  33649|0384504|   Saving Face|['Comedy', 'Roman...|  2004-09-12|A Chinese-America...|     97|       7.271|/7XbbgkKi4nEMOu9i...|
| 31530|   1066|0029546|Shall We Dance|['Comedy', 'Roman...|  1937-05-07|Ballet star Petro...|    109|       7.264|/6anwn2NxTw7NwVEE...|
| 74643|  89904|1655442|    The Artist|['Drama', 'Comedy...|  2011-10-12|Hollywood, 1927: ...|    100|       7.443|/A7zDHjlYPi1peLyB...|
| 24804|  72171|1190536|Black Dynamite|['

In [0]:
# Print name, type and temporary flag for every catalog table whose name starts with "gold_".
tables = spark.catalog.listTables()
for t in tables:
    if not t.name.startswith("gold_"):
        continue
    print(f"{t.name}  |  {t.tableType}  |  {t.isTemporary}")


gold_dim_movies_enriched  |  MANAGED  |  False
gold_fact_ratings  |  MANAGED  |  False
gold_power_users  |  MANAGED  |  False
gold_user_recommendations  |  MANAGED  |  False


In [0]:
# Reload the ratings table and print its schema.
ratings = spark.table(tableName="gold_fact_ratings")
ratings.printSchema()


root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: float (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- timestamp_ts: timestamp (nullable = true)
 |-- rating_date: date (nullable = true)



In [0]:
from pyspark.ml.recommendation import ALS

ratings = spark.table(tableName="gold_fact_ratings")

# Same 80/20 split and seed as the first training run.
train, test = ratings.randomSplit(weights=[0.8, 0.2], seed=42)

# ALS re-configured here; unlike the earlier ALS(...) cell, `nonnegative` and
# `implicitPrefs` are not passed, so the library defaults apply to both.
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    regParam=0.1,
    maxIter=10,
    rank=10,
)

# Rebinds `model` to the newly fitted model.
model = als.fit(dataset=train)


In [0]:
# Ask the model for 10 recommended items for every user.
user_recs = model.recommendForAllUsers(numItems=10)


In [0]:
ratings = spark.table(tableName="gold_fact_ratings")

# Distinct user ids present in the ratings table.
users = (
    ratings
    .select("userId")
    .distinct()
)


In [0]:
# Ask for 10 recommended items for the users in `users`; this rebinds `user_recs`.
user_recs = model.recommendForUserSubset(dataset=users, numItems=10)


In [0]:
ratings = spark.table(tableName="gold_fact_ratings")

# Distinct ids on each side of the grid.
users = (
    ratings
    .select("userId")
    .distinct()
)
movies = (
    ratings
    .select("movieId")
    .distinct()
)

# Every (userId, movieId) combination.
user_movie_pairs = users.crossJoin(other=movies)


In [0]:
# Score only the (user, movie) pairs that have no existing row in `ratings`.
# Scoring the full cross join would let already-rated movies into each user's top 10;
# the left anti join mirrors how `candidates` was built for the first recommendation table.
unseen_pairs = user_movie_pairs.join(ratings, on=["userId", "movieId"], how="left_anti")
predictions = model.transform(unseen_pairs)


In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, col

# Per-user ordering by predicted rating, highest first.
window_spec = Window.partitionBy("userId").orderBy(col("prediction").desc())

# Number the rows inside each user partition, keep the first ten, then drop the helper column.
ranked = predictions.withColumn("rank", row_number().over(window_spec))
top10 = ranked.filter("rank <= 10").drop("rank")

top10.show(n=5)


+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|     1|   3379| 5.8713207|
|     1| 170355| 5.8713207|
|     1|  33649|  5.712371|
|     1| 171495| 5.5977917|
|     1|   6442|  5.506807|
+------+-------+----------+
only showing top 5 rows


In [0]:
# Persist the flat (userId, movieId, prediction) top-10 table, replacing existing contents.
top10.write.saveAsTable("gold_user_recommendations_flat", mode="overwrite")


In [0]:
# Print name, type and temporary flag for every catalog table whose name starts with "gold_".
tables = spark.catalog.listTables()
for t in tables:
    if not t.name.startswith("gold_"):
        continue
    print(f"{t.name}  |  {t.tableType}  |  {t.isTemporary}")


gold_dim_movies_enriched  |  MANAGED  |  False
gold_fact_ratings  |  MANAGED  |  False
gold_power_users  |  MANAGED  |  False
gold_user_recommendations  |  MANAGED  |  False
gold_user_recommendations_flat  |  MANAGED  |  False


In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

# RMSE between the true `rating` and the model's `prediction` column.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)

# Score the held-out split (rebinds `predictions`) and evaluate it.
predictions = model.transform(dataset=test)
rmse = evaluator.evaluate(dataset=predictions)
print("RMSE:", rmse)


RMSE: 0.7977473495200248


In [0]:
# Read the saved flat recommendations back and show their schema and first rows.
flat_recs_check = spark.table("gold_user_recommendations_flat")
flat_recs_check.printSchema()
flat_recs_check.show(n=5)


root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- prediction: float (nullable = true)

+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|     1|   3379| 5.8713207|
|     1| 170355| 5.8713207|
|     1|  33649|  5.712371|
|     1| 171495| 5.5977917|
|     1|   6442|  5.506807|
+------+-------+----------+
only showing top 5 rows


In [0]:
# `recs` and `movies` are rebound to the saved flat recommendations and the enriched movie table.
recs = spark.table(tableName="gold_user_recommendations_flat")
movies = spark.table(tableName="gold_dim_movies_enriched")

# Left join: recommendations with no match in `movies` are kept with NULL movie columns.
final_recs = recs.join(movies, "movieId", "left")


In [0]:
# Persist the joined recommendations, replacing any existing table contents.
final_recs.write.saveAsTable("gold_user_recommendations_enriched", mode="overwrite")


In [0]:
# Print name, type and temporary flag for every catalog table whose name starts with "gold_".
tables = spark.catalog.listTables()
for t in tables:
    if not t.name.startswith("gold_"):
        continue
    print(f"{t.name}  |  {t.tableType}  |  {t.isTemporary}")


gold_dim_movies_enriched  |  MANAGED  |  False
gold_fact_ratings  |  MANAGED  |  False
gold_power_users  |  MANAGED  |  False
gold_user_recommendations  |  MANAGED  |  False
gold_user_recommendations_enriched  |  MANAGED  |  False
gold_user_recommendations_flat  |  MANAGED  |  False


In [0]:
# Recommendations for user 1 from the enriched table, highest prediction first.
user1_recs_query = """
SELECT userId, title, genres, prediction
FROM gold_user_recommendations_enriched
WHERE userId = 1
ORDER BY prediction DESC
"""
spark.sql(user1_recs_query).show(10, truncate=False)


+------+-------------------------+---------------------------------------+----------+
|userId|title                    |genres                                 |prediction|
+------+-------------------------+---------------------------------------+----------+
|1     |On the Beach             |['Science Fiction', 'Drama', 'Romance']|5.8713207 |
|1     |NULL                     |NULL                                   |5.8713207 |
|1     |Saving Face              |['Comedy', 'Romance', 'Drama']         |5.712371  |
|1     |NULL                     |NULL                                   |5.5977917 |
|1     |Belle Époque             |['Comedy', 'Drama', 'Romance']         |5.506807  |
|1     |My Sassy Girl            |['Drama', 'Comedy', 'Romance']         |5.4446435 |
|1     |Enter the Void           |['Fantasy', 'Drama']                   |5.42667   |
|1     |Black Dynamite           |['Comedy', 'Action']                   |5.42667   |
|1     |Nasu: Summer in Andalusia|['Animation', 'Actio

In [0]:
# Inner join: only recommendations that have a matching row in `movies` are kept.
final_recs_clean = recs.join(movies, "movieId", "inner")


In [0]:
# `final_recs_clean` is the inner-joined frame built in the previous cell.
# Overwrite the enriched recommendations table with it.
final_recs_clean.write.saveAsTable(
    "gold_user_recommendations_enriched",
    mode="overwrite",
)

# Verify by reading the table back.
enriched_readback = spark.table("gold_user_recommendations_enriched")
enriched_readback.show(n=10, truncate=False)


+-------+------+----------+------+-------+-------------------------+---------------------------------------+------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+------------+--------------------------------+
|movieId|userId|prediction|tmdbId|imdbId |title                    |genres                                 |release_date|overview                                                                                                                                                                                                                                         

In [0]:
# Recommendations for user 1 from the rewritten enriched table, highest prediction first.
user1_recs_query = """
SELECT userId, title, genres, prediction
FROM gold_user_recommendations_enriched
WHERE userId = 1
ORDER BY prediction DESC
"""
spark.sql(user1_recs_query).show(10, truncate=False)


+------+-------------------------+---------------------------------------+----------+
|userId|title                    |genres                                 |prediction|
+------+-------------------------+---------------------------------------+----------+
|1     |On the Beach             |['Science Fiction', 'Drama', 'Romance']|5.8713207 |
|1     |Saving Face              |['Comedy', 'Romance', 'Drama']         |5.712371  |
|1     |Belle Époque             |['Comedy', 'Drama', 'Romance']         |5.506807  |
|1     |My Sassy Girl            |['Drama', 'Comedy', 'Romance']         |5.4446435 |
|1     |Nasu: Summer in Andalusia|['Animation', 'Action']                |5.42667   |
|1     |Enter the Void           |['Fantasy', 'Drama']                   |5.42667   |
|1     |Black Dynamite           |['Comedy', 'Action']                   |5.42667   |
+------+-------------------------+---------------------------------------+----------+



In [0]:
# Quality checks on the enriched recommendations table.
# Both results are printed explicitly: a notebook cell only displays its last bare
# expression, so a bare `.count()` earlier in the cell is computed and then discarded.
enriched_recs_check = spark.table("gold_user_recommendations_enriched")

# Total recommendations
print("Total recommendations:", enriched_recs_check.count())

# Check for any missing titles or genres
missing_title_or_genres = (
    enriched_recs_check
    .filter("title IS NULL OR genres IS NULL")
    .count()
)
print("Rows with missing title or genres:", missing_title_or_genres)


0

In [0]:
# All enriched columns for user 1, highest prediction first.
user1_enriched = (
    spark.table("gold_user_recommendations_enriched")
    .filter("userId = 1")
    .orderBy("prediction", ascending=False)
)
user1_enriched.show(n=10, truncate=False)


+-------+------+----------+------+-------+-------------------------+---------------------------------------+------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+------------+--------------------------------+
|movieId|userId|prediction|tmdbId|imdbId |title                    |genres                                 |release_date|overview                                                                                                                                                                                                                                         

In [0]:
from pyspark.sql.functions import count, desc

# How many users each (movieId, title) was recommended to, most recommended first.
recommendation_counts = (
    spark.table("gold_user_recommendations_enriched")
    .groupBy("movieId", "title")
    .agg(count("*").alias("recommendation_count"))
    .orderBy(desc("recommendation_count"))
)
recommendation_counts.show(n=10, truncate=False)


+-------+-------------------------+--------------------+
|movieId|title                    |recommendation_count|
+-------+-------------------------+--------------------+
|3379   |On the Beach             |322                 |
|33649  |Saving Face              |192                 |
|72171  |Black Dynamite           |144                 |
|78836  |Enter the Void           |141                 |
|138966 |Nasu: Summer in Andalusia|116                 |
|4256   |The Center of the World  |111                 |
|7096   |Rivers and Tides         |107                 |
|5915   |Escape to Victory        |88                  |
|132333 |Seve                     |75                  |
|5490   |The Big Bus              |70                  |
+-------+-------------------------+--------------------+
only showing top 10 rows


In [0]:
# Export the final recommendation columns as CSV (single partition, header row,
# replacing earlier output at the same path).
final_export = (
    spark.table("gold_user_recommendations_enriched")
    .select("userId", "movieId", "title", "genres", "prediction")
    .coalesce(1)
)
final_export.write.csv(
    "/FileStore/tables/movie-recommender/movie-recommender-final.csv",
    mode="overwrite",
    header=True,
)
