# A Basic Movie Recommendation Program

In [41]:
from pyspark.sql import  SparkSession
from pyspark.sql.types import IntegerType, FloatType 
from pyspark.sql.functions import split, col


spark = SparkSession.builder.appName("MovieRecommender").getOrCreate()

ratings = spark.read.text("ml-1m/ratings.dat")

ratings = ratings.withColumn("value", split(col("value"), "::")) \
    .select(
        col("value")[0].cast(IntegerType()).alias("user"),
        col("value")[1].cast(IntegerType()).alias("product"),
        col("value")[2].cast(FloatType()).alias("rating")
    )

ratings.show(10, truncate=False)



+----+-------+------+
|user|product|rating|
+----+-------+------+
|1   |1193   |5.0   |
|1   |661    |3.0   |
|1   |914    |3.0   |
|1   |3408   |4.0   |
|1   |2355   |5.0   |
|1   |1197   |3.0   |
|1   |1287   |5.0   |
|1   |2804   |5.0   |
|1   |594    |4.0   |
|1   |919    |4.0   |
+----+-------+------+
only showing top 10 rows



In [42]:
movies = spark.read.text("ml-1m/movies.dat")

# Process the data
movies = movies.withColumn("value", split(col("value"), "::")) \
    .select(
        col("value")[0].cast(IntegerType()).alias("movie"),
        col("value")[1].alias("title"),
        split(col("value")[2], "\\|").alias("genre")
    )

movies.show(10, truncate=False)

+-----+----------------------------------+--------------------------------+
|movie|title                             |genre                           |
+-----+----------------------------------+--------------------------------+
|1    |Toy Story (1995)                  |[Animation, Children's, Comedy] |
|2    |Jumanji (1995)                    |[Adventure, Children's, Fantasy]|
|3    |Grumpier Old Men (1995)           |[Comedy, Romance]               |
|4    |Waiting to Exhale (1995)          |[Comedy, Drama]                 |
|5    |Father of the Bride Part II (1995)|[Comedy]                        |
|6    |Heat (1995)                       |[Action, Crime, Thriller]       |
|7    |Sabrina (1995)                    |[Comedy, Romance]               |
|8    |Tom and Huck (1995)               |[Adventure, Children's]         |
|9    |Sudden Death (1995)               |[Action]                        |
|10   |GoldenEye (1995)                  |[Action, Adventure, Thriller]   |
+-----+-----

In [43]:
animated_movies = movies.rdd \
    .filter(lambda row: "Animation" in row["genre"]) \
    .take(10)

for movie in animated_movies:
    print(movie)

Row(movie=1, title='Toy Story (1995)', genre=['Animation', "Children's", 'Comedy'])
Row(movie=13, title='Balto (1995)', genre=['Animation', "Children's"])
Row(movie=48, title='Pocahontas (1995)', genre=['Animation', "Children's", 'Musical', 'Romance'])
Row(movie=239, title='Goofy Movie, A (1995)', genre=['Animation', "Children's", 'Comedy', 'Romance'])
Row(movie=244, title='Gumby: The Movie (1995)', genre=['Animation', "Children's"])
Row(movie=313, title='Swan Princess, The (1994)', genre=['Animation', "Children's"])
Row(movie=364, title='Lion King, The (1994)', genre=['Animation', "Children's", 'Musical'])
Row(movie=558, title='Pagemaster, The (1994)', genre=['Action', 'Adventure', 'Animation', "Children's", 'Fantasy'])
Row(movie=588, title='Aladdin (1992)', genre=['Animation', "Children's", 'Comedy', 'Musical'])
Row(movie=594, title='Snow White and the Seven Dwarfs (1937)', genre=['Animation', "Children's", 'Musical'])


In [44]:
personal_ratings = [
    ("Toy Story (1995)", 5.0),
    ("Jumanji (1995)", 4.0),
    ("Pocahontas (1995)", 5.0),
    ("Aladdin (1992)", 5.0),
    ("Inferno (1980)", 1.0),
    ("Balto (1995)", 5.0),
    ("Man of the Year (1995)", 5.0),
    ("Mortal Kombat (1995)", 1.0),
    ("Ace Ventura: When Nature Calls (1995)", 1.0),
    ("Heat (1995)", 1.0),
    ("Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)", 1.0),
    ("Saving Private Ryan (1998)", 1.0),
    ("Sixth Sense, The (1999)", 1.0),
    ("Wrong Trousers, The (1993)", 1.0),
    ("Indiana Jones and the Last Crusade (1989)", 1.0),
    ("Godfather, The (1972)", 1.0),
    ("Fugitive, The (1993)", 1.0),
    ("Silence of the Lambs, The (1991)", 2.0)
]

columns = ["title", "rating"]

personal_ratings = spark.createDataFrame(personal_ratings, columns)

In [45]:
from pyspark.sql.functions import lit


user_id = 0

# Join `personal_ratings` with `movies` on the "title" column
normalized_ratings = personal_ratings.join(movies, on="title") \
    .select(
        lit(user_id).alias("user"), 
        movies["movie"].alias("product"),
        personal_ratings["rating"]
    )



print(normalized_ratings.count())
movie_ids = normalized_ratings.select("product").rdd.map(lambda row: row["product"]).collect()

print("Movie IDs:", movie_ids)

18
Movie IDs: [1, 2, 48, 588, 3587, 13, 137, 44, 19, 6, 2019, 2028, 2762, 1148, 1291, 858, 457, 593]


In [46]:

# Split the ratings DataFrame into 90% training and 10% test
set = ratings.randomSplit([0.9, 0.1], seed=12345)

training = normalized_ratings.union(set[0]).cache()


training.show(10)

test = set[1].cache()

test.show(5)

print(f"Training: {training.count()}, Test: {test.count()}")

                                                                                

+----+-------+------+
|user|product|rating|
+----+-------+------+
|   0|      1|   5.0|
|   0|      2|   4.0|
|   0|     48|   5.0|
|   0|    588|   5.0|
|   0|   3587|   1.0|
|   0|     13|   5.0|
|   0|    137|   5.0|
|   0|     44|   1.0|
|   0|     19|   1.0|
|   0|      6|   1.0|
+----+-------+------+
only showing top 10 rows

+----+-------+------+
|user|product|rating|
+----+-------+------+
|   1|   1207|   4.0|
|   1|   1246|   4.0|
|   1|   2028|   5.0|
|   1|   2804|   5.0|
|   1|   3114|   4.0|
+----+-------+------+
only showing top 5 rows

Training: 900442, Test: 99785


In [47]:
from pyspark.ml.recommendation import ALS

# Define ALS model parameters
rank = 40
num_iterations = 20
regularization = 0.09

# Initialize ALS
als = ALS(
    rank=rank,
    maxIter=num_iterations,
    regParam=regularization,
    userCol="user",
    itemCol="product",
    ratingCol="rating",
    coldStartStrategy="drop" 
)

model = als.fit(training)

# Model type
print(type(model))

<class 'pyspark.ml.recommendation.ALSModel'>


In [48]:

users_products = movies.select(lit(user_id).alias("user"), movies["movie"]).rdd.map(lambda row: (row[0], row[1]))




users_products_df = users_products.toDF(["user", "product"])

users_products_df = users_products_df.join(normalized_ratings, on=["user","product"]).select('user','product', 'rating')


predictions = model.transform(users_products_df)

predictions.show()

+----+-------+------+----------+
|user|product|rating|prediction|
+----+-------+------+----------+
|   0|      1|   5.0| 3.1944888|
|   0|      2|   4.0|  2.824113|
|   0|    588|   5.0| 3.4858088|
|   0|     48|   5.0| 3.9995697|
|   0|   3587|   1.0| 1.1961099|
|   0|     13|   5.0| 3.9351368|
|   0|    137|   5.0|     4.768|
|   0|     44|   1.0| 1.2184556|
|   0|      6|   1.0|0.96165204|
|   0|     19|   1.0| 0.9759796|
|   0|   2019|   1.0| 1.5785863|
|   0|   2028|   1.0| 1.5123614|
|   0|   1148|   1.0| 1.7745425|
|   0|   2762|   1.0|  2.010128|
|   0|    858|   1.0| 1.3620778|
|   0|    593|   2.0| 1.6572498|
|   0|   1291|   1.0| 1.7261155|
|   0|    457|   1.0| 1.9003179|
+----+-------+------+----------+



In [49]:
#evaluating our model 
from pyspark.ml.evaluation import RegressionEvaluator


evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction"
)
rmse = evaluator.evaluate(predictions)

print(f"Root-mean-square error = {rmse}")


Root-mean-square error = 0.8503642724718667


In [50]:
# Filter the predictions for user ID 0
df = predictions.filter(col("user") == 0)

sorted_df = df.orderBy(col("prediction").desc())

result_df = (
    sorted_df.join(movies, sorted_df["product"] == movies["movie"])
    .select("movie", "title", "genre", 'prediction')
)


result_df.show(truncate=False)


+-----+-------------------------------------------------------------------+-----------------------------------------+----------+
|movie|title                                                              |genre                                    |prediction|
+-----+-------------------------------------------------------------------+-----------------------------------------+----------+
|1    |Toy Story (1995)                                                   |[Animation, Children's, Comedy]          |3.1944888 |
|2    |Jumanji (1995)                                                     |[Adventure, Children's, Fantasy]         |2.824113  |
|588  |Aladdin (1992)                                                     |[Animation, Children's, Comedy, Musical] |3.4858088 |
|48   |Pocahontas (1995)                                                  |[Animation, Children's, Musical, Romance]|3.9995697 |
|3587 |Inferno (1980)                                                     |[Horror]              

In [51]:
from pyspark.sql.functions import explode


# Generate top 10 recommendations for user 0
user_recommendations = model.recommendForAllUsers(10)
user_recommendations = user_recommendations.filter("user = 0")
user_recommendations = user_recommendations.select(col("user"), explode(col("recommendations").alias("recommendation"))) \
.select(col("user"),
        col("col.product").alias("product"),
        col("col.rating").alias("predicted_rating")
       )

user_recommendations = user_recommendations.join(
    movies,
    user_recommendations["product"] == movies["movie"]
).select("user", "movie", "title", "genre", 'predicted_rating')
user_recommendations.show(truncate=False)

# Generate top 10 recommendations for  product 138
item_recommendations = model.recommendForAllItems(5)
item_recommendations = item_recommendations.filter("product = 138")
item_recommendations.show(truncate=False)

                                                                                

+----+-----+--------------------------------+-------------------------------------------------+----------------+
|user|movie|title                           |genre                                            |predicted_rating|
+----+-----+--------------------------------+-------------------------------------------------+----------------+
|0   |137  |Man of the Year (1995)          |[Documentary]                                    |4.7680006       |
|0   |343  |Baby-Sitters Club, The (1995)   |[Children's]                                     |4.1542983       |
|0   |2059 |Parent Trap, The (1998)         |[Children's, Drama]                              |4.100318        |
|0   |831  |Stonewall (1995)                |[Drama]                                          |4.0957785       |
|0   |2101 |Squanto: A Warrior's Tale (1994)|[Adventure, Drama]                               |4.0034547       |
|0   |48   |Pocahontas (1995)               |[Animation, Children's, Musical, Romance]        |3



+-------+----------------------------------------------------------------------------------------------+
|product|recommendations                                                                               |
+-------+----------------------------------------------------------------------------------------------+
|138    |[{5606, 3.753554}, {2151, 3.7440114}, {3473, 3.5514183}, {4169, 3.5469933}, {2549, 3.5191498}]|
+-------+----------------------------------------------------------------------------------------------+



                                                                                

In [None]:
spark.stop()