In [4]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession
from pyspark.sql import functions as sf
from pyspark.sql.functions import col, explode
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

In [5]:
# Obtain the Spark session for this notebook; getOrCreate() reuses an
# already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Movie Recommendation App")
    .getOrCreate()
)

In [9]:

# Load the ratings CSV. The first line is treated as the header and column
# types are inferred from the data rather than declared up front.
df_ratings_small = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("the-movies-dataset/ratings_small.csv")
)

# Preview the first 20 rows.
df_ratings_small.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|     31|   2.5|1260759144|
|     1|   1029|   3.0|1260759179|
|     1|   1061|   3.0|1260759182|
|     1|   1129|   2.0|1260759185|
|     1|   1172|   4.0|1260759205|
|     1|   1263|   2.0|1260759151|
|     1|   1287|   2.0|1260759187|
|     1|   1293|   2.0|1260759148|
|     1|   1339|   3.5|1260759125|
|     1|   1343|   2.0|1260759131|
|     1|   1371|   2.5|1260759135|
|     1|   1405|   1.0|1260759203|
|     1|   1953|   4.0|1260759191|
|     1|   2105|   4.0|1260759139|
|     1|   2150|   3.0|1260759194|
|     1|   2193|   2.0|1260759198|
|     1|   2294|   2.0|1260759108|
|     1|   2455|   2.5|1260759113|
|     1|   2968|   1.0|1260759200|
|     1|   3671|   3.0|1260759117|
+------+-------+------+----------+
only showing top 20 rows


In [None]:
# Dataset size summary: total number of rating rows, plus the number of
# distinct users and distinct movies that appear in them.
ratings_counts = df_ratings_small.select("rating").count()
users_count, movies_count = (
    df_ratings_small.select(column_name).distinct().count()
    for column_name in ("userId", "movieId")
)

print(f"ratings counts: {ratings_counts}\n user counts: {users_count}\n movies count: {movies_count}")

In [None]:
# Number of ratings submitted by each user (first 20 groups are displayed).
(
    df_ratings_small
    .groupBy("userId")
    .count()
    .show()
)

In [None]:
# Random 80/20 split into training and hold-out sets; the fixed seed keeps the
# split repeatable between runs.
train, test = df_ratings_small.randomSplit(weights=[0.8, 0.2], seed=42)

In [None]:
# Explicit-feedback ALS estimator over the (userId, movieId, rating) columns.
#
# seed=42 matches the seed used for the train/test split. Without an explicit
# seed the factor initialisation is not pinned by this notebook, so the RMSE
# and the recommendations below could differ from one kernel run to the next.
als = ALS(
    rank=15,                   # number of latent factors
    maxIter=10,
    regParam=0.1,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    implicitPrefs=False,       # ratings are explicit scores, not implicit counts
    coldStartStrategy="drop",  # per Spark docs: drop rows whose prediction is NaN
    seed=42,
)


In [None]:
# Fit the ALS estimator on the training split to obtain the fitted model.
model = als.fit(dataset=train)

In [None]:
# Evaluate on the hold-out split: RMSE between the true "rating" column and
# the model's "prediction" column.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)

# Score every (user, movie) pair in the test split.
predictions = model.transform(test)

rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse:.4f}")

In [None]:
# Persist the fitted model so it can be reloaded later; overwrite() replaces
# any model previously saved at the same path.
(
    model.write()
    .overwrite()
    .save("models/ratings_small_model-latent-features-15")
)

In [None]:
# Top-5 recommended movies for every user known to the model.
userRecs = model.recommendForAllUsers(numItems=5)
print("User Recommendations:")
# Show five users without truncating the recommendations column.
userRecs.show(n=5, truncate=False)

In [None]:
# Flatten the per-user recommendation arrays into one row per
# (user, recommended movie). `col` and `explode` come from the notebook's
# top-level import cell rather than a mid-notebook import.
#
# The recommended movieId is exposed under the name "id" because the metadata
# join further down keys on a column called "id".
flatRecs = (
    userRecs
    .withColumn("rec", explode(col("recommendations")))
    .select(
        col("userId"),
        col("rec.movieId").alias("id"),
        col("rec.rating").alias("predicted_rating"),
    )
)

flatRecs.show(n=10, truncate=False)


In [None]:
## Getting movie metadata
# NOTE(review): assumes movies_metadata.csv is the Kaggle "The Movies Dataset"
# file, whose free-text columns are quoted and may contain embedded newlines
# and doubled ("") quotes -- confirm against the actual file.
# With Spark's default CSV options such records are split across several rows
# and the columns shift, which corrupts `id`, `title` and `poster_path`.
# multiLine lets a quoted field span lines; escape='"' makes a doubled quote
# inside a quoted field parse as a literal quote.
df_movies_metadata = spark.read.csv(
    "the-movies-dataset/movies_metadata.csv",
    header=True,
    inferSchema=True,
    multiLine=True,
    quote='"',
    escape='"',
)
df_movies_metadata.show()


In [None]:
## Selecting only relevant metadata
# Keep the join key (`id`) plus the two fields used for display.
selected_metadata = df_movies_metadata.select("id", "title", "poster_path")
selected_metadata.show()

In [None]:
# Attach title/poster metadata to each recommendation.
#
# NOTE(review): assumes the Kaggle "The Movies Dataset" layout -- confirm.
# There, ratings_small.csv (and therefore the ALS item id carried in
# flatRecs.id) uses MovieLens movieIds, while movies_metadata.csv is keyed by
# TMDB id. Joining the two directly pairs recommendations with unrelated
# movies, so the ids are translated through links_small.csv
# (columns: movieId, imdbId, tmdbId) first.
df_links_small = spark.read.csv(
    "the-movies-dataset/links_small.csv", header=True, inferSchema=True
)

# MovieLens movieId -> TMDB id, with the TMDB id exposed as "id" so it lines
# up with the metadata key.
movie_to_tmdb = df_links_small.select("movieId", sf.col("tmdbId").alias("id"))

recommendations = (
    flatRecs
    .withColumnRenamed("id", "movieId")               # make the MovieLens id explicit
    .join(movie_to_tmdb, on="movieId", how="inner")   # adds the TMDB id as "id"
    .join(selected_metadata, on="id", how="inner")    # TMDB id -> title / poster_path
)
recommendations.show(10)