1. Imports

In [12]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, from_json, col, explode, collect_list,concat_ws,lit
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.sql.functions import sum as _sum

def init_spark():
    """Build the notebook's SparkSession (or fetch it if one already exists)."""
    builder = SparkSession.builder.appName("Python Spark SQL basic example")
    builder = builder.config("spark.some.config.option", "some-value")
    return builder.getOrCreate()

2. Data Processing

In [21]:
spark = init_spark()

# Raw movie metadata; the first CSV line is the header and column types are inferred.
file = spark.read.csv("data/movies_metadata.csv", header=True, inferSchema=True)

# The raw "genres" text is parsed as a JSON array of {id, name} structs.
genres_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
])

# Single pipeline from the raw file to one row per movie with a
# comma-separated genre string.
movies_df = (
    file.select("id", "title", "genres", "original_title")
    .distinct()
    # Parse the genres text into an array of structs
    .withColumn("genres", from_json(col("genres"), ArrayType(genres_schema)))
    # One row per (movie, genre), keeping only the genre name
    .select("id", "original_title", "title", explode(col("genres")).alias("genre"))
    .select("id", "original_title", "title", col("genre.name").alias("genre_name"))
    .dropna()
    # Back to one row per movie, with its genre names gathered into a list
    .groupBy("id", "title", "original_title")
    .agg(collect_list("genre_name").alias("genres"))
    # Flatten the list into a single "A,B,C" string
    .withColumn("genres", concat_ws(",", "genres"))
)

+------+--------------------+--------------------+--------------------+
|    id|               title|      original_title|              genres|
+------+--------------------+--------------------+--------------------+
|100010|      Flight Command|      Flight Command|           Drama,War|
| 10004|         Desperation|         Desperation|Drama,Fantasy,Hor...|
|100042|  Dumb and Dumber To|  Dumb and Dumber To|              Comedy|
| 10005|Behind Enemy Line...|Behind Enemy Line...|Action,Adventure,...|
|100063|            Blackout|            Blackout|     Action,Thriller|
| 10013|Peggy Sue Got Mar...|Peggy Sue Got Mar...|Comedy,Drama,Fant...|
| 10014|A Nightmare on El...|A Nightmare on El...|              Horror|
| 10015|    Heartbreak Ridge|    Heartbreak Ridge|Action,Comedy,Dra...|
| 10016|      Ghosts of Mars|      Ghosts of Mars|Action,Horror,Sci...|
|100167|         No Way Home|         No Way Home|               Drama|
|100196|           Julian Po|           Julian Po|Comedy,Drama,R

3. Recommender System (Content-Based)

In [23]:
def recommend_movies_weight_matrix(movies_df, moviesInput, inputRatings):
    """Recommend movies by scoring each title against a genre profile of rated movies.

    Steps:
      1. One-hot encode the comma-separated ``genres`` column (one 0/1 column
         per distinct genre name).
      2. Build the user profile: for every genre, add up the ratings of the
         input movies tagged with it, then normalise so the weights sum to 1.
      3. Score every non-input movie as the sum of the profile weights of its
         genres, print the 10 best and return them.

    Args:
        movies_df: Spark DataFrame with a ``title`` column and a ``genres``
            column holding comma-separated genre names.
        moviesInput: list of titles the user has rated. Every row whose title
            matches is used; a title listed more than once contributes the sum
            of its ratings.
        inputRatings: ratings, position-aligned with ``moviesInput``.

    Returns:
        Spark DataFrame with the 10 highest-scoring movies: the columns of
        ``movies_df`` without ``title``, one weighted column per genre, and
        the total score in ``sum``.

    Raises:
        IndexError: if ``inputRatings`` is shorter than ``moviesInput``.
        ValueError: if ``movies_df`` yields no genre names, or if the ratings
            of the matched input movies add up to zero (which is also the case
            when no input title is found in ``movies_df``).
    """
    # Distinct genre names in the catalogue. Blank tokens are skipped so they
    # never turn into columns.
    genres = (
        movies_df.select("genres").rdd
        .flatMap(lambda row: [name for name in (row[0] or "").split(",") if name])
        .distinct()
        .collect()
    )
    if not genres:
        raise ValueError("movies_df contains no genre names to build a profile from")

    # One 0/1 column per genre. The genre string is wrapped in commas and
    # searched for ",<genre>," so only whole names match; a plain substring
    # test would also count a genre whose name is contained in another one.
    delimited_genres = concat_ws(",", lit(""), col("genres"), lit(""))
    for genre in genres:
        movies_df = movies_df.withColumn(
            genre, delimited_genres.contains("," + genre + ",").cast("int")
        )

    is_input_movie = col("title").isin(moviesInput)

    # Per-row rating: each title comparison contributes its rating on matching
    # rows and 0 elsewhere. This replaces rebuilding the whole DataFrame with a
    # filter + union for every input movie.
    rating_factor = sum(
        (col("title") == movie).cast("int") * inputRatings[i]
        for i, movie in enumerate(moviesInput)
    )

    # User profile: rating-weighted genre totals over the input movies only,
    # computed in a single aggregation.
    profile_row = (
        movies_df.filter(is_input_movie)
        .select([_sum(col(genre) * rating_factor).alias(genre) for genre in genres])
        .first()
    )

    # Spark's sum over zero rows is NULL (None here); treat it as 0.
    genre_totals = {genre: profile_row[genre] or 0 for genre in genres}
    total = sum(genre_totals.values())
    if not total:
        raise ValueError(
            "Cannot build a genre profile: no input movie matched a title in "
            "movies_df, or the ratings add up to zero"
        )

    # Normalise in Python from the row already collected, instead of running
    # one extra Spark job per genre to read the same numbers back.
    weights = {genre: value / total for genre, value in genre_totals.items()}

    # Exclude the input movies from the recommendations, then weight each genre flag.
    scored = movies_df.filter(~is_input_movie)
    for genre in genres:
        scored = scored.withColumn(genre, col(genre) * weights[genre])

    # Total score per movie, best first, top 10.
    scored = scored.withColumn("sum", sum(col(genre) for genre in genres))
    top_recommendations = (
        scored.orderBy("sum", ascending=False).limit(10).drop(col("title"))
    )

    # show() only prints and returns None, so display first and return the
    # DataFrame itself.
    top_recommendations.show(truncate=False)
    return top_recommendations


# Titles the user has rated, each paired with its rating
rated_movies = [
    ("Brother Bear", 10),
    ("Toy Story", 9),
    ("The Dark Knight", 1),
    ("The Fly", 1),
]
moviesInput = [title for title, score in rated_movies]
ratings = [score for title, score in rated_movies]

# Run the content-based recommender on the processed catalogue
recommended_movies = recommend_movies_weight_matrix(movies_df, moviesInput, ratings)


+------+----------------------------------------------+-------------------------------------------------------------------------+-------+------------------+--------------------+-------------------+--------------------+-------+-------+-------+-----------+-------+-----+--------+-------------------+-------------------+---+--------------------+--------------------+-------+-----+------------------+------------------+
|id    |original_title                                |genres                                                                   |Mystery|Comedy            |Action              |Adventure          |Science Fiction     |Western|Fantasy|Romance|Documentary|Foreign|Music|TV Movie|Animation          |Drama              |War|Horror              |Thriller            |History|Crime|Family            |sum               |
+------+----------------------------------------------+-------------------------------------------------------------------------+-------+------------------+------------

In [15]:
    # # Train and fit to kmeans model (can change k and seed later)
    # kmeans = KMeans(k=20)
    # model = kmeans.fit(movies_df.select("features"))
    
    # # Create a dataframe with the input movies and their ratings
    # input_df = spark.createDataFrame(zip(moviesInput, ratings), schema=["title", "rating"])

    # # Join the input movies with the movies dataframe to get their features
    # input_features_df = input_df.join(movies_df, on="title", how="inner").select("id", "features", "rating")
    # print("input features df")
    # input_features_df.show()

    # print("ids of inputs")

    # inputIds = []
    # for row in input_features_df.select("id").collect():
    #     inputIds.append(row["id"])
    # print(inputIds)

    # # Make predictions for the input movies using the kmeans model
    # predictions_df = model.transform(input_features_df).drop("features")
    # print("predictions df")
    # predictions_df.show()

    # # Get the cluster label for the input movies
    # cluster_label = predictions_df.select("prediction").distinct().collect()[0]["prediction"]
    # print("cluster label")
    # print(cluster_label)

    # # Get the ids of the movies in the same cluster as the input movies
    # cluster_movies_df = model.transform(movies_df).filter(col("prediction") == cluster_label)
    # cluster_movies_ids = cluster_movies_df.select("id").rdd.flatMap(lambda x: x).collect()
    # print("cluster movies id")
    # print(cluster_movies_ids)

    # # Keep only the movies that fall in the input movies' cluster, then exclude the input movies themselves
    # recommendations_df = movies_df.filter(col('id').isin(cluster_movies_ids))
    # recommendations_df = recommendations_df.filter(~col('id').isin(inputIds))

    # print("recommendations df (Exclude the input movies from the recommendations AND keep ids in clust)")
    # recommendations_df.show(truncate=False)

    # #----------------------------------------------------------------------------------


    # # Join with the predictions dataframe to get the rating of the recommended movies
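    # # PROBLEM HERE: predictions_df only holds the ids of the input movies, and those ids were just
    # # filtered out of recommendations_df, so this inner join on "id" returns no rows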
    # recommendations_df = recommendations_df.join(predictions_df, on="id", how="inner").select("title", "genres", "prediction", "rating")
    # print("recommendations df (Join with the predictions dataframe to get the rating of the recommended movies)")
    # recommendations_df.show(truncate=False)

    # # Sort the recommended movies by their rating and select the top 10
    # top_recommendations = recommendations_df.sort(col("rating").desc()).limit(10)

    # print("top recommendations df")
    # top_recommendations.show(truncate=False)

    # return top_recommendations.select("title", "genres").rdd.map(lambda x: (x[0], x[1])).collect()