1. Imports

In [4]:
import os

from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    array_contains,
    col,
    collect_list,
    concat_ws,
    explode,
    from_json,
    lit,
    split,
)
from pyspark.sql.functions import sum as _sum
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType

def init_spark():
    """Build and return the Spark session used throughout the notebook.

    The session is requested through ``SparkSession.builder`` with the
    application name "Python Spark SQL basic example" and the single
    configuration entry ``spark.some.config.option = some-value``.

    Returns:
        Whatever ``getOrCreate()`` yields for that builder configuration.
    """
    builder = SparkSession.builder
    builder = builder.appName("Python Spark SQL basic example")
    builder = builder.config("spark.some.config.option", "some-value")
    return builder.getOrCreate()

2. Data Processing

In [5]:
# Obtain the Spark session and read the raw movie metadata (header row, inferred column types).
spark = init_spark()
file = spark.read.csv(os.path.join("data/movies_metadata.csv"), header=True, inferSchema=True)

# Schema of one entry in the "genres" text column: an object with an integer id and a name.
genres_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
])

# Single pipeline from the raw file to one row per movie with a comma-separated genre string.
movies_df = (
    file
    # Keep only the columns used downstream and drop exact duplicate rows.
    .select("id", "title", "genres", "original_title")
    .distinct()
    # Parse the "genres" text into an array of {id, name} structs.
    .withColumn("genres", from_json(col("genres"), ArrayType(genres_schema)))
    # One row per (movie, genre), keeping just the genre name; rows with any null are discarded.
    .select("id", "original_title", "title", explode(col("genres")).alias("genre"))
    .select("id", "original_title", "title", col("genre.name").alias("genre_name"))
    .dropna()
    # Gather the genre names back into a list per movie ...
    .groupBy("id", "title", "original_title")
    .agg(collect_list("genre_name").alias("genres"))
    # ... and flatten that list into a single comma-separated string.
    .withColumn("genres", concat_ws(",", "genres"))
)

3. Recommender System (Content-Based)

In [9]:
def recommend_movies_weight_matrix(movies_df, moviesInput, inputRatings, numberOfRecommendation):
    """Recommend movies whose genres best match a user's rated input movies.

    Every movie is one-hot encoded by genre. The input movies' genre vectors
    are scaled by their ratings and summed into a user profile, which is
    normalized so its entries add up to 1. Each remaining movie is scored by
    the sum of its genre indicators multiplied by the matching profile
    weights; the highest-scoring movies are shown and returned. For
    comparison, the input movies themselves are scored (without their
    ratings) and shown as well.

    Args:
        movies_df: DataFrame with at least the columns ``title`` and
            ``genres`` (a comma-separated string of genre names).
        moviesInput: List of movie titles the user rated.
        inputRatings: Ratings aligned by position with ``moviesInput``.
            Ratings beyond ``len(moviesInput)`` are ignored.
        numberOfRecommendation: Maximum number of rows to return.

    Returns:
        DataFrame of at most ``numberOfRecommendation`` movies (input titles
        excluded) holding the weighted genre columns and a ``sum`` score
        column, ordered by ``sum`` descending.

    Raises:
        ValueError: If fewer ratings than titles are supplied, or if the
            rated input movies produce a zero total weight (for example when
            none of the titles exist in ``movies_df``).
    """
    if len(inputRatings) < len(moviesInput):
        raise ValueError("inputRatings must contain one rating per title in moviesInput")

    # Distinct genre names across the whole catalogue.
    genres = movies_df.select("genres").rdd.flatMap(lambda row: row[0].split(',')).distinct().collect()

    # One indicator column per genre. Matching on the split tokens (instead of a
    # substring test) keeps a genre from matching inside a longer genre name.
    genre_tokens = split(col("genres"), ",")
    for genre in genres:
        movies_df = movies_df.withColumn(genre, array_contains(genre_tokens, genre).cast("int"))

    is_input_movie = col("title").isin(moviesInput)
    original_input_df = movies_df.filter(is_input_movie)

    # Rating multiplier per title. A title listed more than once has its
    # ratings multiplied together, as repeated in-place scaling would do.
    rating_by_title = {}
    for movie, rating in zip(moviesInput, inputRatings):
        rating_by_title[movie] = rating_by_title.get(movie, 1) * rating

    # Build the rating-weighted genre totals on the driver: only the handful of
    # input rows is collected, in a single Spark job.
    genre_totals = dict.fromkeys(genres, 0)
    for row in original_input_df.select("title", *genres).collect():
        multiplier = rating_by_title[row["title"]]
        for genre in genres:
            genre_totals[genre] += (row[genre] or 0) * multiplier

    total_weight = sum(genre_totals.values())
    if not total_weight:
        raise ValueError(
            "Cannot build a genre profile: the input movies matched no titles "
            "in movies_df or their rated genre weights sum to zero"
        )

    # Normalize so the profile weights add up to 1.
    genre_weights = {genre: total / total_weight for genre, total in genre_totals.items()}

    # Replace each indicator with indicator * profile weight, both for the
    # candidate movies (input titles excluded) and for the unrated input movies.
    multiplied_df = movies_df.filter(~is_input_movie)
    input_multiplied_df = original_input_df
    for genre in genres:
        weighted_genre = col(genre) * genre_weights[genre]
        multiplied_df = multiplied_df.withColumn(genre, weighted_genre)
        input_multiplied_df = input_multiplied_df.withColumn(genre, weighted_genre)

    print("Recommended Movies")
    # Score = sum of the weighted genre columns; keep the top numberOfRecommendation rows.
    final_weighted_matrix = multiplied_df.withColumn('sum', sum(multiplied_df[genre] for genre in genres))
    final_weighted_matrix = final_weighted_matrix.orderBy("sum", ascending=False).limit(numberOfRecommendation)
    final_weighted_matrix.show(truncate=False)

    # Evaluation aid: scores of the input movies themselves (ratings not applied),
    # to compare against the scores of the suggested movies.
    print("compare original input without user ratings")
    final_original_inputs_weighted_matrix = input_multiplied_df.withColumn(
        'sum', sum(input_multiplied_df[genre] for genre in genres)
    )
    final_original_inputs_weighted_matrix = final_original_inputs_weighted_matrix.orderBy("sum", ascending=False)
    final_original_inputs_weighted_matrix.show(truncate=False)

    return final_weighted_matrix

# Titles the user rated, with the rating for each title at the same position.
moviesInput = ["Brother Bear", "Toy Story", "The Dark Knight"]
ratings = [10, 9, 1]

# Upper bound on how many recommendations to keep.
numberOfRecommendation = 100

# Run the content-based recommender and keep its result for the next section.
recommended_movies_weight_matrix = recommend_movies_weight_matrix(
    movies_df=movies_df,
    moviesInput=moviesInput,
    inputRatings=ratings,
    numberOfRecommendation=numberOfRecommendation,
)



Recommended Movies
+------+--------------------------------------------------------------------------------------------------------------------------+------------------------------------+-------------------------------------------------------------------------+-------+-------------------+-------------------+-------------------+---------------+-------+-------+-------+-----------+-------+-----+--------+------------------+-------------------+---+------+-------------------+-------+-----+------------------+------------------+
|id    |title                                                                                                                     |original_title                      |genres                                                                   |Mystery|Comedy             |Action             |Adventure          |Science Fiction|Western|Fantasy|Romance|Documentary|Foreign|Music|TV Movie|Animation         |Drama              |War|Horror|Thriller           |History|Crime|Fami

In [11]:
def recommend_movies_kmeans(movies_df, likedMoviesInput, recommended_movies_weight_matrix,
                            numberOfRecommendation, k=15, seed=None):
    """Check weight-matrix recommendations against KMeans genre clusters.

    Movies are one-hot encoded by genre and clustered with KMeans. The
    clusters containing the liked movies are looked up, and the
    recommendations from ``recommended_movies_weight_matrix`` that fall into
    one of those clusters are shown. The printed metric is the number of such
    recommendations divided by ``numberOfRecommendation``.

    Args:
        movies_df: DataFrame with at least the columns ``id``, ``title`` and
            ``genres`` (a comma-separated string of genre names).
        likedMoviesInput: Titles of movies assumed to be liked by the user.
        recommended_movies_weight_matrix: DataFrame with an ``id`` column
            listing the movies recommended by the weight-matrix approach.
        numberOfRecommendation: Number of recommendations that were
            requested; used as the denominator of the printed metric.
        k: Number of KMeans clusters.
        seed: Optional KMeans seed; when ``None`` the estimator's own default
            is used.

    Returns:
        DataFrame of the weight-matrix recommendations that share a cluster
        with at least one liked movie, including ``features`` and
        ``prediction`` columns.

    Raises:
        ValueError: If ``numberOfRecommendation`` is not positive.
    """
    if numberOfRecommendation <= 0:
        raise ValueError("numberOfRecommendation must be a positive integer")

    # Distinct genre names across the whole catalogue.
    genres = movies_df.select("genres").rdd.flatMap(lambda row: row[0].split(',')).distinct().collect()

    # One indicator column per genre. Matching on the split tokens (instead of a
    # substring test) keeps a genre from matching inside a longer genre name.
    genre_tokens = split(col("genres"), ",")
    for genre in genres:
        movies_df = movies_df.withColumn(genre, array_contains(genre_tokens, genre).cast("int"))

    # Assemble the indicator columns into the feature vector KMeans trains on.
    assembler = VectorAssembler(inputCols=genres, outputCol="features")
    movies_df = assembler.transform(movies_df)

    # Fit the clustering model and label every movie with its cluster.
    kmeans = KMeans(k=k) if seed is None else KMeans(k=k, seed=seed)
    model = kmeans.fit(movies_df.select("features"))
    clustered = model.transform(movies_df)

    # Clusters that contain at least one liked movie.
    is_liked_movie = col('title').isin(likedMoviesInput)
    user_predictions = [
        row['prediction']
        for row in clustered.filter(is_liked_movie).select('prediction').distinct().collect()
    ]

    # Movies from those clusters, with the liked movies themselves removed.
    # The function's own argument is used here rather than a notebook global.
    cluster_movies_df = clustered.filter(col('prediction').isin(user_predictions)).filter(~is_liked_movie)

    # Keep only the movies that the weight-matrix recommender also suggested.
    list_of_recommended_movies = recommended_movies_weight_matrix.select('id').rdd.flatMap(lambda x: x).collect()
    recommended_movies_df = cluster_movies_df.filter(col('id').isin(list_of_recommended_movies))
    recommended_movies_df.show(truncate=False)

    print("Performance metric (accuracy): " + str(recommended_movies_df.count() / numberOfRecommendation))

    return recommended_movies_df
   

# Titles treated as "liked" when looking up their clusters.
liked_moviesInput =  ["Brother Bear", "Toy Story"]

# Cross-check the weight-matrix recommendations against the KMeans clusters.
recommended_movies = recommend_movies_kmeans(
    movies_df=movies_df,
    likedMoviesInput=liked_moviesInput,
    recommended_movies_weight_matrix=recommended_movies_weight_matrix,
    numberOfRecommendation=numberOfRecommendation,
)


+------+--------------------------------------------------------------------------------------------------------------------------+-------------------------------------------+-------------------------------------------------+-------+------+------+---------+---------------+-------+-------+-------+-----------+-------+-----+--------+---------+-----+---+------+--------+-------+-----+------+-----------------------------------------------+----------+
|id    |title                                                                                                                     |original_title                             |genres                                           |Mystery|Comedy|Action|Adventure|Science Fiction|Western|Fantasy|Romance|Documentary|Foreign|Music|TV Movie|Animation|Drama|War|Horror|Thriller|History|Crime|Family|features                                       |prediction|
+------+----------------------------------------------------------------------------------------------