Load datasets
------

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

spark = SparkSession.builder.appName("Recommendation ALS").getOrCreate()

# do something to prove it works
movies_df = spark.read.option("header", "true").csv("data/movies.csv", inferSchema=True)
links_df = spark.read.option("header", "true").csv("data/links.csv", inferSchema=True)
movies_df = movies_df.join(links_df, on = ['movieId'])
ratings_df = spark.read.option("header", "true").csv("data/ratings.csv", inferSchema=True)
tags_df = spark.read.option("header", "true").csv("data/tags.csv", inferSchema=True)

movies_df.show()

+-------+--------------------+--------------------+------+------+
|movieId|               title|              genres|imdbId|tmdbId|
+-------+--------------------+--------------------+------+------+
|      1|    Toy Story (1995)|Adventure|Animati...|114709|   862|
|      2|      Jumanji (1995)|Adventure|Childre...|113497|  8844|
|      3|Grumpier Old Men ...|      Comedy|Romance|113228| 15602|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|114885| 31357|
|      5|Father of the Bri...|              Comedy|113041| 11862|
|      6|         Heat (1995)|Action|Crime|Thri...|113277|   949|
|      7|      Sabrina (1995)|      Comedy|Romance|114319| 11860|
|      8| Tom and Huck (1995)|  Adventure|Children|112302| 45325|
|      9| Sudden Death (1995)|              Action|114576|  9091|
|     10|    GoldenEye (1995)|Action|Adventure|...|113189|   710|
|     11|American Presiden...|Comedy|Drama|Romance|112346|  9087|
|     12|Dracula: Dead and...|       Comedy|Horror|112896| 12110|
|     13| 

Basic Recommender System
----

In [17]:
from pyspark.sql.functions import mean
from pyspark.sql.functions import col
from pyspark.sql.functions import count

group_movies = ratings_df.groupBy("movieId").agg(mean("rating").alias("average_rating"), \
                                                 count("rating").alias("count_rating"))
C = group_movies.agg(mean("average_rating")).collect()[0][0]
group_movies = group_movies.filter(group_movies["count_rating"] > 10)

group_movies = group_movies.withColumn("weighted_average", col("count_rating")*col("average_rating") \
                                       / (col("count_rating") + 10) + 10*C / (col("count_rating") + 10))
group_movies = group_movies.select("movieId", "weighted_average").orderBy("weighted_average", \
                                                                          ascending=False).limit(10)
trend_movies = group_movies.join(movies_df, on="movieId").select("movieId", "title", col("weighted_average")\
                                                                 .alias("weight"))
trend_movies_list = [list(row) for row in trend_movies.collect()]
for movie in trend_movies_list:
    print("Movie: {}".format(movie[0]))

Movie: 50
Movie: 260
Movie: 318
Movie: 527
Movie: 750
Movie: 858
Movie: 1213
Movie: 1221
Movie: 2959
Movie: 58559


Trend movies for genres
---------------

In [33]:
from pyspark.sql.functions import arrays_zip, col, explode, udf
from pyspark.sql.types import ArrayType, StringType

def parse_genres(str):
    return str.split("|")

udf_parse_genres = udf(lambda str: parse_genres(str), ArrayType(StringType()))
new_movies_df = movies_df.select("movieId", "title", udf_parse_genres("genres").alias("genre"))
new_movies_df = new_movies_df.withColumn("genre", explode("genre"))

genres = ["Crime", "Romance", "Thriller", "Adventure", "Drama", "War", "Documentary", "Fantasy", "Mystery", \
         "Musical", "Animation", "Film-Noir", "(no genres listed)", "IMAX", "Horror", "Western", \
         "Comedy", "Children", "Action", "Sci-Fi"]

genre = "Crime"
genre_movies_df = new_movies_df.filter(new_movies_df["genre"] == genre).select("movieId", "title")

genre_ratings_df = ratings_df.join(genre_movies_df, on = "movieId", how = "inner")

genre_group_movies = genre_ratings_df.groupBy("movieId").agg(mean("rating").alias("average_rating"), \
                                                 count("rating").alias("count_rating"))
genre_C = genre_group_movies.agg(mean("average_rating")).collect()[0][0]
genre_group_movies = genre_group_movies.filter(genre_group_movies["count_rating"] > 10)

genre_group_movies = genre_group_movies.withColumn("weighted_average", col("count_rating")*col("average_rating") \
                                       / (col("count_rating") + 10) + 10*C / (col("count_rating") + 10))
genre_group_movies = genre_group_movies.select("movieId", "weighted_average").\
    orderBy("weighted_average", ascending=False).limit(10)
genre_trend_movies = genre_group_movies.join(genre_movies_df, on = "movieId", how = "inner").\
    select("movieId", "title", "weighted_average")
genre_trend_movies.show()

+-------+--------------------+------------------+
|movieId|               title|  weighted_average|
+-------+--------------------+------------------+
|     50|Usual Suspects, T...| 4.192170480131353|
|    296| Pulp Fiction (1994)|  4.16758511907921|
|    318|Shawshank Redempt...| 4.393347042043149|
|    858|Godfather, The (1...|4.2382400136045035|
|   1213|   Goodfellas (1990)|  4.17738590255963|
|   1221|Godfather: Part I...| 4.187945919051148|
|   2329|American History ...| 4.148377573727408|
|   2959|   Fight Club (1999)|  4.22861615240399|
|  48516|Departed, The (2006)|4.1677306217787145|
|  58559|Dark Knight, The ...| 4.176883539296286|
+-------+--------------------+------------------+



References
-----

[Beginner’s Recommendation Systems with Python](https://towardsdatascience.com/beginners-recommendation-systems-with-python-ee1b08d2efb6)