Load datasets
-----------------

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, IntegerType
from pyspark.sql.functions import col, udf

spark = SparkSession.builder.appName("Recommendation ALS").getOrCreate()

# do something to prove it works
movies_df = spark.read.option("header", "true").csv("data/movies.csv", inferSchema=True)
links_df = spark.read.option("header", "true").csv("data/links.csv", inferSchema=True).cache()
movies_df = movies_df.join(links_df, on = ['movieId']).cache()
ratings_df = spark.read.option("header", "true").csv("data/ratings.csv", inferSchema=True).cache()
tags_df = spark.read.option("header", "true").csv("data/tags.csv", inferSchema=True).cache()

genresList = ["Crime", "Romance", "Thriller", "Adventure", "Drama", "War", "Documentary", "Fantasy", "Mystery", \
                  "Musical", "Animation", "Film-Noir", "(no genres listed)", "IMAX", "Horror", "Western", \
                  "Comedy", "Children", "Action", "Sci-Fi"]

udf_parse_genres = udf(lambda str: setGenresMatrix(str), ArrayType(IntegerType()))

def setGenresMatrix(genres):
    movieGenresMatrix = []
    movieGenresList = genres.split('|')
    for x in genresList:
        if (x in movieGenresList):
            movieGenresMatrix.append(1)
        else:
            movieGenresMatrix.append(0) 
    return movieGenresMatrix


movies_df = movies_df.withColumn("genresMatrix", udf_parse_genres(col("genres")))
movies_df.show()

+-------+--------------------+--------------------+------+------+--------------------+
|movieId|               title|              genres|imdbId|tmdbId|        genresMatrix|
+-------+--------------------+--------------------+------+------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|114709|   862|[0, 0, 0, 1, 0, 0...|
|      2|      Jumanji (1995)|Adventure|Childre...|113497|  8844|[0, 0, 0, 1, 0, 0...|
|      3|Grumpier Old Men ...|      Comedy|Romance|113228| 15602|[0, 1, 0, 0, 0, 0...|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|114885| 31357|[0, 1, 0, 0, 1, 0...|
|      5|Father of the Bri...|              Comedy|113041| 11862|[0, 0, 0, 0, 0, 0...|
|      6|         Heat (1995)|Action|Crime|Thri...|113277|   949|[1, 0, 1, 0, 0, 0...|
|      7|      Sabrina (1995)|      Comedy|Romance|114319| 11860|[0, 1, 0, 0, 0, 0...|
|      8| Tom and Huck (1995)|  Adventure|Children|112302| 45325|[0, 0, 0, 1, 0, 0...|
|      9| Sudden Death (1995)|             

Compute the item feature vector
------

In [2]:
from pyspark.sql.functions import log10
from pyspark.sql.functions import col
import math

tf = tags_df.groupBy(["movieId", "tag"]).count().selectExpr("movieId", "tag","count AS tag_count_tf")

tags_distinct = tags_df.selectExpr("movieId", "tag").dropDuplicates()
df = tags_distinct.groupBy("tag").count().selectExpr("tag", "count AS tag_count_df")
idf = math.log10(tags_df.select("movieId").distinct().count())
df = df.withColumn("idf", idf - log10("tag_count_df"))

tf = tf.join(df, on = "tag", how = "left")
tf = tf.withColumn("tf-idf", col("tag_count_tf") * col("idf"))
# show TF-IDF values for each movie
# tf.select("movieId", "tag", "tf-idf").show()

Calculate unit length vector of TF-IDF for normalization
------

In [3]:
from pyspark.sql.functions import col
from pyspark.sql.functions import sqrt

vect_len = tf.select("movieId","tf-idf")
vect_len = vect_len.withColumn("tf-idf-sq", col("tf-idf")**2)
vect_len = vect_len.groupby("movieId").sum().withColumnRenamed("sum(tf-idf)", "tf-idf-sum")\
    .withColumnRenamed("sum(tf-idf-sq)", "tf-idf-sq-sum")
vect_len = vect_len.withColumn("vect_length", sqrt("tf-idf-sq-sum"))
tf = tf.join(vect_len,on = "movieId", how = "left")
tf = tf.withColumn("tag_vec", col("tf-idf")/col("vect_length"))

# display the feature unit length vector of each movie: 'tag_vec'
# tf.filter(tf["movieId"] == 60756).select("movieId","tag","tf-idf","vect_length", "tag_vec").show()

Let’s implement the same and calculate user profile for each user.

In [4]:
from pyspark.sql.functions import lit

ratings_filter = ratings_df.filter(ratings_df["rating"] > 3)

#enter user ID for analysis
userId = 65
user_data= ratings_filter.filter(ratings_filter["userId"] == userId)
user_data = tf.join(user_data, on = "movieId", how = "inner")

user_tag_pref = user_data.groupby("tag").sum().withColumnRenamed("sum(tag_vec)", "tag_pref")\
    .select("tag","tag_pref")
user_tag_pref = user_tag_pref.withColumn("user", lit(userId))
user_tag_pref.filter(user_tag_pref["tag"] == "Boxing story").show()

+------------+------------------+----+
|         tag|          tag_pref|user|
+------------+------------------+----+
|Boxing story|0.5954367951274172|  65|
+------------+------------------+----+



Step 4. Compute the cosine similarities and predict item ratings
--------

In [8]:
from pyspark.sql.functions import col
from pyspark.sql import functions as F
import math

movieId = 123

tf_movies = tf.filter(tf["movieId"] == movieId)
print(tf.count())

tag_merge = tf_movies.join(user_tag_pref, on = "tag", how = "left")
tag_merge.fillna({"tag_pref": 0})
tag_merge.withColumn("tag_value", col("tag_vec") * col("tag_pref"))


tag_merge.show()
tag_merge.agg(F.sum("tag_vec")).show()

# tag_vec_val = math.sqrt(tag_merge.agg(F.sum("tag_vec")))
# print("Movie id {} tag_vec {}".format(movieId[0], tag_vec_val))
                       
# tag_vec_val = np.sqrt(np.sum(np.square(tag_merge['tag_vec']), axis=0))
# tag_pref_val = np.sqrt(np.sum(np.square(user_tag_pref_all['tag_pref']), axis=0))
        
# tag_merge_final = tag_merge.groupby(['user','movieId'])[['tag_value']]\
#                                    .sum()\
#                                    .rename(columns = {'tag_value': 'rating'})\
#                                    .reset_index()
        
# tag_merge_final['rating']=tag_merge_final['rating']/(tag_vec_val*tag_pref_val)
        
# tag_merge_all = tag_merge_all.append(tag_merge_final, ignore_index=True)

3579
+---+-------+------------+------------+---+------+------------+----------+-------------+-----------+-------+--------+----+
|tag|movieId|tag_count_tf|tag_count_df|idf|tf-idf|sum(movieId)|tf-idf-sum|tf-idf-sq-sum|vect_length|tag_vec|tag_pref|user|
+---+-------+------------+------------+---+------+------------+----------+-------------+-----------+-------+--------+----+
+---+-------+------------+------------+---+------+------------+----------+-------------+-----------+-------+--------+----+

+------------+
|sum(tag_vec)|
+------------+
|        null|
+------------+



References
--------------

* [Content Based Recommender System in Python](https://medium.com/@tomar.ankur287/content-based-recommender-system-in-python-2e8e94b16b9e)

* [Data Science Series: Content-based Recommender System using Azure Databricks](https://visualbi.com/blogs/business-intelligence/data-science/data-science-series-content-based-recommender-system-using-azure-databricks/)