Load datasets
-----------------

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, IntegerType
from pyspark.sql.functions import col, count, mean, udf, UserDefinedFunction
import re

genresList = ["Crime", "Romance", "Thriller", "Adventure", "Drama", "War", "Documentary", "Fantasy", "Mystery", \
                  "Musical", "Animation", "Film-Noir", "(no genres listed)", "IMAX", "Horror", "Western", \
                  "Comedy", "Children", "Action", "Sci-Fi"]

spark = SparkSession.builder.appName("Recommendation ALS").config("spark.executor.memory", "3g")\
    .config("spark.driver.cores", "4").getOrCreate()

# do something to prove it works
movies_df = spark.read.option("header", "true").csv("data/movies.csv", inferSchema=True)
links_df = spark.read.option("header", "true").csv("data/links.csv", inferSchema=True).cache()
movies_df = movies_df.join(links_df, on = ['movieId']).cache()
ratings_df = spark.read.option("header", "true").csv("data/ratings.csv", inferSchema=True).cache()
tags_df = spark.read.option("header", "true").csv("data/tags.csv", inferSchema=True).cache()

def setGenresMatrix(genres):
    movieGenresMatrix = []
    movieGenresList = genres.split('|')
    for x in genresList:
        if (x in movieGenresList):
            movieGenresMatrix.append(1)
        else:
            movieGenresMatrix.append(0) 
    return movieGenresMatrix

udf_parse_genres = UserDefinedFunction(lambda str: setGenresMatrix(str), ArrayType(IntegerType()))


movies_df = movies_df.withColumn("genresMatrix", udf_parse_genres(col("genres")))
ratings_df = ratings_df.groupBy("movieId").agg(mean("rating").alias("mean_rating"), count("rating")\
                                               .alias("count_rating"))
movies_df = movies_df.join(ratings_df, on = "movieId").select("movieId", "title", "genres", "genresMatrix", \
                                                              "mean_rating", "count_rating")

def getYear(title):
    result = re.search(r'\(\d{4}\)', title)
    if result:
        found = result.group(0).strip('(').strip(')')
    else: 
        found = 0
    return int(found)

udf_parse_year = udf(lambda str: getYear(str), IntegerType())
movies_df = movies_df.withColumn("year", udf_parse_year(col("title")))

from pyspark.sql.functions import regexp_replace
movies_df = movies_df.withColumn("title", regexp_replace("title", "\(\d{4}\)", ""))

In [2]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.sql.functions import concat, collect_set, udf, when
from pyspark.sql.types import ArrayType, StringType
from functools import reduce

# Tokenize text
tokenizer = Tokenizer(inputCol='tag', outputCol='tags_token')
df_words_token = tokenizer.transform(tags_df).select('movieId', 'tags_token')

# Remove stop words
remover = StopWordsRemover(inputCol='tags_token', outputCol='tags_clean')
df_words_no_stopw = remover.transform(df_words_token).select("movieId", "tags_clean")

def fudf(val):
    return reduce (lambda x, y:x+y, val)

flattenUdf = udf(fudf, ArrayType(StringType()))

df_words_no_stopw = df_words_no_stopw.groupBy("movieId").agg(collect_set("tags_clean")).select("movieId", \
                        flattenUdf("collect_set(tags_clean)").alias("tags_clean"))

movies_df = movies_df.join(df_words_no_stopw, on="movieId", how="left").cache()

Compute the item feature vector
------

In [3]:
genresSimilarityWeight = 0.8
tagsSimilarityWeight = 2
titleSimilarityWeight = 2
yearDistanceWeight = 0.05
ratingAvgWeight = 0.2

Check similarity of movies
------

Step 4. Compute the cosine similarities and predict item ratings
--------

In [4]:
movieId = 6377

basisGenres = movies_df.filter(movies_df['movieId'] == movieId).select("genresMatrix").collect()[0][0]
basisYear = movies_df.filter(movies_df['movieId'] == movieId).select('year').collect()[0][0]
basisRatingAvg = movies_df.filter(movies_df['movieId'] == movieId).select('mean_rating').collect()[0][0]

from scipy.spatial.distance import cosine
from pyspark.sql.functions import col, udf, abs, lit
from pyspark.sql.types import DoubleType, FloatType

def consineFunc(genresVal):
    return float(cosine(basisGenres, genresVal))

consineUdf = udf(consineFunc, FloatType())

tagsPandaDf = df_words_no_stopw.toPandas()
tagsDict = {}
for index, x in tagsPandaDf.iterrows():
    wordlist = x['tags_clean']
    tempMovieId = x['movieId']
    for y in wordlist:
        if tempMovieId in tagsDict:
            # if y not in tagsDict[movieId]:  # Switched off (we will get a non unique list)
            tagsDict[tempMovieId].append(y)
        else:
            tagsDict[tempMovieId] = [y]

titleWordsDict = {}
titlePandaDf = movies_df.toPandas()

for index, x in titlePandaDf.iterrows():
    wordlist = str(x['title']).lower().split(' ')
    tempMovieId = x['movieId']
    for y in wordlist:
        if tempMovieId in titleWordsDict:
            titleWordsDict[tempMovieId].append(y)
        else:
            titleWordsDict[tempMovieId] = [y]                
            
def tagsSimilarityFunc(basisMovieID, checkedMovieID, checkType): 
    if checkType == 'tag':
        dictToCheck = tagsDict
    else:
        dictToCheck = titleWordsDict
    counter = 0.0001
    if basisMovieID in dictToCheck: 
        basisTags = dictToCheck[basisMovieID]
        countAllTags = len(basisTags)
        basisTagsDict = {}
        for x in basisTags:
            if x in basisTagsDict:
                basisTagsDict[x] += 1
            else:
                basisTagsDict[x] = 1   
        
        for x in basisTagsDict:
            basisTagsDict[x] = basisTagsDict[x] / countAllTags
    else: return 0.0001
    
    if checkedMovieID in dictToCheck: 
        checkedTags = dictToCheck[checkedMovieID]
        checkedTags = set(checkedTags) # Make the list unique
        checkedTags = list(checkedTags)
        
    else: return 0.0001
    
    for x in basisTagsDict:
        if x in checkedTags: counter += basisTagsDict[x]
    print("Counter {}".format(counter))        
    return counter    

tagsSimilarityUdf = udf(tagsSimilarityFunc, FloatType())

moviesWithSim = movies_df.withColumn("similarity", consineUdf("genresMatrix") * genresSimilarityWeight + \
                                             abs(basisYear - col("year")) / 100 * yearDistanceWeight + \
                                             - tagsSimilarityUdf(lit(int(movieId)), col("movieId"), lit("tag")) * tagsSimilarityWeight + \
                                             - tagsSimilarityUdf(lit(int(movieId)), col("movieId"), lit("title")) * titleSimilarityWeight)

moviesWithSim.sort("similarity", ascending = True).filter(moviesWithSim["movieId"] != movieId).\
    select("movieId", "title", "similarity", "genres", "year").show(100)

+-------+--------------------+-------------------+--------------------+----+
|movieId|               title|         similarity|              genres|year|
+-------+--------------------+-------------------+--------------------+----+
|   8961|   Incredibles, The |-1.2487750704288483|Action|Adventure|...|2004|
|   3114|        Toy Story 2 |-1.2472750704288482|Adventure|Animati...|1999|
|    588|            Aladdin |-1.2437750704288484|Adventure|Animati...|1992|
| 157296|       Finding Dory |-1.2200536095568677|Adventure|Animati...|2016|
|   2085|101 Dalmatians (O...|-1.2055536427497864|Adventure|Animati...|1961|
|   1030|      Pete's Dragon | -1.120733320236206|Adventure|Animati...|1977|
|   2078|   Jungle Book, The | -1.115733320236206|Animation|Childre...|1967|
|   2080| Lady and the Tramp | -1.109733320236206|Animation|Childre...|1955|
| 117887|         Paddington |-1.0939187355041504|     Children|Comedy|2014|
|    616|    Aristocats, The |-1.0829187355041503|  Animation|Children|1970|

References
--------------

* [Content Based Recommender System in Python](https://medium.com/@tomar.ankur287/content-based-recommender-system-in-python-2e8e94b16b9e)

* [Data Science Series: Content-based Recommender System using Azure Databricks](https://visualbi.com/blogs/business-intelligence/data-science/data-science-series-content-based-recommender-system-using-azure-databricks/)

* [Movie Recommendation Algorithm](https://www.kaggle.com/bakostamas/movie-recommendation-algorithm)