In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType
import sys

In [45]:
def computeCosineSimilarity(spark, data):
    """Score each (movie1, movie2) group by the cosine similarity of its ratings.

    Args:
        spark: Accepted so existing call sites keep working; not used in the body.
        data: DataFrame exposing the columns movie1, movie2, rating1 and rating2.

    Returns:
        DataFrame with the columns movie1, movie2, score and numPairs, where
        score is sum(rating1*rating2) / (sqrt(sum(rating1^2)) * sqrt(sum(rating2^2)))
        per group (0 when that denominator is 0) and numPairs is the count of
        "xy" values in the group.
    """
    rating1 = func.col("rating1")
    rating2 = func.col("rating2")

    # Row-level products that feed the dot product and the two vector norms.
    withProducts = (
        data
        .withColumn("xx", rating1 * rating1)
        .withColumn("yy", rating2 * rating2)
        .withColumn("xy", rating1 * rating2)
    )

    dotProduct = func.sum(func.col("xy"))
    normProduct = func.sqrt(func.sum(func.col("xx"))) * func.sqrt(func.sum(func.col("yy")))

    # One output row per movie pair.
    perPair = withProducts.groupby("movie1", "movie2").agg(
        dotProduct.alias("numerator"),
        normProduct.alias("denominator"),
        func.count(func.col("xy")).alias("numPairs"),
    )

    # Guard the division so a zero denominator yields a score of 0.
    cosine = func.when(
        func.col("denominator") != 0,
        func.col("numerator") / func.col("denominator"),
    ).otherwise(0)

    return perPair.withColumn("score", cosine).select("movie1", "movie2", "score", "numPairs")

In [3]:
# Get movie name by given movie id
def getMovieName(movieNames, movieId):
    """Return the movieTitle of the first row whose movieID equals movieId.

    Args:
        movieNames: DataFrame exposing the columns movieID and movieTitle.
        movieId: Id to look up.

    Returns:
        The movieTitle value of the first matching row.

    Raises:
        IndexError: If no row matches movieId. This is the same exception type
            that indexing an empty collect() result would raise, but with a
            message naming the missing id.
    """
    # first() fetches a single row (or None) instead of collecting every match.
    match = (
        movieNames
        .filter(func.col("movieID") == movieId)
        .select("movieTitle")
        .first()
    )
    if match is None:
        raise IndexError("no movie found for movieID {!r}".format(movieId))

    return match[0]

In [4]:
# Create (or reuse) the Spark session for this notebook, running with master "local[*]".
spark = (
    SparkSession.builder
    .appName("MovieSimilarities")
    .master("local[*]")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/13 13:58:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Schema applied when reading the movie id -> title file (all fields nullable).
movieNamesSchema = StructType([
    StructField("movieID", IntegerType(), True),
    StructField("movieTitle", StringType(), True),
])

# Schema applied when reading the ratings file (all fields nullable).
moviesSchema = StructType([
    StructField("userID", IntegerType(), True),
    StructField("movieID", IntegerType(), True),
    StructField("rating", IntegerType(), True),
    StructField("timestamp", LongType(), True),
])

In [11]:
# Movie id -> title lookup, read from the "|"-separated u.item file
# using the ISO-8859-1 charset and movieNamesSchema.
moviesNames = (
    spark.read
    .option("sep", "|")
    .option("charset", "ISO-8859-1")
    .schema(movieNamesSchema)
    .csv("resources/ml-100k/u.item")
)

# Individual ratings, read from the tab-separated u.data file using moviesSchema.
movies = (
    spark.read
    .option("sep", "\t")
    .schema(moviesSchema)
    .csv("resources/ml-100k/u.data")
)


In [14]:
# Drop the timestamp; only these three columns are used by the pairing step.
# NOTE(review): these names differ in case from moviesSchema (userID/movieID) —
# presumably resolved by Spark's case-insensitive column matching; confirm if
# spark.sql.caseSensitive is ever enabled.
ratingColumns = ["userId", "movieId", "rating"]
ratings = movies.select(*ratingColumns)

In [40]:
# Pair up every two movies rated by the same user via a self-join of ratings.
sameUser = func.col("ratings1.userId") == func.col("ratings2.userId")
# Strict "<" keeps a single ordering of each pair and excludes a movie paired with itself.
firstIdIsLower = func.col("ratings1.movieId") < func.col("ratings2.movieId")

moviePairs = (
    ratings.alias("ratings1")
    .join(ratings.alias("ratings2"), sameUser & firstIdIsLower)
    .select(
        func.col("ratings1.movieId").alias("movie1"),
        func.col("ratings2.movieId").alias("movie2"),
        func.col("ratings1.rating").alias("rating1"),
        func.col("ratings2.rating").alias("rating2"),
    )
)
    

In [39]:
# import pandas as pd
# df = pd.read_csv("resources/ml-100k/u.data", sep="\t", header=0, names=["userID", "movieID", "rating", "timestamp"])
# df = df.merge(df, on="userID", how="inner")
# df[df["movieID_x"] < df["movieID_y"]]

In [46]:
# Build the per-pair similarity table, then mark it for caching so later
# queries against it can reuse the computed result.
moviePairSimilarities = computeCosineSimilarity(spark, moviePairs)
moviePairSimilarities = moviePairSimilarities.cache()

In [69]:
# Movie to find recommendations for, and the quality bar a pair must clear.
movieID = 181
scoreThreshold = 0.97
coOccurrenceThreshold = 50.0

# The report runs unconditionally: nothing here reads sys.argv, so it must not
# depend on how many arguments the process happened to be launched with.

# Keep pairs that involve movieID and are "good" per the thresholds above.
involvesTarget = (func.col("movie1") == movieID) | (func.col("movie2") == movieID)
isGoodMatch = (func.col("score") > scoreThreshold) & (func.col("numPairs") > coOccurrenceThreshold)
filteredResults = moviePairSimilarities.filter(involvesTarget & isGoodMatch)

# Sort by quality score and keep at most ten rows.
results = filteredResults.sort(func.col("score").desc()).take(10)

print("Top 10 Similar movie for " + getMovieName(moviesNames, movieID))

for result in results:
    # Each row holds the target on one side; report the movie on the other side.
    similarMovieID = result.movie2 if result.movie1 == movieID else result.movie1

    print(getMovieName(moviesNames, similarMovieID) + "\tscore: "
          + str(result.score) + "\tstrength: " + str(result.numPairs))

                                                                                

Top 10 Similar movie for Return of the Jedi (1983)
Empire Strikes Back, The (1980)	score: 0.9872106138566691	strength: 317
Star Wars (1977)	score: 0.9857230861253026	strength: 480
Raiders of the Lost Ark (1981)	score: 0.9756955197948429	strength: 342
Wallace & Gromit: The Best of Aardman Animation (1996)	score: 0.9746282945069386	strength: 58
Glory (1989)	score: 0.9734820301402775	strength: 145
Some Kind of Wonderful (1987)	score: 0.9718583446658893	strength: 51
Indiana Jones and the Last Crusade (1989)	score: 0.9718232061180238	strength: 283
