In [1]:
# Import the necessary libraries
import pyspark.pandas as ps
#------------------do not touch------------------
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import  Vectors
from pyspark.sql.types import DoubleType
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

from pyspark.sql.functions import sum as spark_sum, count as spark_count
#------------------------------------------------

# TODO: Figure out why we always need to restart the kernel... probably due to an import...




In [2]:
# Spark Initialization + Configuration of UDFs

#Initialize a spark session
spark = SparkSession.builder \
    .appName("recommenderTest") \
    .config("spark.some.config.option", "some-value") \
    .config("spark.executor.memory", "7g") \
    .config("spark.driver.memory", "7g") \
    .config("spark.sql.shuffle.partitions", "32") \
    .config("spark.sql.pivotMaxValues", "20000") \
    .config("spark.master", "local[*]") \
    .config("spark.sql.codegen.wholeStage", "false") \
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

def pearson_average(v):
    sum_nonzero = sum(v)
    count_nonzero = len([e for e in v if e != 0])
    mean = sum_nonzero / count_nonzero
    v2 = [e - mean if e != 0 else 0 for e in v] # now subtract the mean from each non zero element
    return Vectors.dense(v2)

def co_sym (x, y):
    pearson1 = pearson_average(x)
    pearson2 = pearson_average(y)
    return float(pearson1.dot(pearson2)/(Vectors.norm(pearson1,2)*Vectors.norm(pearson2,2)))

dot_udf = udf(co_sym, DoubleType())
spark.udf.register("dot_udf", dot_udf)

23/03/24 03:27:43 WARN Utils: Your hostname, Martin-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.53 instead (on interface en0)
23/03/24 03:27:43 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/24 03:27:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


<function __main__.co_sym(x, y)>

In [3]:
# File Loading

df = spark.read.csv("data/ratings_tiny.csv", header=True, inferSchema=True)
# File FORMAT: userId, movieId, rating, timestamp
df = df.drop("timestamp")

# ==============DF Initialization (to be used later...)==============

df_user_movie_rating = df
df_user_movie_rating.show(10, 10)

# ==============Similarity Matrix==============
# Group by movieId and pivot the userId column
df = df.groupBy("movieId").pivot("userId").agg({"rating": "first"}).fillna(0)

# Assemble the columns into a vector column
assembler = VectorAssembler(inputCols=df.columns[1:], outputCol="features")
df_vector = assembler.transform(df).select('movieId', 'features')
df_vector = df_vector.repartition(10)

# Compute the similarity matrix using the dot product of normalized vectors
similarity_matrix = df_vector.alias("a").crossJoin(df_vector.alias("b")) \
    .where("a.movieId != b.movieId") \
    .selectExpr("a.movieId as movieId", "b.movieId as movieId_1",
                "dot_udf(a.features, b.features) as similarity")

#just show first 10 rows and 10 columns
print("similarity_matrix")
similarity_matrix.show(10, 10)

# Pivot the similarity matrix

# similarity_matrix = similarity_matrix.groupBy("movieId").pivot("movieId_1").agg({"similarity": "first"}).fillna(0)
# similarity_matrix.select(similarity_matrix.columns[:10]).show(truncate=False)

+-------+------+------+
|movieId|userId|rating|
+-------+------+------+
|      1|     1|     1|
|      1|     3|     3|
|      1|     6|     5|
|      1|     9|     5|
|      1|    11|     4|
|      2|     3|     5|
|      2|     4|     4|
|      2|     7|     4|
|      2|    10|     2|
|      2|    11|     1|
+-------+------+------+
only showing top 10 rows

similarity_matrix


                                                                                

+-------+---------+----------+
|movieId|movieId_1|similarity|
+-------+---------+----------+
|      4|        3|-0.6239...|
|      4|        6|-0.2353...|
|      4|        5|0.45873...|
|      4|        1|-0.1024...|
|      4|        2|0.46800...|
|      3|        4|-0.6239...|
|      3|        6|0.50636...|
|      3|        5|-0.2842...|
|      3|        1|0.41403...|
|      3|        2|-0.5262...|
+-------+---------+----------+
only showing top 10 rows



In [4]:
# ==============Prediction==============

# predict ratings for user_movie_ratings df
df_user_movie_rating = df_user_movie_rating.join(similarity_matrix, df_user_movie_rating.movieId == similarity_matrix.movieId, how='left').drop(similarity_matrix.movieId)
df_user_movie_rating = df_user_movie_rating.withColumnRenamed("similarity.movieId", "movie2")



#------------------do not touch------------------
# TODO: figure out this... probably the reason why we need to restart kernel.. must be messing up with other imports...
from pyspark.sql.functions import col, sum
from pyspark.sql.types import FloatType
from pyspark.sql.functions import array, col, sort_array
#------------------------------------------------


def predict_user_rating(user_id, movie_id, similarity_matrix):
    # Filter similarity matrix to include only ratings for the given user and similar movies
    user_ratings = similarity_matrix.filter((col("userId") == user_id) & (col("movieId_1") == movie_id))
    # Sort the ratings by similarity in descending order and select the top 2 most similar movies
    user_ratings = user_ratings.sort(col("similarity").desc()).limit(2)
    # Calculate the predicted rating by computing a weighted average of the user's ratings for similar movies
    user_ratings = user_ratings.withColumn("weighted_rating", (col("rating") * col("similarity")).cast(FloatType()))
    numerator = user_ratings.agg(sum("weighted_rating")).collect()[0][0]
    denominator = user_ratings.agg(sum("similarity")).collect()[0][0]
    
    if denominator != 0:
        predicted_rating = numerator / denominator
    else:
        predicted_rating = None
    return predicted_rating


df_user_movie_rating.show(10)

predicted_rating1 = predict_user_rating(user_id=5, movie_id=1, similarity_matrix=df_user_movie_rating)
predicted_rating2 = predict_user_rating(user_id=5, movie_id=3, similarity_matrix=df_user_movie_rating)
print("predicted_rating1", predicted_rating1)
print("predicted_rating2", predicted_rating2)


                                                                                

+-------+------+------+---------+--------------------+
|movieId|userId|rating|movieId_1|          similarity|
+-------+------+------+---------+--------------------+
|      1|     1|     1|        2|-0.17854212213729673|
|      1|     1|     1|        5|-0.30895719032666236|
|      1|     1|     1|        6|  0.5870395085642741|
|      1|     1|     1|        3| 0.41403933560541256|
|      1|     1|     1|        4|-0.10245014273309601|
|      1|     3|     3|        2|-0.17854212213729673|
|      1|     3|     3|        5|-0.30895719032666236|
|      1|     3|     3|        6|  0.5870395085642741|
|      1|     3|     3|        3| 0.41403933560541256|
|      1|     3|     3|        4|-0.10245014273309601|
+-------+------+------+---------+--------------------+
only showing top 10 rows

predicted_rating1 2.5864068884261053
predicted_rating2 1.720103231368863
