## TODO: add some markdown...

In [1]:
# Import the necessary libraries/packages
import pyspark.pandas as ps

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import  Vectors
from pyspark.sql.types import DoubleType
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

from pyspark.sql.functions import sum as spark_sum, count as spark_count




In [2]:
# Spark Initialization/Setup

spark = SparkSession.builder \
    .appName("recommenderTest") \
    .config("spark.some.config.option", "some-value") \
    .config("spark.executor.memory", "7g") \
    .config("spark.driver.memory", "7g") \
    .config("spark.sql.shuffle.partitions", "32") \
    .config("spark.sql.pivotMaxValues", "20000") \
    .config("spark.master", "local[*]") \
    .config("spark.sql.codegen.wholeStage", "false") \
    .getOrCreate()

spark.sparkContext.setLogLevel("FATAL")


# UDFs Initialization/Setup

# To do: remove this function in some future versions
def pearson_average(v):
    sum_nonzero = sum(v)
    count_nonzero = len([e for e in v if e != 0])
    mean = sum_nonzero / count_nonzero
    v2 = [e - mean if e != 0 else 0 for e in v] # now subtract the mean from each non zero element
    return Vectors.dense(v2)

def co_sym (x, y):
    pearson1 = pearson_average(x)
    pearson2 = pearson_average(y)
    return float(pearson1.dot(pearson2)/(Vectors.norm(pearson1,2)*Vectors.norm(pearson2,2)))

dot_udf = udf(co_sym, DoubleType())
spark.udf.register("dot_udf", dot_udf)

23/03/24 22:06:19 WARN Utils: Your hostname, Martin-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.53 instead (on interface en0)
23/03/24 22:06:19 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/24 22:06:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


<function __main__.co_sym(x, y)>

In [3]:
# Data Loading/Preparation

# File FORMAT: userId, movieId, rating, timestamp
df = spark.read.csv("data/ratings_tiny.csv", header=True, inferSchema=True)
df = df.drop("timestamp")

                                                                                

In [4]:
# Data Modeling

df_user_movie_rating = df # DF to be used for User-Movie-Rating Matrix (will be used later on)

# -- Building Similarity Matrix --
df = df.groupBy("movieId").pivot("userId").agg({"rating": "first"}).fillna(0)
#df = df.sort("movieId")

# Build Vector Columns from DF Matrix
assembler = VectorAssembler(inputCols=df.columns[1:], outputCol="features")
df_vector = assembler.transform(df).select('movieId', 'features')
df_vector = df_vector.repartition(10)

# Compute dot product of normalized vectors to fill data into Similarity Matrix
similarity_matrix = df_vector.alias("a").crossJoin(df_vector.alias("b")) \
    .where("a.movieId != b.movieId") \
    .selectExpr("a.movieId as movieId", "b.movieId as movieId_1",
                "dot_udf(a.features, b.features) as similarity")
similarity_matrix.show(10, 10)

# Build User-Movie-Rating Matrix (where for each user, we have all the movies combinations with the similarity values)
df_user_movie_rating = df_user_movie_rating.join(similarity_matrix, df_user_movie_rating.movieId == similarity_matrix.movieId, how='left').drop(similarity_matrix.movieId)
df_user_movie_rating = df_user_movie_rating.withColumnRenamed("similarity.movieId", "movie2")

                                                                                

+-------+---------+----------+
|movieId|movieId_1|similarity|
+-------+---------+----------+
|      4|        3|-0.6239...|
|      4|        6|-0.2353...|
|      4|        5|0.45873...|
|      4|        1|-0.1024...|
|      4|        2|0.46800...|
|      3|        4|-0.6239...|
|      3|        6|0.50636...|
|      3|        5|-0.2842...|
|      3|        1|0.41403...|
|      3|        2|-0.5262...|
+-------+---------+----------+
only showing top 10 rows



In [5]:
# Data Prediction
from pyspark.sql.functions import col, sum
from pyspark.sql.types import FloatType
from pyspark.sql.functions import array, col, sort_array

def predict_user_rating(user_id, movie_id, similarity_matrix):
    # Filter similarity matrix to include only ratings for the given user and similar movies
    user_ratings = similarity_matrix.filter((col("userId") == user_id) & (col("movieId_1") == movie_id))
    # Sort the ratings by similarity in descending order and select the top 2 most similar movies
    user_ratings = user_ratings.sort(col("similarity").desc()).limit(2)
    # Calculate the predicted rating by computing a weighted average of the user's ratings for similar movies
    user_ratings = user_ratings.withColumn("weighted_rating", (col("rating") * col("similarity")).cast(FloatType()))
    numerator = user_ratings.agg(sum("weighted_rating")).collect()[0][0]
    denominator = user_ratings.agg(sum("similarity")).collect()[0][0]
    
    if denominator != 0:
        predicted_rating = numerator / denominator
    else:
        predicted_rating = None
    return predicted_rating


df_user_movie_rating.show(10)

predicted_rating1 = predict_user_rating(user_id=5, movie_id=1, similarity_matrix=df_user_movie_rating)
predicted_rating2 = predict_user_rating(user_id=5, movie_id=3, similarity_matrix=df_user_movie_rating)
print("predicted_rating1", predicted_rating1)
print("predicted_rating2", predicted_rating2)

spark.close() # Close Spark Session

                                                                                

+-------+------+------+---------+--------------------+
|movieId|userId|rating|movieId_1|          similarity|
+-------+------+------+---------+--------------------+
|      1|     1|     1|        2|-0.17854212213729673|
|      1|     1|     1|        5|-0.30895719032666236|
|      1|     1|     1|        6|  0.5870395085642741|
|      1|     1|     1|        3| 0.41403933560541256|
|      1|     1|     1|        4|-0.10245014273309601|
|      1|     3|     3|        2|-0.17854212213729673|
|      1|     3|     3|        5|-0.30895719032666236|
|      1|     3|     3|        6|  0.5870395085642741|
|      1|     3|     3|        3| 0.41403933560541256|
|      1|     3|     3|        4|-0.10245014273309601|
+-------+------+------+---------+--------------------+
only showing top 10 rows



                                                                                

predicted_rating1 2.5864068884261053
predicted_rating2 1.720103231368863


AttributeError: 'SparkSession' object has no attribute 'close'

In [None]:
#  ==============Prediction V1 (OUR FOCUS RIGHT NOW)==============

# Multiply the similarity matrix by the user-movie rating matrix...
# TODO: This is not working... need to figure out why...
# Useful Tutorial: https://www.ethanrosenthal.com/2015/11/02/intro-to-collaborative-filtering/

# --------------------------- CODE (BELOW) ---------------------------

# from pyspark.ml.feature import PCA
# from pyspark.sql.functions import sum

# # perform the dot product using PCA
# dot_product_df = PCA(k=1, inputCol="features", outputCol="dot_product").fit(df_vector).transform(similarity_matrix)

# # calculate the weighted average using sum
# weighted_average = similarity_matrix.select(sum(abs(similarity_matrix.columns))).collect()[0][0]

# # divide the dot product by the weighted average
# result = dot_product_df.select("dot_product").rdd.map(lambda x: x[0] / weighted_average).collect()
# result.show()



#-------
## Pivot the similarity matrix
# similarity_matrix = similarity_matrix.groupBy("movieId").pivot("movieId_1").agg({"similarity": "first"}).fillna(0)
# similarity_matrix = similarity_matrix.sort("movieId")
# similarity_matrix.select(similarity_matrix.columns[:10]).show(truncate=False)
# convert columns to vectors
# assembler = VectorAssembler(inputCols=similarity_matrix.columns[1:], outputCol="features")
# similarity_matrix = assembler.transform(similarity_matrix).select('movieId', 'features')
# print('similarity matrix done')

# # convert df2 columns to vectors
# assembler = VectorAssembler(inputCols=df2.columns[1:], outputCol="features")
# df2 = assembler.transform(df2).select('userId', 'features')

# print('starting recommendation matrix')
# # Compute the recommendation matrix
# recommendation_matrix = similarity_matrix.alias("a").crossJoin(df2.alias("b")) \
#     .selectExpr("a.movieId as movieId", "b.userId as userId",
#                 "dot_product_divided_by_sum(a.features, b.features) as recommendation").select("userId", "movieId", "recommendation")

# recommendation_matrix.show(10)