In [10]:
import os
import sys
import pyspark.pandas as ps
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import Normalizer
from pyspark.ml.feature import StandardScaler
from pyspark.ml.linalg import DenseVector, Vectors
from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.ml.feature import PCA
from pyspark.sql.types import DoubleType
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import SQLTransformer
from pyspark.ml.stat import Correlation
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf, col, when, count, sum as pyspark_sum

# Build (or reuse, via getOrCreate) the Spark session used by the rest of this script.
spark = (
    SparkSession.builder
    .appName("recommenderTest")
    # NOTE(review): looks like a placeholder setting — confirm whether it is needed.
    .config("spark.some.config.option", "some-value")
    # Memory settings for the executor and the driver.
    .config("spark.executor.memory", "7g")
    .config("spark.driver.memory", "7g")
    # Number of partitions used for shuffles in Spark SQL.
    .config("spark.sql.shuffle.partitions", "32")
    # Upper bound on distinct pivot values; the pivots further down create
    # one column per userId / movieId.
    .config("spark.sql.pivotMaxValues", "20000")
    # Run locally, using all available cores.
    .config("spark.master", "local[*]")
    .getOrCreate()
)
# --- Data Pre-Processing ---
def pearson_average(v):
    """
    Mean-centre the non-zero entries of a vector.

    Zero entries are treated as "no value": they are excluded from the mean
    and stay 0 in the result. Every non-zero entry has the mean of the
    non-zero entries subtracted from it.

    Args:
        v: An iterable of numbers (co_sym passes the vectors it receives).

    Returns:
        A dense vector (built with ``Vectors.dense``) with one entry per
        entry of ``v``. If ``v`` has no non-zero entries, an all-zero vector
        is returned instead of dividing by zero.
    """
    # Materialise once so the input is walked a single time.
    values = list(v)
    nonzero = [e for e in values if e != 0]

    # No non-zero entries: the mean is undefined, and there is nothing to centre.
    if not nonzero:
        return Vectors.dense([0.0] * len(values))

    # Summing only the non-zero entries equals summing everything,
    # because the skipped entries are all zero.
    mean = sum(nonzero) / len(nonzero)

    # Subtract the mean from non-zero entries only; zeros remain zeros.
    return Vectors.dense([e - mean if e != 0 else 0.0 for e in values])

def co_sym(x, y):
    """
    Cosine similarity of two vectors after mean-centring their non-zero entries.

    Both inputs are first passed through ``pearson_average``; the result is
    the dot product of the centred vectors divided by the product of their
    L2 norms.

    Args:
        x: First vector.
        y: Second vector.

    Returns:
        float: The similarity, or 0.0 when either centred vector has zero
        L2 norm (the ratio is undefined in that case and would otherwise be
        NaN or raise a division error).
    """
    centred_x = pearson_average(x)
    centred_y = pearson_average(y)

    denominator = Vectors.norm(centred_x, 2) * Vectors.norm(centred_y, 2)
    # A zero norm happens e.g. when all non-zero entries of a vector are equal,
    # so centring turns every entry into 0.
    if denominator == 0:
        return 0.0

    # float() converts the result to a plain Python float for the DoubleType UDF.
    return float(centred_x.dot(centred_y) / denominator)

# Wrap co_sym as a Spark UDF returning a double.
dot_udf = udf(f=co_sym, returnType=DoubleType())
# Alternative kept for reference: cosine similarity without the mean-centring step.
# dot_udf = udf(lambda x, y: float(x.dot(y)/(Vectors.norm(x,2)*Vectors.norm(y,2))), DoubleType())
# Register under the name "dot_udf" so SQL expression strings can call it.
spark.udf.register(name="dot_udf", f=dot_udf)

# Read the ratings CSV (columns: userId, movieId, rating, timestamp) and
# discard the timestamp column, which nothing below reads.
df = (
    spark.read.csv("data/ratings_tiny.csv", header=True, inferSchema=True)
    .drop("timestamp")
)

# Reshape to one row per movieId with one column per userId holding the
# rating; combinations with no rating are filled with 0.
df = (
    df.groupBy("movieId")
    .pivot("userId")
    .agg({"rating": "first"})
    .fillna(0)
)

# Pack every column after the first (movieId) into a single vector column,
# keep only the id and the vector, and spread the rows over 10 partitions.
assembler = VectorAssembler(inputCols=df.columns[1:], outputCol="features")
df_vector = (
    assembler.transform(df)
    .select("movieId", "features")
    .repartition(10)
)

# Pair every movie with every *other* movie and score each pair with the
# registered dot_udf, giving long-format rows (movieId, movieId_1, similarity).
similarity_matrix = (
    df_vector.alias("a")
    .crossJoin(df_vector.alias("b"))
    .where("a.movieId != b.movieId")
    .selectExpr(
        "a.movieId as movieId",
        "b.movieId as movieId_1",
        "dot_udf(a.features, b.features) as similarity",
    )
)

# Preview the first 10 rows; truncate=10 shortens each displayed cell to
# 10 characters (it does not limit the number of columns).
print("similarity_matrix")
similarity_matrix.show(n=10, truncate=10)

# Pivot the long-format pairs into a wide matrix: one row per movieId, one
# column per movieId_1. Missing cells (including the excluded self-pairs)
# are filled with 0.
similarity_matrix = (
    similarity_matrix.groupBy("movieId")
    .pivot("movieId_1")
    .agg({"similarity": "first"})
    .fillna(0)
)
# Display 10 rows of the first (up to) 10 columns of the wide matrix.
similarity_matrix.select(similarity_matrix.columns[:10]).show(10)

23/03/24 02:18:40 WARN SimpleFunctionRegistry: The function dot_udf replaced a previously registered function.
similarity_matrix
+-------+---------+----------+
|movieId|movieId_1|similarity|
+-------+---------+----------+
|      4|        3|-0.6239...|
|      4|        6|-0.2353...|
|      4|        5|0.45873...|
|      4|        1|-0.1024...|
|      4|        2|0.46800...|
|      3|        4|-0.6239...|
|      3|        6|0.50636...|
|      3|        5|-0.2842...|
|      3|        1|0.41403...|
|      3|        2|-0.5262...|
+-------+---------+----------+
only showing top 10 rows

+-------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+
|movieId|                   1|                   2|                  3|                   4|                   5|                   6|
+-------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+
|     