In [34]:
import os
import sys
import pyspark.pandas as ps
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import Normalizer
from pyspark.ml.feature import StandardScaler
from pyspark.ml.linalg import DenseVector, Vectors
from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.ml.feature import PCA
from pyspark.sql.types import DoubleType
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import SQLTransformer
from pyspark.ml.stat import Correlation
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf, col, when, count, sum as pyspark_sum

# Build (or reuse) the Spark session for this notebook.
# A parenthesized chain is used instead of backslash continuations so each
# setting can carry its own comment.
spark = (
    SparkSession.builder
    .appName("recommenderTest")
    # Placeholder key/value pair; it has no effect on Spark itself.
    .config("spark.some.config.option", "some-value")
    # Memory for the executor and driver JVMs.
    .config("spark.executor.memory", "7g")
    .config("spark.driver.memory", "7g")
    # Number of partitions used for shuffles (joins, aggregations).
    .config("spark.sql.shuffle.partitions", "32")
    # Upper bound on distinct pivot values; the pivots below create one
    # column per distinct id.
    .config("spark.sql.pivotMaxValues", "20000")
    # Run locally using all available cores.
    .config("spark.master", "local[*]")
    .getOrCreate()
)
# --- Data Pre-Processing ---
def pearson_average(v):
    """
    Mean-center the non-zero entries of a rating vector.

    Zeros are treated as "no rating": they are excluded from the mean and
    stay zero in the result. The mean is the sum of the entries divided by
    the number of non-zero entries.

    Args:
        v: An iterable of numbers (for example a PySpark ``DenseVector`` or
            a plain list).

    Returns:
        The result of ``Vectors.dense`` on a sequence with the same length as
        ``v`` in which every non-zero entry has had the mean subtracted.
        If ``v`` has no non-zero entries (including the empty vector), an
        all-zero vector of the same length is returned instead of dividing
        by zero.
    """
    values = list(v)
    rated = [e for e in values if e != 0]

    # Guard: with no rated entries the mean is undefined (0 / 0).
    if not rated:
        return Vectors.dense([0.0] * len(values))

    # Zeros contribute nothing to the sum, so summing only the rated
    # entries matches summing the whole vector.
    mean = sum(rated) / len(rated)

    # Subtract the mean from rated entries only; unrated entries stay 0.
    return Vectors.dense([e - mean if e != 0 else 0 for e in values])

def co_sym(x, y):
    """
    Cosine similarity between two vectors.

    Args:
        x: A vector exposing ``.dot(other)`` (e.g. a PySpark ``DenseVector``).
        y: A vector of the same length as ``x``.

    Returns:
        ``x . y / (||x||_2 * ||y||_2)`` as a Python ``float``. If either
        vector has zero L2 norm the similarity is undefined, and ``0.0`` is
        returned instead of dividing by zero.
    """
    # NOTE(review): the inputs are used as-is (no mean-centering). The output
    # recorded with this cell contains negative similarities, which raw cosine
    # on non-negative ratings cannot produce -- confirm whether the inputs
    # were meant to go through pearson_average first.
    denominator = Vectors.norm(x, 2) * Vectors.norm(y, 2)

    # Guard: a zero-norm vector would otherwise give NaN or raise.
    if denominator == 0:
        return 0.0

    return float(x.dot(y) / denominator)

# Wrap co_sym as a Spark UDF returning a double, then register it under the
# SQL name "dot_udf" so it can be called from selectExpr / SQL strings.
dot_udf = udf(f=co_sym, returnType=DoubleType())
spark.udf.register(name="dot_udf", f=dot_udf)
# Read the ratings file (columns: userId, movieId, rating, timestamp) with a
# header row and inferred column types; the timestamp column is dropped.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("data/ratings_tiny.csv")
    .drop("timestamp")
)

# Build a wide table: one row per movieId, one column per userId, holding the
# first rating found for that pair. Missing pairs become 0. Rows are ordered
# by movieId.
df = (
    df.groupBy("movieId")
    .pivot("userId")
    .agg({"rating": "first"})
    .fillna(0)
    .sort("movieId")
)

# Preview: first 10 columns, first 10 rows.
df.select(*df.columns[:10]).show(10)
# Swap rows and columns via pandas, then return to Spark.
# toPandas() collects the whole table onto the driver, so this only suits
# data that fits in driver memory.
# After reset_index() the first column is named "index" and holds the former
# column labels; the remaining columns are the former movieId values.
# NOTE(review): those former column labels are the values the earlier pivot
# spread across columns (userId) -- confirm rows are meant to be per user.
df = spark.createDataFrame(
    df.toPandas().set_index("movieId").T.reset_index()
)

# Every column except the leading "index" column is a feature.
cols = df.columns[1:]

# Pack the feature columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=cols, outputCol="features")
df_vector = assembler.transform(df).select("index", "features")
# Pairwise similarity in long form: pair every row with every other row
# (self-pairs excluded) and score each pair with the registered dot_udf.
# NOTE(review): the output labels say "movieId", but the values come from
# df_vector's "index" column -- confirm these are movie ids and not user ids.
similarity_matrix = (
    df_vector.alias("a")
    .crossJoin(df_vector.alias("b"))
    .where("a.index != b.index")
    .selectExpr(
        "a.index as movieId",
        "b.index as movieId_1",
        "dot_udf(a.features, b.features) as similarity",
    )
)

# Preview the long-form result.
similarity_matrix.show(10)

# Reshape to a square matrix: one row per movieId, one column per movieId_1.
# The excluded self-pairs have no value and are filled with 0.
similarity_matrix = (
    similarity_matrix.groupBy("movieId")
    .pivot("movieId_1")
    .agg({"similarity": "first"})
    .fillna(0)
)

# Preview: first 10 columns, first 10 rows.
similarity_matrix.select(*similarity_matrix.columns[:10]).show(10)

23/03/23 00:04:06 WARN SimpleFunctionRegistry: The function dot_udf replaced a previously registered function.
+-------+---+---+---+---+---+---+
|movieId|  1|  2|  3|  4|  5|  6|
+-------+---+---+---+---+---+---+
|      1|  1|  0|  2|  0|  0|  1|
|      2|  0|  0|  4|  2|  0|  0|
|      3|  3|  5|  0|  4|  4|  3|
|      4|  0|  4|  1|  0|  3|  0|
|      5|  0|  0|  2|  5|  4|  3|
|      6|  5|  0|  0|  0|  2|  0|
|      7|  0|  4|  3|  0|  0|  0|
|      8|  0|  0|  0|  4|  0|  2|
|      9|  5|  0|  4|  0|  0|  0|
|     10|  0|  2|  3|  0|  0|  0|
+-------+---+---+---+---+---+---+
only showing top 10 rows



  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


+-------+---------+--------------------+
|movieId|movieId_1|          similarity|
+-------+---------+--------------------+
|      1|        2|-0.17854212213729673|
|      1|        3| 0.41403933560541256|
|      1|        4|-0.10245014273309601|
|      1|        5|-0.30895719032666236|
|      1|        6|  0.5870395085642741|
|      2|        1|-0.17854212213729673|
|      2|        3| -0.5262348115842176|
|      2|        4| 0.46800784077976626|
|      2|        5| 0.39891071573694176|
|      2|        6| -0.3064397582621859|
+-------+---------+--------------------+
only showing top 10 rows





+-------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+
|movieId|                   1|                   2|                  3|                   4|                   5|                   6|
+-------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+
|      6|  0.5870395085642741| -0.3064397582621859| 0.5063696835418333| -0.2353393621658208|-0.21591675854376524|                 0.0|
|      1|                 0.0|-0.17854212213729673|0.41403933560541256|-0.10245014273309601|-0.30895719032666236|  0.5870395085642741|
|      3| 0.41403933560541256| -0.5262348115842176|                0.0| -0.6239806502223061| -0.2842676218074806|  0.5063696835418333|
|      2|-0.17854212213729673|                 0.0|-0.5262348115842176| 0.46800784077976626| 0.39891071573694176| -0.3064397582621859|
|      4|-0.10245014273309601| 0.46800784077976626|-0.6

                                                                                