In [1]:
# Create the Spark session
from pyspark.sql import SparkSession

# Configure the builder step by step: local master using all available cores
# ("local[*]") and a 6 GB driver memory setting, then obtain the session
# (an already-running session is reused by getOrCreate).
session_builder = SparkSession.builder.master("local[*]")
session_builder = session_builder.config("spark.driver.memory", "6g")
spark = session_builder.getOrCreate()

In [2]:
# Ratings file: the user x item matrix
import os
import pyspark.sql.functions as sql_func

DATA_DIR = "/data/ml-latest"

# Load ratings.csv with its header row and let Spark infer the column types;
# the timestamp column is dropped right after loading.
ratings_path = os.path.join(DATA_DIR, "ratings.csv")
ratings_reader = spark.read
ratings = ratings_reader.csv(ratings_path, header=True, inferSchema=True).drop("timestamp")

In [3]:
# Per-movie aggregates:
#   sum_of_squares - sum of squared ratings of the movie
#   watched_one    - count of userId values recorded for the movie
sum_of_squares = sql_func.sum(
    sql_func.pow(sql_func.col("rating"), 2)
).alias("sum_of_squares")
watched_one = sql_func.count("userId").alias("watched_one")

# The result is marked for caching.
popularities = ratings.groupBy("movieId").agg(sum_of_squares, watched_one).cache()

In [4]:
# Load the stored half co-occurrence table from parquet.
half_cooccurrences_path = "/data/other/half_cooccurrences.parquet"
half_cooccurrences = spark.read.parquet(half_cooccurrences_path)

In [5]:
# Aggregates needed for the distance computation - takes about 15 minutes to run.

# The same popularity table is joined twice, once per movie of the pair,
# so each copy gets its own alias to keep column references unambiguous.
pop_first = popularities.alias("pop1")
pop_second = popularities.alias("pop2")

first_movie_matches = sql_func.col("pop1.movieId") == sql_func.col("movieId1")
second_movie_matches = sql_func.col("pop2.movieId") == sql_func.col("movieId2")

pre_distance_matrix = (
    half_cooccurrences
    .join(pop_first, first_movie_matches)
    .join(pop_second, second_movie_matches)
    .select(
        "movieId1",
        "movieId2",
        sql_func.col("pop1.watched_one").alias("watched1"),
        sql_func.col("pop2.watched_one").alias("watched2"),
        sql_func.col("pop1.sum_of_squares").alias("sum_of_squares1"),
        sql_func.col("pop2.sum_of_squares").alias("sum_of_squares2"),
        "inner_product",
        "watched_both",
    )
)

# Persist the joined table, replacing any previous output at this path.
pre_distance_matrix.write.mode("overwrite").parquet("/data/other/pre_distance_matrix.parquet")