In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import SparkSession

# Directory that holds the input data files read by this notebook.
DATA_DIR = "/data/ml-latest"

# Obtain a Spark session running in local mode ("local[*]" master);
# getOrCreate() reuses an already-running session if there is one.
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [2]:
import os
import pyspark.sql.functions as sql_func

# Read the movie feature table from parquet and mark it for caching,
# since it is referenced twice by the self cross join further down.
tf_idf = (
    spark.read
    .parquet(os.path.join(DATA_DIR, "tf_idf.parquet"))
    .cache()
)

In [3]:
from pyspark.sql import functions as sql_func
from pyspark.sql.types import FloatType
from scipy.spatial.distance import euclidean

# Distance function between two feature vectors, exposed as a Spark UDF.
def _euclidean_distance(x1, x2):
    """Return the Euclidean distance between two vectors as a Python float.

    Args:
        x1: first vector; must provide ``toArray()`` (presumably a
            pyspark.ml vector -- confirm against the ``tf_idf`` column type).
        x2: second vector, same requirements as ``x1``.

    Returns:
        The distance as a built-in ``float``.
    """
    # Depending on the SciPy version (or when swapping in another metric) the
    # result can be a NumPy scalar, which Spark cannot serialise for a
    # FloatType column; float() normalises it and is a no-op for plain floats.
    return float(euclidean(x1.toArray(), x2.toArray()))


# Column-level UDF: distance(col_a, col_b) -> 32-bit float column.
distance = sql_func.udf(_euclidean_distance, returnType=FloatType())

In [4]:
# Pairwise distance table, built lazily: nothing is computed until an action
# runs. The feature table is cross-joined with itself under the aliases
# "one" and "two", and the distance UDF is applied to every pair of vectors.
distance_matrix = (
    tf_idf.alias("one")
    .crossJoin(tf_idf.alias("two"))
    .select(
        sql_func.col("one.movieId"),
        sql_func.col("one.title"),
        sql_func.col("two.movieId"),
        sql_func.col("two.title"),
        distance(
            sql_func.col("one.tf_idf"),
            sql_func.col("two.tf_idf"),
        ).alias("distance"),
    )
)

In [5]:
# Ten closest rows to Harry Potter (movieId 4896), smallest distance first.
# Note: the query movie itself is part of the result (distance 0.0), so the
# first row is the film compared with itself.
(
    distance_matrix
    .filter(sql_func.col("one.movieId") == 4896)
    .sort(sql_func.col("distance"))
    .select(
        sql_func.col("two.movieId"),
        sql_func.col("two.title"),
        sql_func.col("distance"),
    )
    .limit(10)
    .toPandas()
)

Unnamed: 0,movieId,title,distance
0,4896,Harry Potter and the Sorcerer's Stone (a.k.a. ...,0.0
1,5816,Harry Potter and the Chamber of Secrets (2002),242.873566
2,40815,Harry Potter and the Goblet of Fire (2005),258.558289
3,8368,Harry Potter and the Prisoner of Azkaban (2004),322.434418
4,54001,Harry Potter and the Order of the Phoenix (2007),358.139801
5,69844,Harry Potter and the Half-Blood Prince (2009),358.346069
6,88125,Harry Potter and the Deathly Hallows: Part 2 (...,389.009705
7,58105,"Spiderwick Chronicles, The (2008)",431.428925
8,2193,Willow (1988),433.543945
9,135143,Fantastic Beasts and Where to Find Them (2016),435.349213
