In [1]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session running in local mode on all available cores.
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [2]:
import os

# Load the pre-processed Last.fm interactions (user_id, artist_id, plays).
ratings = (
    spark
    .read
    .parquet("/data/other/user_item_lastfm.parquet")
)

In [3]:
# Preview the first 20 rows of the ratings table.
ratings.show(n=20)

+-------+---------+------------------+
|user_id|artist_id|             plays|
+-------+---------+------------------+
|  13879|    49964|1.3862943611198906|
|  88820|    49964|3.4657359027997265|
| 251853|    49964|2.4849066497880004|
| 259287|    49964|1.3862943611198906|
| 283490|    49964|3.8066624897703196|
| 290219|    49964|1.3862943611198906|
| 291686|    49964|1.0986122886681098|
| 309162|    49964| 4.543294782270004|
| 313962|    49964| 5.332718793265369|
| 334113|    49964| 3.258096538021482|
| 338759|    49964|0.6931471805599453|
| 304832|    50207|4.6913478822291435|
|  49685|    50219|3.4339872044851463|
|  99265|    49899| 6.932447891572509|
| 121893|    49899| 4.672828834461906|
| 187072|    49899|3.9889840465642745|
|  14208|    50130|1.6094379124341003|
|  91195|    50130|  4.02535169073515|
| 177518|    50130|5.3471075307174685|
| 198494|    50130| 3.044522437723423|
+-------+---------+------------------+
only showing top 20 rows



In [4]:
# Split 80/20 into train and test with a fixed seed and mark both parts for caching
# (cache() returns the same DataFrame, so it can be applied while unpacking).
train, test = (
    part.cache()
    for part in ratings.randomSplit([0.8, 0.2], seed=42)
)
# Running an action on the training split and displaying its row count.
train.count()

13867106

In [5]:
%%time
from pyspark.ml.recommendation import ALS

# Fit an ALS model with basic settings (only column names, cold-start
# strategy and seed are specified). Note that `als` holds the *fitted model*
# returned by fit(), not the estimator.
als = ALS(
    seed=42,
    ratingCol="plays",
    itemCol="artist_id",
    userCol="user_id",
    coldStartStrategy="drop",
).fit(train)

CPU times: user 50 ms, sys: 0 ns, total: 50 ms
Wall time: 1min 6s


In [6]:
%%time
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluate RMSE of the explicit-feedback model on the held-out split.
predictions = als.transform(test)

# Keep the evaluator object and the metric value under separate names.
# The previous chained assignment (`rmse = evaluator = ...evaluate(...)`)
# left `evaluator` bound to a float rather than to a RegressionEvaluator.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="plays",
    predictionCol="prediction"
)
rmse = evaluator.evaluate(predictions)
print("RMSE на тестовой выборке:", rmse)

RMSE на тестовой выборке: 0.676012035587525
CPU times: user 50 ms, sys: 20 ms, total: 70 ms
Wall time: 34 s


In [8]:
%%time

# Compute the top-50 artists for every user that appears in the test split
# and persist them to parquet (overwriting any previous run).
explicit_filename = "/data/other/lastfm_explicit_top50.parquet"
(
    als
    .recommendForUserSubset(test.select("user_id").distinct(), 50)
    .write
    .mode("overwrite")
    .parquet(explicit_filename)
)

CPU times: user 390 ms, sys: 240 ms, total: 630 ms
Wall time: 36min 52s


In [9]:
# Read the saved explicit-model recommendations back and mark them for caching.
explicit_recs = spark.read.parquet(explicit_filename).cache()

In [10]:
# Each row holds a user_id and its top-50 recommended artists as a list of
# (artist_id, rating) structs; show the first row as an example.
explicit_recs.first()

Row(user_id=158, recommendations=[Row(artist_id=131459, rating=5.145086288452148), Row(artist_id=135146, rating=4.845714092254639), Row(artist_id=63139, rating=4.6202874183654785), Row(artist_id=112799, rating=4.481029987335205), Row(artist_id=90171, rating=4.4052205085754395), Row(artist_id=32538, rating=4.351135730743408), Row(artist_id=116761, rating=4.325567245483398), Row(artist_id=34012, rating=4.317909240722656), Row(artist_id=25635, rating=4.248568058013916), Row(artist_id=96604, rating=4.2086567878723145), Row(artist_id=122820, rating=4.164421558380127), Row(artist_id=119201, rating=4.153027534484863), Row(artist_id=95128, rating=4.1034040451049805), Row(artist_id=95770, rating=4.049783229827881), Row(artist_id=11097, rating=4.03601598739624), Row(artist_id=77251, rating=4.027595043182373), Row(artist_id=78238, rating=4.007275581359863), Row(artist_id=27530, rating=3.9764175415039062), Row(artist_id=37412, rating=3.9198319911956787), Row(artist_id=33897, rating=3.9112632274627

In [11]:
import pyspark.sql.functions as sql_func
from pyspark.mllib.evaluation import RankingMetrics

# Precision@k computed with Spark's built-in RankingMetrics.
def get_precision(recs, k, ground_truth=None):
    """Return precision@k of `recs` against held-out interactions.

    Parameters
    ----------
    recs : DataFrame
        One row per user with columns `user_id` and `recommendations`, where
        `recommendations` is an array of structs that have an `artist_id`
        field (the layout shown for `explicit_recs` in this notebook).
        The array is assumed to be ordered best-first — TODO confirm against
        the ALSModel.recommendForUserSubset documentation.
    k : int
        Cut-off passed to `RankingMetrics.precisionAt`.
    ground_truth : DataFrame, optional
        Interactions with `user_id` and `artist_id` columns that are treated
        as the relevant items. Defaults to the notebook-level `test` split.

    Returns
    -------
    float
        The value of `RankingMetrics(...).precisionAt(k)` over the users
        present in both `recs` and `ground_truth`.
    """
    if ground_truth is None:
        # Fall back to the global test split so existing two-argument calls keep working.
        ground_truth = test

    # Pull the artist ids straight out of the array of structs. This keeps the
    # stored order of the recommendations, whereas explode -> groupBy ->
    # collect_list gives no ordering guarantee after the shuffle, which makes
    # precision@k for k smaller than the list length unreliable.
    predictions = recs.select(
        "user_id",
        sql_func.col("recommendations.artist_id").alias("prediction"),
    )

    # One row per user: (recommended artist ids, artists actually listened to).
    predictions_and_labels = (
        ground_truth
        .groupBy("user_id")
        .agg(sql_func.collect_list("artist_id").alias("label"))
        .join(predictions, "user_id")
        .select("prediction", "label")
        .rdd
    )
    return RankingMetrics(predictions_and_labels).precisionAt(k)

In [12]:
# Precision@50 of the explicit-feedback model turns out to be extremely low.
get_precision(explicit_recs, k=50)

1.8944459302843921e-06

In [13]:
%%time

# Fit an ALS model in implicit-feedback mode (implicitPrefs=True); all other
# arguments are the same as for the explicit model. `implicit_als` holds the
# fitted model returned by fit().
implicit_als = ALS(
    seed=42,
    ratingCol="plays",
    itemCol="artist_id",
    userCol="user_id",
    coldStartStrategy="drop",
    implicitPrefs=True,
).fit(train)

CPU times: user 50 ms, sys: 20 ms, total: 70 ms
Wall time: 1min 14s


In [14]:
# Evaluate RMSE of the implicit-feedback model on the held-out split.
# NOTE(review): with implicitPrefs=True the model presumably predicts a
# preference score rather than the `plays` value itself, so this RMSE is
# likely not comparable with the explicit model's — confirm against the
# Spark collaborative-filtering guide.
predictions = implicit_als.transform(test)

# Keep the evaluator object and the metric value under separate names.
# The previous chained assignment (`rmse = evaluator = ...evaluate(...)`)
# left `evaluator` bound to a float rather than to a RegressionEvaluator.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="plays",
    predictionCol="prediction"
)
rmse = evaluator.evaluate(predictions)
print("RMSE на тестовой выборке:", rmse)

RMSE на тестовой выборке: 4.488987785630976


In [15]:
# Parquet location where the implicit model's top-50 recommendations are stored.
implicit_filename = "/data/other/lastfm_implicit_top50.parquet"

In [16]:
%%time

# Compute the top-50 artists for every user in the test split with the
# implicit model and persist them to parquet (overwriting any previous run).
(
    implicit_als
    .recommendForUserSubset(test.select("user_id").distinct(), 50)
    .write
    .mode("overwrite")
    .parquet(implicit_filename)
)

CPU times: user 410 ms, sys: 90 ms, total: 500 ms
Wall time: 37min 25s


In [17]:
# Read the saved implicit-model recommendations back and mark them for caching.
implicit_recs = spark.read.parquet(implicit_filename).cache()

In [18]:
# Precision@50 of the implicit-feedback model is far higher than the explicit one's.
get_precision(implicit_recs, k=50)

0.04110735936524919

In [19]:
# Precision of only the single top-ranked recommendation per user.
get_precision(implicit_recs, k=1)

0.0960679103147009