In [1]:
import os
import time 
import datetime
from itertools import product 

import numpy as np
import pandas as pd
import pprint

from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
from pyspark.sql.functions import (
    max, avg, sum, count, countDistinct,
    percentile_approx, col, asc, desc, collect_list,
    lit, rand, when, to_date, collect_set, explode
)
from pyspark.ml.evaluation import RankingEvaluator, RegressionEvaluator
from pyspark.ml.recommendation import ALS 

from scipy.stats import loguniform, randint

spark = SparkSession.builder.master('spark://cm001:61086').getOrCreate()
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
spark.conf.set("spark.sql.adaptive.enabled", False)
# 16 cores times 8 cpus = 128 partitions * 2 = 384 partitions
spark.conf.set("spark.sql.shuffle.partitions", 256)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/05 09:49:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
def gen_utiltiy_matrix(interactions, tracks):
    interactions.createOrReplaceTempView('interactions')
    tracks.createOrReplaceTempView('tracks')

    listens_per_user_track = spark.sql(
        """
        SELECT user_id,universal_id,sum(num_listens) as num_listens
        FROM interactions
        LEFT JOIN tracks
        ON tracks.recording_msid=interactions.recording_msid
        GROUP BY user_id,universal_id
        """
    )

    listens_per_user = listens_per_user_track.select(
        listens_per_user_track.user_id, listens_per_user_track.num_listens
    ).groupBy('user_id').agg(
        sum(listens_per_user_track.num_listens).alias('total_listens')
    )

    listens_per_user = listens_per_user.withColumn(
        'use_for_fit',
        when(
            listens_per_user.total_listens >= 500, True
        ).otherwise(
            False
        )
    )

    normed_listens_per_user_track = listens_per_user_track.join(listens_per_user, how='left', on='user_id')
    normed_listens_per_user_track = normed_listens_per_user_track.withColumn(
        "prop_listens",
        col("num_listens")/col("total_listens")
    ).select(
        ['user_id', 'universal_id', 'prop_listens', 'use_for_fit']
    ).orderBy(
        col('user_id').asc(),
        col('prop_listens').desc()
    )

    return normed_listens_per_user_track

def calc_performance_metrics_als(predicted, actual, calc_ndcg: bool = False):
    actual_compressed = actual.groupBy(
        'user_id'
    ).agg(
        collect_list(col('universal_id').astype('double')).alias('universal_id'),
        collect_list(col('prop_listens').astype('double')).alias('prop_listens')
    )

    predicted_compressed = predicted.withColumn(
        "recommendations", explode(col("recommendations"))
    ).select("user_id", "recommendations.universal_id", "recommendations.rating")

    predicted_compressed = predicted_compressed.withColumn(
        "rn", F.row_number().over(Window.partitionBy("user_id").orderBy(F.col("rating").desc()))
    ).groupBy("user_id").agg(F.collect_list(F.col("universal_id")).astype('array<double>').alias("predicted_universal_id"))

    results = actual_compressed.join(
        predicted_compressed,
        how='inner',
        on='user_id'
    )
    
    mapAtK = RankingEvaluator(
        predictionCol='predicted_universal_id',
        labelCol='universal_id',
        metricName='meanAveragePrecisionAtK',
        k=100
    )
    
    if calc_ndcg:
        ndcgAtK = RankingEvaluator(
            predictionCol='predicted_universal_id',
            labelCol='universal_id',
            metricName='ndcgAtK',
            k=100
        )
        return (mapAtK.evaluate(results), ndcgAtK.evaluate(results))
    
    return mapAtK.evaluate(results)

In [3]:
interactions_train = spark.read.parquet('interactions_split_train.parquet')
interactions_val = spark.read.parquet('interactions_split_val.parquet')
# interactions_test = spark.read.parquet("/scratch/work/courses/DSGA1004-2021/listenbrainz/interactions_test.parquet")

tracks_train = spark.read.parquet('tracks_train.parquet')
# tracks_test = spark.read.parquet('tracks_test.parquet')

                                                                                

In [4]:
utility_mat_train = gen_utiltiy_matrix(interactions_train, tracks_train)
utility_mat_val = gen_utiltiy_matrix(interactions_val, tracks_train)
# utility_mat_test = gen_utiltiy_matrix(interactions_test, tracks_test)
# utility_mat_train = spark.read.parquet('utility_mat_train_sample_nr.parquet')

utility_mat_train = utility_mat_train.filter(
    utility_mat_train.use_for_fit
)
# utility_mat_train.sample(withReplacement=False, fraction=0.1, seed=69).write.parquet('utility_mat_train_sample_nr.parquet')

In [None]:
# Handle different parameters 
SEED = 69
TRIALS = 100
MAXITER = 10

# Utilize random search to find optimal hyperparameters 
# NOTE: alpha is not tuned 

training_results = []
start = time.perf_counter()

for r in range(10, 100, 10):
    als = ALS(
        maxIter=MAXITER, 
        implicitPrefs=True,
        nonnegative = True, 
        rank=r, 
        seed=SEED, 
        userCol='user_id',
        itemCol='universal_id',
        ratingCol='prop_listens',
        coldStartStrategy='drop'
    ) 
    
    # Calculate MAP
    model = als.fit(utility_mat_train)
    
#     predictions_train =  model.recommendForUserSubset(utility_mat_train.select('user_id').distinct(), 100)
    predictions_val =  model.recommendForUserSubset(utility_mat_val.select('user_id').distinct(), 100)
    
#     map_train, ndcg_train = calc_performance_metrics_als(predictions_train, utility_mat_train)
    map_val = calc_performance_metrics_als(predictions_val, utility_mat_val)

    elapsed = datetime.timedelta(seconds=time.perf_counter() - start)
    start = time.perf_counter()
    
    training_results.append({
        "maxIter": MAXITER, 
        "seed": SEED,
        "elapsed": str(elapsed),
        "rank": r, 
        "map_val": map_val,
        "trial": r + 1,
#         "map_train": map_train,
#         "ndcg_train": ndcg_train,
#         "ndcg_val": ndcg_val,
    })
    
    pprint.pprint(training_results[-1], width=1)
    pd.DataFrame(training_results).to_csv('./results_runs/als_results_rank_step_10.csv')

                                                                                

{'elapsed': '0:53:26.996693',
 'map_val': 0.12051327034679808,
 'maxIter': 10,
 'rank': 50,
 'seed': 69,
 'trial': 51}


[Stage 717:>                                                      (0 + 10) / 10]