In [1]:
import time 
import datetime
import optuna 

import pandas as pd
import pprint

from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
from pyspark.sql.functions import (
    sum, col, collect_list,
    when, explode
)

from pyspark.ml.evaluation import RankingEvaluator 
from pyspark.ml.recommendation import ALS 


# Attach to the standalone Spark master; getOrCreate() reuses an already-active session if one exists.
# NOTE(review): the master host/port is hard-coded — confirm it matches the currently running cluster.
spark = SparkSession.builder.master('spark://cm010:28844').getOrCreate()
# Optional tuning knobs, currently disabled:
# spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
# spark.conf.set("spark.sql.adaptive.enabled", False)
# 16 cores times 8 cpus = 128 task slots; 2 partitions per slot = 256 partitions
# spark.conf.set("spark.sql.shuffle.partitions", 256)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/11 15:35:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
def gen_utiltiy_matrix(interactions, tracks, min_listens: int = 500):
    """Build a per-user, per-item table of normalised listen proportions.

    Listens are summed per (user_id, universal_id) after left-joining
    `interactions` to `tracks` on `recording_msid`, then divided by the
    user's total listens.

    Parameters
    ----------
    interactions : pyspark.sql.DataFrame
        Must expose `user_id`, `recording_msid` and `num_listens`.
    tracks : pyspark.sql.DataFrame
        Must expose `recording_msid`; presumably also supplies `universal_id`
        (the column is referenced unqualified in the query — verify).
    min_listens : int, default 500
        Users whose total listens are at least this value get
        `use_for_fit = True`; everyone else gets False.

    Returns
    -------
    pyspark.sql.DataFrame
        Columns `user_id`, `universal_id`, `prop_listens`, `use_for_fit`,
        ordered by `user_id` ascending then `prop_listens` descending.

    Notes
    -----
    Relies on the module-level `spark` session and (re)registers the temp
    views `interactions` and `tracks` on it as a side effect.
    """
    interactions.createOrReplaceTempView('interactions')
    tracks.createOrReplaceTempView('tracks')

    listens_per_user_track = spark.sql(
        """
        SELECT user_id,universal_id,sum(num_listens) as num_listens
        FROM interactions
        LEFT JOIN tracks
        ON tracks.recording_msid=interactions.recording_msid
        GROUP BY user_id,universal_id
        """
    )

    # Per-user totals; F.sum is used explicitly so the Python builtin `sum`
    # is never confused with the Spark aggregate.
    listens_per_user = listens_per_user_track.groupBy('user_id').agg(
        F.sum('num_listens').alias('total_listens')
    )

    # when/otherwise (rather than the bare comparison) keeps the flag False,
    # not NULL, when total_listens is NULL.
    listens_per_user = listens_per_user.withColumn(
        'use_for_fit',
        when(col('total_listens') >= min_listens, True).otherwise(False)
    )

    normed_listens_per_user_track = listens_per_user_track.join(
        listens_per_user, how='left', on='user_id'
    )
    normed_listens_per_user_track = normed_listens_per_user_track.withColumn(
        "prop_listens",
        col("num_listens") / col("total_listens")
    ).select(
        ['user_id', 'universal_id', 'prop_listens', 'use_for_fit']
    ).orderBy(
        col('user_id').asc(),
        col('prop_listens').desc()
    )

    return normed_listens_per_user_track

def calc_performance_metrics_als(predicted, actual, calc_ndcg: bool = False, k: int = 100):
    """Score ranked recommendations against observed interactions.

    Parameters
    ----------
    predicted : pyspark.sql.DataFrame
        Needs `user_id` and a `recommendations` array of structs carrying
        `universal_id` and `rating` fields.
    actual : pyspark.sql.DataFrame
        Needs `user_id` and `universal_id`; every item listed for a user is
        treated as relevant.
    calc_ndcg : bool, default False
        When True, NDCG@k is computed in addition to MAP@k.
    k : int, default 100
        Cut-off passed to the ranking evaluators.

    Returns
    -------
    float or tuple of float
        MAP@k when `calc_ndcg` is False, otherwise `(MAP@k, NDCG@k)`.
        Only users present in both inputs are scored (inner join).
    """
    # Ground truth: one array of item ids per user.
    actual_compressed = actual.groupBy(
        'user_id'
    ).agg(
        collect_list(col('universal_id').astype('double')).alias('universal_id')
    )

    exploded = predicted.withColumn(
        "recommendations", explode(col("recommendations"))
    ).select("user_id", "recommendations.universal_id", "recommendations.rating")

    # collect_list gives no ordering guarantee after the groupBy shuffle, and
    # MAP/NDCG depend on the order of the predicted array. Collect
    # (rating, id) structs and sort each array by rating descending so the
    # ranking is explicit, then keep only the ids.
    predicted_compressed = exploded.groupBy("user_id").agg(
        F.sort_array(
            F.collect_list(F.struct(F.col("rating"), F.col("universal_id"))),
            asc=False
        ).alias("ranked")
    ).select(
        "user_id",
        F.col("ranked.universal_id").astype('array<double>').alias("predicted_universal_id")
    )

    results = actual_compressed.join(
        predicted_compressed,
        how='inner',
        on='user_id'
    )

    def _evaluate(metric_name):
        # Both metrics share the same columns and cut-off.
        return RankingEvaluator(
            predictionCol='predicted_universal_id',
            labelCol='universal_id',
            metricName=metric_name,
            k=k
        ).evaluate(results)

    map_at_k = _evaluate('meanAveragePrecisionAtK')

    if calc_ndcg:
        return (map_at_k, _evaluate('ndcgAtK'))

    return map_at_k

In [None]:
# Train/validation interaction splits, read from paths relative to the working directory.
interactions_train = spark.read.parquet('interactions_split_train.parquet')
interactions_val = spark.read.parquet('interactions_split_val.parquet')
# NOTE(review): absolute cluster path — this cell only runs where that location is mounted.
interactions_test = spark.read.parquet("/scratch/work/courses/DSGA1004-2021/listenbrainz/interactions_test.parquet")

# Track tables that gen_utiltiy_matrix joins to the interactions on recording_msid.
tracks_train = spark.read.parquet('tracks_train.parquet')
tracks_test = spark.read.parquet('tracks_test.parquet')

                                                                                

In [None]:
# Training matrix: build it, then keep only the rows flagged use_for_fit.
utility_mat_train = gen_utiltiy_matrix(interactions_train, tracks_train)
utility_mat_train = utility_mat_train.where(col('use_for_fit'))

# Validation interactions are mapped through the training track table.
utility_mat_val = gen_utiltiy_matrix(interactions_val, tracks_train)

# Test interactions use their own track table; val and test are left unfiltered.
utility_mat_test = gen_utiltiy_matrix(interactions_test, tracks_test)

# utility_mat_train = spark.read.parquet('utility_mat_train_sample.parquet')
# utility_mat_train.sample(withReplacement=True, fraction=0.1, seed=69).write.parquet('utility_mat_train_sample.parquet')

In [None]:
# Search configuration
SEED = 69       # passed to both ALS and the Optuna sampler below
TRIALS = 500    # n_trials handed to study.optimize
MAXITER = 10    # ALS maxIter used for every trial

# Hyperparameters are searched with Optuna's TPE sampler (see the study created at the end of this cell).
# rank, alpha and regParam are all sampled inside `objective`.

# NOTE(review): neither name below is referenced again in the visible cells — confirm before removing.
training_results = []
start = time.perf_counter()

def objective(trial):
    """Optuna objective: fit implicit-feedback ALS and return validation MAP.

    Samples `rank`, `alpha` and `regParam` from the trial, fits ALS on
    `utility_mat_train`, recommends 100 items for every distinct user in
    `utility_mat_val`, and scores those recommendations with
    `calc_performance_metrics_als`. Only the validation metric is computed;
    nothing is evaluated on the training set here.
    """
    # Suggest calls stay in the order rank -> alpha -> regParam.
    sampled_rank = trial.suggest_int('rank', 1, 50)
    sampled_alpha = trial.suggest_float('alpha', 1e-3, 1e3, log=True)
    sampled_reg = trial.suggest_float('regParam', 1e-2, 1e5, log=True)

    als_kwargs = dict(
        maxIter=MAXITER,
        alpha=sampled_alpha,
        regParam=sampled_reg,
        implicitPrefs=True,
        nonnegative=True,
        rank=sampled_rank,
        seed=SEED,
        userCol='user_id',
        itemCol='universal_id',
        ratingCol='prop_listens',
        coldStartStrategy='drop',
    )
    fitted_model = ALS(**als_kwargs).fit(utility_mat_train)

    val_users = utility_mat_val.select('user_id').distinct()
    val_recommendations = fitted_model.recommendForUserSubset(val_users, 100)

    return calc_performance_metrics_als(val_recommendations, utility_mat_val)
    
# Create the study, or resume it from optuna.db when one named 'als_search' already exists
# (load_if_exists=True), so trials accumulate across notebook runs.
study = optuna.create_study(
    study_name='als_search',
    direction='maximize', 
    sampler=optuna.samplers.TPESampler(seed=SEED), 
    storage='sqlite:///optuna.db', 
    load_if_exists=True
)
# Runs TRIALS further trials; each one fits ALS and scores it on the validation set.
study.optimize(objective, n_trials=TRIALS)

[32m[I 2023-05-10 13:32:27,469][0m Using an existing study with name 'als_search' instead of creating a new one.[0m
[32m[I 2023-05-10 13:45:26,175][0m Trial 41 finished with value: 0.07904716251052674 and parameters: {'rank': 9, 'alpha': 139.96114660196292, 'regParam': 0.15459224428752139}. Best is trial 35 with value: 0.1371739926525655.[0m
[32m[I 2023-05-10 14:39:34,088][0m Trial 42 finished with value: 0.13690620298938463 and parameters: {'rank': 50, 'alpha': 437.6193598547283, 'regParam': 0.02554729728351489}. Best is trial 35 with value: 0.1371739926525655.[0m
[32m[I 2023-05-10 14:47:39,143][0m Trial 43 finished with value: 0.03071384935010459 and parameters: {'rank': 1, 'alpha': 453.4580027901268, 'regParam': 0.07685896205562705}. Best is trial 35 with value: 0.1371739926525655.[0m
[32m[I 2023-05-10 15:35:30,030][0m Trial 44 finished with value: 0.13296560561570792 and parameters: {'rank': 48, 'alpha': 503.00581166072936, 'regParam': 0.02464430608585112}. Best is tr

23/05/10 19:36:46 ERROR TaskSchedulerImpl: Lost executor 0 on 10.32.34.127: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/05/10 19:36:46 WARN TaskSetManager: Lost task 6.0 in stage 6773.0 (TID 49404) (10.32.34.127 executor 0): ExecutorLostFailure (executor 0 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/05/10 19:36:46 WARN TaskSetManager: Lost task 9.0 in stage 6773.0 (TID 49407) (10.32.34.127 executor 0): ExecutorLostFailure (executor 0 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/05/10 19:36:46 WARN TaskSetManager: Lost task 0.0 in stage 6773.0 (TID 49398) (10.32.34.127 executor 0): ExecutorLostFailure (execu

	at org.apache.spark.MapOutputTrackerWorker.getMapSizesByExecutorId(MapOutputTracker.scala:1252)
	at org.apache.spark.shuffle.sort.SortShuffleManager.getReader(SortShuffleManager.scala:140)
	at org.apache.spark.shuffle.ShuffleManager.getReader(ShuffleManager.scala:63)
	at org.apache.spark.shuffle.ShuffleManager.getReader$(ShuffleManager.scala:57)
	at org.apache.spark.shuffle.sort.SortShuffleManager.getReader(SortShuffleManager.scala:73)
	at org.apache.spark.rdd.ShuffledRDD.compute(ShuffledRDD.scala:106)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.$anonfun$getOrCompute$1(RDD.scala:378)
	at org.apache.spark.storage.BlockManager.$anonfun$doPutIterator$1(BlockManager.scala:1508)
	at org.apache.spark.storage.BlockManager.org$apache$spark$storag

[Stage 6848:>                                                     (0 + 10) / 10]

## Train Using the Best Params 

In [None]:
# Re-open the persisted study so its best parameters can be read without re-running the search cell.
# NOTE(review): if optuna.db holds no 'als_search' study this creates an empty one instead of failing,
# and the later study.best_params lookup would then have no trial to report — confirm the db is present.
study = optuna.create_study(
    study_name='als_search',
    direction='maximize', 
    sampler=optuna.samplers.TPESampler(seed=69), 
    storage='sqlite:///optuna.db', 
    load_if_exists=True
)

In [None]:
# Final model: same fixed settings as the search (maxIter=10, seed=69, implicit prefs, non-negative),
# with rank / alpha / regParam taken from the best trial of the study.
als = ALS(
    maxIter=10, 
    alpha=study.best_params['alpha'],
    regParam=study.best_params['regParam'],
    implicitPrefs=True,
    nonnegative = True, 
    rank=study.best_params['rank'], 
    seed=69, 
    userCol='user_id',
    itemCol='universal_id',
    ratingCol='prop_listens',
    coldStartStrategy='drop'
) 

# Fit on the filtered training matrix
model = als.fit(utility_mat_train)

# Top-100 recommendations for every distinct user in each split
predictions_train =  model.recommendForUserSubset(utility_mat_train.select('user_id').distinct(), 100)
predictions_val =  model.recommendForUserSubset(utility_mat_val.select('user_id').distinct(), 100)
predictions_test =  model.recommendForUserSubset(utility_mat_test.select('user_id').distinct(), 100)

# calc_ndcg=True makes each call return a (MAP, NDCG) pair; %time reports how long each evaluation takes.
%time map_train, ndcg_train = calc_performance_metrics_als(predictions_train, utility_mat_train, calc_ndcg=True)
%time map_val, ndcg_val = calc_performance_metrics_als(predictions_val, utility_mat_val, calc_ndcg=True)
%time map_test, ndcg_test = calc_performance_metrics_als(predictions_test, utility_mat_test, calc_ndcg=True)

In [None]:
# Hyperparameters of the best trial recorded in the study
study.best_params

In [None]:
# (MAP, NDCG) of the final model on the training split
map_train, ndcg_train

In [None]:
# (MAP, NDCG) of the final model on the validation split
map_val, ndcg_val

In [None]:
# (MAP, NDCG) of the final model on the test split
map_test, ndcg_test