In [1]:
import os
import time 
import datetime
from itertools import product 

import numpy as np
import pandas as pd
import pprint

from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
from pyspark.sql.functions import (
    max, avg, sum, count, countDistinct,
    percentile_approx, col, asc, desc, collect_list,
    lit, rand, when, to_date, collect_set, explode
)
from pyspark.ml.evaluation import RankingEvaluator, RegressionEvaluator
from pyspark.ml.recommendation import ALS 

from scipy.stats import loguniform, randint

spark = SparkSession.builder.master('spark://cm001:61086').getOrCreate()
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
spark.conf.set("spark.sql.adaptive.enabled", False)
# 16 cores times 8 cpus = 128 partitions * 2 = 384 partitions
spark.conf.set("spark.sql.shuffle.partitions", 256)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/05 21:06:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
def gen_utiltiy_matrix(interactions, tracks):
    interactions.createOrReplaceTempView('interactions')
    tracks.createOrReplaceTempView('tracks')

    listens_per_user_track = spark.sql(
        """
        SELECT user_id,universal_id,sum(num_listens) as num_listens
        FROM interactions
        LEFT JOIN tracks
        ON tracks.recording_msid=interactions.recording_msid
        GROUP BY user_id,universal_id
        """
    )

    listens_per_user = listens_per_user_track.select(
        listens_per_user_track.user_id, listens_per_user_track.num_listens
    ).groupBy('user_id').agg(
        sum(listens_per_user_track.num_listens).alias('total_listens')
    )

    listens_per_user = listens_per_user.withColumn(
        'use_for_fit',
        when(
            listens_per_user.total_listens >= 500, True
        ).otherwise(
            False
        )
    )

    normed_listens_per_user_track = listens_per_user_track.join(listens_per_user, how='left', on='user_id')
    normed_listens_per_user_track = normed_listens_per_user_track.withColumn(
        "prop_listens",
        col("num_listens")/col("total_listens")
    ).select(
        ['user_id', 'universal_id', 'prop_listens', 'use_for_fit']
    ).orderBy(
        col('user_id').asc(),
        col('prop_listens').desc()
    )

    return normed_listens_per_user_track

def calc_performance_metrics_als(predicted, actual, calc_ndcg: bool = False):
    actual_compressed = actual.groupBy(
        'user_id'
    ).agg(
        collect_list(col('universal_id').astype('double')).alias('universal_id'),
        collect_list(col('prop_listens').astype('double')).alias('prop_listens')
    )

    predicted_compressed = predicted.withColumn(
        "recommendations", explode(col("recommendations"))
    ).select("user_id", "recommendations.universal_id", "recommendations.rating")

    predicted_compressed = predicted_compressed.withColumn(
        "rn", F.row_number().over(Window.partitionBy("user_id").orderBy(F.col("rating").desc()))
    ).groupBy("user_id").agg(F.collect_list(F.col("universal_id")).astype('array<double>').alias("predicted_universal_id"))

    results = actual_compressed.join(
        predicted_compressed,
        how='inner',
        on='user_id'
    )
    
    mapAtK = RankingEvaluator(
        predictionCol='predicted_universal_id',
        labelCol='universal_id',
        metricName='meanAveragePrecisionAtK',
        k=100
    )
    
    if calc_ndcg:
        ndcgAtK = RankingEvaluator(
            predictionCol='predicted_universal_id',
            labelCol='universal_id',
            metricName='ndcgAtK',
            k=100
        )
        return (mapAtK.evaluate(results), ndcgAtK.evaluate(results))
    
    return mapAtK.evaluate(results)

In [3]:
interactions_train = spark.read.parquet('interactions_split_train.parquet')
interactions_val = spark.read.parquet('interactions_split_val.parquet')
# interactions_test = spark.read.parquet("/scratch/work/courses/DSGA1004-2021/listenbrainz/interactions_test.parquet")

tracks_train = spark.read.parquet('tracks_train.parquet')
# tracks_test = spark.read.parquet('tracks_test.parquet')

                                                                                

In [4]:
utility_mat_train = gen_utiltiy_matrix(interactions_train, tracks_train)
utility_mat_val = gen_utiltiy_matrix(interactions_val, tracks_train)
# utility_mat_test = gen_utiltiy_matrix(interactions_test, tracks_test)
# utility_mat_train = spark.read.parquet('utility_mat_train_sample_nr.parquet')

utility_mat_train = utility_mat_train.filter(
    utility_mat_train.use_for_fit
)
# utility_mat_train.sample(withReplacement=False, fraction=0.1, seed=69).write.parquet('utility_mat_train_sample_nr.parquet')

In [None]:
# Handle different parameters 
SEED = 69
TRIALS = 100
MAXITER = 10

# Utilize random search to find optimal hyperparameters 
# NOTE: alpha is not tuned 

training_results = []
start = time.perf_counter()

for r in range(10, 100, 10):
    als = ALS(
        maxIter=MAXITER, 
        implicitPrefs=True,
        nonnegative = True, 
        rank=r, 
        seed=SEED, 
        userCol='user_id',
        itemCol='universal_id',
        ratingCol='prop_listens',
        coldStartStrategy='drop'
    ) 
    
    # Calculate MAP
    model = als.fit(utility_mat_train)
    
#     predictions_train =  model.recommendForUserSubset(utility_mat_train.select('user_id').distinct(), 100)
    predictions_val =  model.recommendForUserSubset(utility_mat_val.select('user_id').distinct(), 100)
    
#     map_train, ndcg_train = calc_performance_metrics_als(predictions_train, utility_mat_train)
    map_val = calc_performance_metrics_als(predictions_val, utility_mat_val)

    elapsed = datetime.timedelta(seconds=time.perf_counter() - start)
    start = time.perf_counter()
    
    training_results.append({
        "maxIter": MAXITER, 
        "seed": SEED,
        "elapsed": str(elapsed),
        "rank": r, 
        "map_val": map_val,
        "trial": r + 1,
#         "map_train": map_train,
#         "ndcg_train": ndcg_train,
#         "ndcg_val": ndcg_val,
    })
    
    pprint.pprint(training_results[-1], width=1)
    pd.DataFrame(training_results).to_csv('./results_runs/als_results_rank_step_10.csv')

                                                                                

{'elapsed': '0:12:58.035119',
 'map_val': 0.08993301627044326,
 'maxIter': 10,
 'rank': 10,
 'seed': 69,
 'trial': 11}


                                                                                

{'elapsed': '0:18:01.599043',
 'map_val': 0.10091204552261097,
 'maxIter': 10,
 'rank': 20,
 'seed': 69,
 'trial': 21}


                                                                                10]

{'elapsed': '0:26:24.011091',
 'map_val': 0.10810727895940926,
 'maxIter': 10,
 'rank': 30,
 'seed': 69,
 'trial': 31}


                                                                                

{'elapsed': '0:43:47.679752',
 'map_val': 0.11507368454919253,
 'maxIter': 10,
 'rank': 40,
 'seed': 69,
 'trial': 41}


                                                                                10]

{'elapsed': '0:55:03.763422',
 'map_val': 0.12051337855220745,
 'maxIter': 10,
 'rank': 50,
 'seed': 69,
 'trial': 51}




23/05/06 00:56:47 ERROR TaskSchedulerImpl: Lost executor 3 on 10.32.35.27: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/05/06 00:56:47 WARN TaskSetManager: Lost task 1565.0 in stage 3205.0 (TID 49599) (10.32.35.27 executor 3): ExecutorLostFailure (executor 3 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/05/06 00:56:47 WARN TaskSetManager: Lost task 1573.0 in stage 3205.0 (TID 49607) (10.32.35.27 executor 3): ExecutorLostFailure (executor 3 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/05/06 00:56:47 WARN TaskSetManager: Lost task 1582.0 in stage 3205.0 (TID 49616) (10.32.35.27 executor 3): ExecutorLostFailure (



23/05/06 00:56:52 WARN TaskSetManager: Lost task 1591.0 in stage 3205.0 (TID 49625) (10.32.35.53 executor 0): FetchFailed(null, shuffleId=224, mapIndex=-1, mapId=-1, reduceId=1, message=
org.apache.spark.shuffle.MetadataFetchFailedException: Missing an output location for shuffle 224 partition 1
	at org.apache.spark.MapOutputTracker$.validateStatus(MapOutputTracker.scala:1701)
	at org.apache.spark.MapOutputTracker$.$anonfun$convertMapStatuses$10(MapOutputTracker.scala:1648)
	at org.apache.spark.MapOutputTracker$.$anonfun$convertMapStatuses$10$adapted(MapOutputTracker.scala:1647)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at org.apache.spark.MapOutputTracker$.convertMapStatuses(MapOutputTracker.scala:1647)
	at org.apache.spark.MapOutputTrackerWorker.getMapSizesByExecutorIdImpl(MapOutputTracker.scala:1290)
	at org.apache.spark.MapOutputTrackerWorke



23/05/06 00:56:52 WARN TaskSetManager: Lost task 1488.0 in stage 3205.0 (TID 49522) (10.32.35.140 executor 1): FetchFailed(BlockManagerId(3, 10.32.35.27, 45425, None), shuffleId=252, mapIndex=0, mapId=46616, reduceId=148, message=
org.apache.spark.shuffle.FetchFailedException
	at org.apache.spark.errors.SparkCoreErrors$.fetchFailedError(SparkCoreErrors.scala:312)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:1166)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:904)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:85)
	at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:29)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.util.Comp

                                                                                2560]]]0]

{'elapsed': '2:45:25.576188',
 'map_val': 0.11966398607673849,
 'maxIter': 10,
 'rank': 60,
 'seed': 69,
 'trial': 61}


[Stage 3205:====>  (1467 + -563) / 2560][Stage 3745:==>      (778 + 128) / 2560] 1) / 128]

23/05/06 04:18:13 ERROR TaskSchedulerImpl: Lost executor 1 on 10.32.35.140: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/05/06 04:18:13 WARN TaskSetManager: Lost task 895.0 in stage 3745.0 (TID 60455) (10.32.35.140 executor 1): ExecutorLostFailure (executor 1 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/05/06 04:18:13 WARN TaskSetManager: Lost task 894.0 in stage 3745.0 (TID 60454) (10.32.35.140 executor 1): ExecutorLostFailure (executor 1 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/05/06 04:18:13 WARN TaskSetManager: Lost task 897.0 in stage 3745.0 (TID 60457) (10.32.35.140 executor 1): ExecutorLostFailure 

[Stage 3205:====>  (1467 + -563) / 2560][Stage 3745:==>     (778 + -227) / 2560]

23/05/06 04:18:18 WARN TaskSetManager: Lost task 748.0 in stage 3745.0 (TID 60308) (10.32.35.19 executor 4): FetchFailed(BlockManagerId(1, 10.32.35.140, 40547, None), shuffleId=294, mapIndex=0, mapId=58526, reduceId=74, message=
org.apache.spark.shuffle.FetchFailedException
	at org.apache.spark.errors.SparkCoreErrors$.fetchFailedError(SparkCoreErrors.scala:312)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:1166)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:904)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:85)
	at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:29)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.util.Comple

                                                                                0 + 33) / 256]]]]

{'elapsed': '3:34:01.375763',
 'map_val': 0.12451986650468397,
 'maxIter': 10,
 'rank': 70,
 'seed': 69,
 'trial': 71}


[Stage 3205:(1467 + -563) / 2560][Stage 3745:(778 + -339) / 2560][Stage 4279:(319 + 128) / 2560]

23/05/06 08:11:38 ERROR TaskSchedulerImpl: Lost executor 7 on 10.32.35.34: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/05/06 08:11:38 WARN TaskSetManager: Lost task 413.0 in stage 4279.0 (TID 71319) (10.32.35.34 executor 7): ExecutorLostFailure (executor 7 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/05/06 08:11:38 WARN TaskSetManager: Lost task 359.0 in stage 4279.0 (TID 71265) (10.32.35.34 executor 7): ExecutorLostFailure (executor 7 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/05/06 08:11:38 WARN TaskSetManager: Lost task 431.0 in stage 4279.0 (TID 71337) (10.32.35.34 executor 7): ExecutorLostFailure (exe

[Stage 3205:(1467 + -563) / 2560][Stage 3745:(778 + -339) / 2560][Stage 4279:(319 + -7) / 2560]]

23/05/06 08:11:43 WARN TaskSetManager: Lost task 300.0 in stage 4279.0 (TID 71206) (10.32.35.144 executor 2): FetchFailed(BlockManagerId(7, 10.32.35.34, 45009, None), shuffleId=336, mapIndex=0, mapId=69648, reduceId=30, message=
org.apache.spark.shuffle.FetchFailedException
	at org.apache.spark.errors.SparkCoreErrors$.fetchFailedError(SparkCoreErrors.scala:312)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:1166)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:904)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:85)
	at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:29)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.util.Comple

[Stage 3205:(1467 + -563) / 2560][Stage 3745:(778 + -339) / 2560][Stage 4247:(31 + 1) / 32] 2560]

23/05/06 08:11:44 WARN TaskSetManager: Lost task 359.1 in stage 4279.0 (TID 71367) (10.32.35.34 executor 10): FetchFailed(null, shuffleId=308, mapIndex=-1, mapId=-1, reduceId=9, message=
org.apache.spark.shuffle.MetadataFetchFailedException: Missing an output location for shuffle 308 partition 9
	at org.apache.spark.MapOutputTracker$.validateStatus(MapOutputTracker.scala:1701)
	at org.apache.spark.MapOutputTracker$.$anonfun$convertMapStatuses$10(MapOutputTracker.scala:1648)
	at org.apache.spark.MapOutputTracker$.$anonfun$convertMapStatuses$10$adapted(MapOutputTracker.scala:1647)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at org.apache.spark.MapOutputTracker$.convertMapStatuses(MapOutputTracker.scala:1647)
	at org.apache.spark.MapOutputTrackerWorker.getMapSizesByExecutorIdImpl(MapOutputTracker.scala:1290)
	at org.apache.spark.MapOutputTrackerWorke

[Stage 3205:(1467 + -563) / 2560][Stage 3745:(778 + -339) / 2560][Stage 4279:(81 + 120) / 2376]0]