In [1]:
import os
import time 
import datetime
from itertools import product 

import numpy as np
import pandas as pd
import pprint

from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
from pyspark.sql.functions import (
    max, avg, sum, count, countDistinct,
    percentile_approx, col, asc, desc, collect_list,
    lit, rand, when, to_date, collect_set, explode
)
from pyspark.ml.evaluation import RankingEvaluator, RegressionEvaluator
from pyspark.mllib.evaluation import RankingMetrics
from pyspark.ml.tuning import ParamGridBuilder 
from pyspark.ml.recommendation import ALS 

from scipy.stats import loguniform, randint

# Get (or create) the session against the standalone master at spark://cs420:14946.
spark = SparkSession.builder.master('spark://cs420:14946').getOrCreate()
# A threshold of -1 turns automatic broadcast joins off.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
# Adaptive query execution is disabled; NOTE(review): presumably so the fixed
# shuffle partition count below is used as-is -- confirm.
spark.conf.set("spark.sql.adaptive.enabled", False)
# 16 cores times 8 cpus = 128 partitions * 2 = 256 partitions
spark.conf.set("spark.sql.shuffle.partitions", 256)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/03 14:26:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
def gen_utiltiy_matrix(interactions, tracks, min_listens: int = 500):
    """Build the per-user listen-proportion table used as the ALS utility matrix.

    Listens are summed per (user_id, universal_id) after left-joining
    ``interactions`` to ``tracks`` on ``recording_msid``. Each count is then
    divided by the user's total listens to give ``prop_listens``.

    Side effect: (re)registers the temp views ``interactions`` and ``tracks``
    on the module-level ``spark`` session.

    Parameters
    ----------
    interactions : pyspark.sql.DataFrame
        Listening events; the query reads ``user_id``, ``recording_msid`` and
        ``num_listens`` (presumably all from this frame -- verify schema).
    tracks : pyspark.sql.DataFrame
        Track lookup joined on ``recording_msid``; presumably the source of
        ``universal_id`` -- verify schema.
    min_listens : int, default 500
        Minimum total listens a user needs for ``use_for_fit`` to be True.

    Returns
    -------
    pyspark.sql.DataFrame
        Columns ``user_id``, ``universal_id``, ``prop_listens``,
        ``use_for_fit``, ordered by ``user_id`` ascending and
        ``prop_listens`` descending.
    """
    interactions.createOrReplaceTempView('interactions')
    tracks.createOrReplaceTempView('tracks')

    # LEFT JOIN keeps interactions whose recording has no row in `tracks`;
    # those end up grouped under a NULL universal_id.
    listens_per_user_track = spark.sql(
        """
        SELECT user_id,universal_id,sum(num_listens) as num_listens
        FROM interactions
        LEFT JOIN tracks
        ON tracks.recording_msid=interactions.recording_msid
        GROUP BY user_id,universal_id
        """
    )

    # Total listens per user, used both for normalisation and the fit flag.
    listens_per_user = listens_per_user_track.groupBy('user_id').agg(
        F.sum('num_listens').alias('total_listens')
    )

    # when/otherwise (rather than the bare comparison) so a NULL total maps
    # to False instead of NULL.
    listens_per_user = listens_per_user.withColumn(
        'use_for_fit',
        F.when(F.col('total_listens') >= min_listens, True).otherwise(False)
    )

    normed_listens_per_user_track = listens_per_user_track.join(
        listens_per_user, how='left', on='user_id'
    ).withColumn(
        'prop_listens',
        F.col('num_listens') / F.col('total_listens')
    ).select(
        ['user_id', 'universal_id', 'prop_listens', 'use_for_fit']
    ).orderBy(
        F.col('user_id').asc(),
        F.col('prop_listens').desc()
    )

    return normed_listens_per_user_track

def calc_performance_metrics_als(predicted, actual, calc_ndcg: bool = False, k: int = 100):
    """Score ALS recommendations against observed listens with ranking metrics.

    Parameters
    ----------
    predicted : pyspark.sql.DataFrame
        Rows with ``user_id`` and a ``recommendations`` array whose elements
        expose ``universal_id`` and ``rating`` (the shape the caller gets from
        ``recommendForUserSubset``).
    actual : pyspark.sql.DataFrame
        Rows with ``user_id`` and ``universal_id``; every item listed for a
        user is treated as relevant.
    calc_ndcg : bool, default False
        When True, NDCG@k is computed in addition to MAP@k.
    k : int, default 100
        Cut-off passed to both ranking evaluators.

    Returns
    -------
    float or tuple[float, float]
        MAP@k, or ``(MAP@k, NDCG@k)`` when ``calc_ndcg`` is True. Only users
        present in both inputs are scored (inner join).
    """
    # Ground truth: one array of relevant item ids per user.
    actual_compressed = actual.groupBy('user_id').agg(
        collect_list(col('universal_id').astype('double')).alias('universal_id')
    )

    flat_predictions = predicted.withColumn(
        "recommendations", explode(col("recommendations"))
    ).select("user_id", "recommendations.universal_id", "recommendations.rating")

    # collect_list gives no ordering guarantee after a shuffle, and MAP/NDCG
    # are order-sensitive. Collect (rating, id) structs and sort them
    # descending so the array is explicitly ranked best-first.
    predicted_compressed = flat_predictions.groupBy("user_id").agg(
        F.sort_array(
            F.collect_list(F.struct("rating", "universal_id")),
            asc=False
        ).alias("ranked")
    ).select(
        "user_id",
        F.col("ranked.universal_id").astype('array<double>').alias("predicted_universal_id")
    )

    results = actual_compressed.join(
        predicted_compressed,
        how='inner',
        on='user_id'
    )

    def _evaluate(metric_name):
        # One evaluator per metric; both read the same prediction/label columns.
        return RankingEvaluator(
            predictionCol='predicted_universal_id',
            labelCol='universal_id',
            metricName=metric_name,
            k=k
        ).evaluate(results)

    map_at_k = _evaluate('meanAveragePrecisionAtK')

    if calc_ndcg:
        return (map_at_k, _evaluate('ndcgAtK'))

    return map_at_k

In [3]:
# Train / validation interaction splits, read from parquet paths relative to
# the working directory.
interactions_train = spark.read.parquet('interactions_split_train.parquet')
interactions_val = spark.read.parquet('interactions_split_val.parquet')
# Test interactions are not loaded in this notebook; the line is kept for reference.
# interactions_test = spark.read.parquet("/scratch/work/courses/DSGA1004-2021/listenbrainz/interactions_test.parquet")

# Track table for the training data; passed as the `tracks` argument of
# gen_utiltiy_matrix for both the train and validation splits.
tracks_train = spark.read.parquet('tracks_train.parquet')
# tracks_test = spark.read.parquet('tracks_test.parquet')

                                                                                

In [4]:
# Training matrix: keep only the rows whose user is flagged use_for_fit.
utility_mat_train = gen_utiltiy_matrix(
    interactions_train, tracks_train
).filter(col('use_for_fit'))

# Validation matrix: all users kept; it uses the training track table as well.
utility_mat_val = gen_utiltiy_matrix(interactions_val, tracks_train)
# utility_mat_test = gen_utiltiy_matrix(interactions_test, tracks_test)

In [5]:
# Handle different parameters
SEED = 69
TRIALS = 100
MAXITER = 15

# Utilize random search to find optimal hyperparameters
# NOTE: alpha is not tuned
# These bounds must be defined here: the loop below reads them, and leaving
# them commented out raises NameError on a fresh kernel.
rank_min = 1
rank_max = 50

reg_param_min = 1e-5
reg_param_max = 1e5

# Seeded generator so the sampled (regParam, rank) sequence is reproducible.
rng = np.random.default_rng(SEED)

# Loop-invariant: the set of validation users to produce recommendations for.
val_users = utility_mat_val.select('user_id').distinct()

training_results = []
start = time.perf_counter()

for t in range(TRIALS):
    # regParam is drawn log-uniformly over [reg_param_min, reg_param_max].
    regParam = float(loguniform(reg_param_min, reg_param_max).rvs(random_state=rng))
    # scipy's randint excludes the upper bound, so +1 makes rank_max reachable.
    rank = int(randint(rank_min, rank_max + 1).rvs(random_state=rng))

    als = ALS(
        maxIter=MAXITER,
        implicitPrefs=True,
        nonnegative=True,
        regParam=regParam,
        rank=rank,
        seed=SEED,
        userCol='user_id',
        itemCol='universal_id',
        ratingCol='prop_listens',
        coldStartStrategy='drop'
    )

    # Calculate MAP
    model = als.fit(utility_mat_train)

    predictions_val = model.recommendForUserSubset(val_users, 100)
    map_val = calc_performance_metrics_als(predictions_val, utility_mat_val)

    # Cumulative wall-clock time since the search started (not per trial).
    elapsed = datetime.timedelta(seconds=time.perf_counter() - start)

    training_results.append({
        "maxIter": MAXITER,
        "seed": SEED,
        "elapsed": str(elapsed),
        "regParam": regParam,
        "rank": rank,
        "map_val": map_val,
        "trial": t + 1,
    })

    pprint.pprint(training_results[-1], width=1)
    # Rewritten after every trial so partial results survive an interrupted run.
    pd.DataFrame(training_results).to_csv('als_results.csv')

                                                                                

{'elapsed': '0:36:48.286592',
 'map_val': 0.12367018235407264,
 'maxIter': 15,
 'rank': 36,
 'regParam': 0.00025427866837477255,
 'seed': 69,
 'trial': 1}




23/05/03 16:02:39 ERROR TaskSchedulerImpl: Lost executor 5 on 10.32.35.1: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/05/03 16:02:39 WARN TaskSetManager: Lost task 457.0 in stage 1825.0 (TID 15847) (10.32.35.1 executor 5): ExecutorLostFailure (executor 5 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/05/03 16:02:39 WARN TaskSetManager: Lost task 370.0 in stage 1825.0 (TID 15760) (10.32.35.1 executor 5): ExecutorLostFailure (executor 5 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
23/05/03 16:02:39 WARN TaskSetManager: Lost task 456.0 in stage 1825.0 (TID 15846) (10.32.35.1 executor 5): ExecutorLostFailure (executo



23/05/03 16:02:44 WARN TaskSetManager: Lost task 434.0 in stage 1825.0 (TID 15824) (10.32.34.167 executor 4): FetchFailed(null, shuffleId=66, mapIndex=-1, mapId=-1, reduceId=4, message=
org.apache.spark.shuffle.MetadataFetchFailedException: Missing an output location for shuffle 66 partition 4
	at org.apache.spark.MapOutputTracker$.validateStatus(MapOutputTracker.scala:1701)
	at org.apache.spark.MapOutputTracker$.$anonfun$convertMapStatuses$10(MapOutputTracker.scala:1648)
	at org.apache.spark.MapOutputTracker$.$anonfun$convertMapStatuses$10$adapted(MapOutputTracker.scala:1647)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at org.apache.spark.MapOutputTracker$.convertMapStatuses(MapOutputTracker.scala:1647)
	at org.apache.spark.MapOutputTrackerWorker.getMapSizesByExecutorIdImpl(MapOutputTracker.scala:1290)
	at org.apache.spark.MapOutputTrackerWorker.

[Stage 1774:(13 + 3) / 16][Stage 1781:(13 + 2) / 15][Stage 1782:(15 + 1) / 16]0]

23/05/03 16:02:46 WARN TaskSetManager: Lost task 451.1 in stage 1825.0 (TID 15891) (10.32.35.1 executor 8): FetchFailed(null, shuffleId=66, mapIndex=-1, mapId=-1, reduceId=1, message=
org.apache.spark.shuffle.MetadataFetchFailedException: Missing an output location for shuffle 66 partition 1
	at org.apache.spark.MapOutputTracker$.validateStatus(MapOutputTracker.scala:1701)
	at org.apache.spark.MapOutputTracker$.$anonfun$convertMapStatuses$10(MapOutputTracker.scala:1648)
	at org.apache.spark.MapOutputTracker$.$anonfun$convertMapStatuses$10$adapted(MapOutputTracker.scala:1647)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at org.apache.spark.MapOutputTracker$.convertMapStatuses(MapOutputTracker.scala:1647)
	at org.apache.spark.MapOutputTrackerWorker.getMapSizesByExecutorIdImpl(MapOutputTracker.scala:1290)
	at org.apache.spark.MapOutputTrackerWorker.ge

ERROR:root:KeyboardInterrupt while sending command.:=>      (370 + -146) / 2560]2560]0]
Traceback (most recent call last):
  File "/ext3/pyspark/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/ext3/pyspark/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/ext3/pyspark/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt

KeyboardInterrupt

