In [1]:
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.ml.feature import StringIndexer

from pyspark.sql.functions import col, explode
import pandas as pd
import pyspark.sql.functions as func
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics
from pyspark import SparkContext, SQLContext
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Import the requisite items
from pyspark.ml.evaluation import RegressionEvaluator

In [71]:
! java -version

java version "1.8.0_291"
Java(TM) SE Runtime Environment (build 1.8.0_291-b10)
Java HotSpot(TM) 64-Bit Server VM (build 25.291-b10, mixed mode)


In [15]:
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.config("spark.driver.memory", "16G").getOrCreate()

sc = spark._sc

In [57]:
trainSample = spark.read.option("inferSchema", True).parquet('train_sample1.parquet')
testSample = spark.read.option("inferSchema", True).parquet('test_sample1.parquet')
trainSample.createOrReplaceTempView('trainSample')
testSample.createOrReplaceTempView('testSample')

In [58]:
valSample = spark.read.option("inferSchema", True).parquet('val_sample1.parquet')
valSample.createOrReplaceTempView('valSample')

In [59]:
indexer_obj_1 = StringIndexer(inputCol="user_id", outputCol="user_id_numer").setHandleInvalid("keep")
indexer_model_1 = indexer_obj_1.fit(trainSample)
indexer_df_1 = indexer_model_1.transform(trainSample)

indexer_obj_2 = StringIndexer(inputCol="track_id", outputCol="track_id_numer").setHandleInvalid("keep")
indexer_model_2 = indexer_obj_2.fit(indexer_df_1)
indexer_df_2 = indexer_model_2.transform(indexer_df_1)

train_df = indexer_df_2.drop('user_id')
train_df = train_df.drop('track_id')

In [60]:
val_df_1 = indexer_model_1.transform(valSample)
val_df_2 = indexer_model_2.transform(val_df_1)

val_df = val_df_2.drop('user_id')
val_df = val_df.drop('track_id')

In [61]:
train_df.show()

+-----+-----------------+-------------+--------------+
|count|__index_level_0__|user_id_numer|track_id_numer|
+-----+-----------------+-------------+--------------+
|    2|              107|     222596.0|        3947.0|
|    1|              156|     238051.0|        4071.0|
|    1|              342|     267078.0|        2582.0|
|    1|              406|      41261.0|        5217.0|
|    1|              443|      41261.0|        2556.0|
|    2|              574|       4240.0|         731.0|
|    1|              610|       4240.0|        8026.0|
|    5|              630|       4240.0|        3878.0|
|    6|              956|       4240.0|         573.0|
|    1|             1019|       4240.0|        9901.0|
|    1|             1081|      12196.0|       82088.0|
|    1|             1108|      12196.0|       67786.0|
|    1|             1213|      12196.0|       67307.0|
|    1|             1278|      12196.0|        5306.0|
|    2|             1327|     254074.0|       16847.0|
|    1|   

In [40]:
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=5, regParam=0.01, userCol="user_id_numer", itemCol="track_id_numer", ratingCol= "count",
          coldStartStrategy="drop", implicitPrefs = True)

In [None]:
# Add hyperparameters and their respective values to param_grid
#param_grid = ParamGridBuilder().addGrid(als.rank, [10, 50, 100, 150]).addGrid(als.regParam, [.01, .05, .1, .15]).build()

# Define evaluator as RMSE and print length of evaluator
#evaluator = RegressionEvaluator(metricName="rmse", labelCol="count", predictionCol="prediction") 
#print ("Num models to be tested: ", len(param_grid))

In [9]:
# Build cross validation using CrossValidator
#cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5)

# Confirm cv was built
#print(cv)

CrossValidator_1ac4c96c4e54


In [23]:
#Fit cross validator to the 'train' dataset
#model = cv.fit(train_df)

#Extract best model from the cv model above
#best_model = model.bestModel

In [41]:
model = als.fit(train_df)

In [42]:
val_transformed = model.transform(val_df)

In [53]:
val_transformed.show()

+-----+-----------------+-------------+--------------+--------------+
|count|__index_level_0__|user_id_numer|track_id_numer|    prediction|
+-----+-----------------+-------------+--------------+--------------+
|    3|            54181|     193744.0|        5300.0|   3.138392E-8|
|    2|            41307|     171297.0|       37768.0| -3.387087E-14|
|    2|           104577|     280417.0|       15382.0| -6.4594503E-9|
|    2|           130462|     326013.0|       14874.0| -1.6975665E-8|
|    1|            11298|      40677.0|       19998.0|   1.123447E-8|
|    1|           111471|     292829.0|       15983.0|  -3.743013E-9|
|    1|            65462|     213052.0|       12727.0|  -6.703247E-6|
|    1|            91268|     257834.0|        5251.0|   6.656493E-9|
|    1|            59503|     202200.0|       34735.0|   -2.25641E-9|
|    1|            82500|     242760.0|          34.0|   0.049588334|
|    5|            94847|     263791.0|       26258.0|-1.1735475E-22|
|    1|           12

In [None]:
# Print best_model
#print(type(best_model))

# Complete the code below to extract the ALS model parameters
#print("**Best Model**")

# # Print "Rank"
#print("  Rank:", best_model._java_obj.parent().getRank())

# Print "MaxIter"
#print("  MaxIter:", best_model._java_obj.parent().getMaxIter())

# Print "RegParam"
#print("  RegParam:", best_model._java_obj.parent().getRegParam())

In [43]:
# for each user, sort track ids by count
val_true = val_df.orderBy('count')

# flatten to group by user id and get list of true track ids
val_true_flatten = val_true.groupby('user_id_numer').agg(func.collect_list('track_id_numer').alias("track_id_numer"))

# add to dictionary
val_true_dict = val_true_flatten.collect()
val_true_dict = [{r['user_id_numer']: r['track_id_numer']} for r in val_true_dict]
val_true_dict = dict((key,d[key]) for d in val_true_dict for key in d)

Exception ignored in: <function JavaModelWrapper.__del__ at 0x7fba312fe160>
Traceback (most recent call last):
  File "/Users/harlanhutton/opt/anaconda3/lib/python3.8/site-packages/pyspark/mllib/common.py", line 137, in __del__
    self._sc._gateway.detach(self._java_model)
AttributeError: 'RankingMetrics' object has no attribute '_sc'


In [44]:
val_true_dict

{144345.0: [2576.0],
 257834.0: [5251.0, 1748.0],
 303341.0: [106998.0],
 279263.0: [5665.0],
 259239.0: [6919.0],
 231190.0: [106998.0],
 293951.0: [2662.0, 85544.0],
 159194.0: [4265.0],
 220558.0: [106998.0],
 321044.0: [18028.0],
 289159.0: [508.0],
 120784.0: [64175.0],
 314898.0: [21924.0, 28231.0],
 216365.0: [23.0, 69.0, 18.0],
 123501.0: [53918.0],
 223776.0: [71.0],
 147473.0: [7539.0, 25750.0],
 75511.0: [37293.0],
 304923.0: [22201.0],
 261625.0: [7574.0],
 295799.0: [40.0],
 157069.0: [106998.0],
 230320.0: [16226.0],
 292319.0: [9474.0],
 129140.0: [106998.0],
 111524.0: [9113.0],
 202200.0: [34735.0],
 265663.0: [975.0],
 290401.0: [29.0, 1.0, 9636.0],
 103678.0: [85936.0],
 213052.0: [12727.0],
 112524.0: [4.0],
 324257.0: [171.0],
 326013.0: [14874.0],
 309446.0: [63374.0],
 133623.0: [1013.0],
 270912.0: [22723.0],
 252645.0: [9436.0],
 76493.0: [5219.0],
 105880.0: [7848.0],
 305804.0: [11.0],
 193744.0: [5300.0],
 168872.0: [4099.0],
 320425.0: [5439.0],
 198823.0: 

In [62]:
#https://stackoverflow.com/questions/59390481/how-to-implement-ranking-metrics-of-pyspark
#https://stackoverflow.com/questions/67345691/apply-stringindexer-to-several-columns-in-multiple-dataset

In [45]:
### model transform before recommend for UserSubset
### recommend for distinct users in validation
### implicit prefs = true ???
users = val_transformed.select(als.getUserCol()).distinct()

In [46]:
val_preds = model.recommendForUserSubset(users, 10)
val_preds_explode = val_preds.select(val_preds.user_id_numer,explode(val_preds.recommendations.track_id_numer))

val_preds_flatten = val_preds_explode.groupby('user_id_numer').agg(func.collect_list('col').alias("col"))

val_preds_dict = val_preds_flatten.collect()
val_preds_dict = [{r['user_id_numer']: r['col']} for r in val_preds_dict]
val_preds_dict = dict((key,d[key]) for d in val_preds_dict for key in d)

In [64]:
val_preds_dict

{76493: [6, 21, 226, 3, 115, 37, 582, 704, 198, 25],
 326772: [122, 26, 96, 5, 103, 83, 827, 9, 6, 14],
 266502: [25, 1, 29, 13, 32, 11, 107, 5, 96, 9],
 111524: [12, 31, 11, 5, 40, 21, 182, 207, 42, 122],
 40677: [2, 151, 58, 11, 88, 34, 4, 83, 18, 207],
 104610: [5, 10, 67, 201, 153, 4, 29, 64, 364, 79],
 180124: [65, 26, 8, 51, 117, 23, 25, 7, 19, 13],
 252645: [30, 39, 22, 20, 3, 80, 16, 8, 75, 153],
 103476: [7, 26, 25, 117, 16, 3, 23, 14, 702, 49],
 216224: [1, 7, 29, 49, 13, 2, 65, 32, 3, 25],
 217733: [145, 10, 0, 16, 13, 107, 99, 30, 39, 130],
 323823: [0, 4, 40, 18, 151, 80, 76, 351, 8603, 22981],
 108774: [5, 26, 122, 96, 13, 25, 6, 51, 9, 11],
 232176: [145, 0, 10, 16, 13, 107, 130, 99, 56, 39],
 171332: [6, 13, 26, 103, 96, 0, 122, 114, 117, 24],
 39790: [16, 30, 7, 39, 117, 219, 56, 145, 10, 18],
 304923: [4, 6, 76, 65, 51, 34, 98, 114, 26, 45],
 289159: [8, 2, 88, 22, 34, 0, 1, 9, 52, 40],
 279263: [54, 83, 6, 22, 36, 4, 58, 2, 103, 782],
 133760: [30, 7, 13, 117, 24, 16

In [67]:
labels_list = []

for user in val_preds_dict.keys():
    labels_list.append((val_preds_dict[user], [int(i) for i in val_true_dict[user]]))

labels = sc.parallelize(labels_list)
metrics = RankingMetrics(labels)
#print(metrics.meanAveragePrecision)

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 576.0 failed 1 times, most recent failure: Lost task 0.0 in stage 576.0 (TID 14092) (harlans-mbp executor driver): org.apache.spark.SparkException: 
Error from python worker:
  dyld: Library not loaded: /System/Library/Frameworks/CoreFoundation.framework/Versions/A/CoreFoundation
    Referenced from: /Library/Frameworks/Python.framework/Versions/3.6/Resources/Python.app/Contents/MacOS/Python
    Reason: image not found
PYTHONPATH was:
  /Users/harlanhutton/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip:/Users/harlanhutton/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/py4j-0.10.9-src.zip:/Users/harlanhutton/opt/anaconda3/lib/python3.8/site-packages/pyspark/jars/spark-core_2.12-3.1.1.jar
org.apache.spark.SparkException: EOFException occurred while reading the port number from pyspark.daemon's stdout
	at org.apache.spark.api.python.PythonWorkerFactory.startDaemon(PythonWorkerFactory.scala:217)
	at org.apache.spark.api.python.PythonWorkerFactory.createThroughDaemon(PythonWorkerFactory.scala:132)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:105)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:119)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:145)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2253)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2202)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2201)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2201)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1078)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1078)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1078)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2440)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2382)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2371)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2202)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2223)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2242)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:166)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: 
Error from python worker:
  dyld: Library not loaded: /System/Library/Frameworks/CoreFoundation.framework/Versions/A/CoreFoundation
    Referenced from: /Library/Frameworks/Python.framework/Versions/3.6/Resources/Python.app/Contents/MacOS/Python
    Reason: image not found
PYTHONPATH was:
  /Users/harlanhutton/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip:/Users/harlanhutton/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/py4j-0.10.9-src.zip:/Users/harlanhutton/opt/anaconda3/lib/python3.8/site-packages/pyspark/jars/spark-core_2.12-3.1.1.jar
org.apache.spark.SparkException: EOFException occurred while reading the port number from pyspark.daemon's stdout
	at org.apache.spark.api.python.PythonWorkerFactory.startDaemon(PythonWorkerFactory.scala:217)
	at org.apache.spark.api.python.PythonWorkerFactory.createThroughDaemon(PythonWorkerFactory.scala:132)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:105)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:119)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:145)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [66]:
labels.collect()

[([6, 21, 226, 3, 115, 37, 582, 704, 198, 25], [5219]),
 ([122, 26, 96, 5, 103, 83, 827, 9, 6, 14], [18967]),
 ([25, 1, 29, 13, 32, 11, 107, 5, 96, 9], [63]),
 ([12, 31, 11, 5, 40, 21, 182, 207, 42, 122], [9113]),
 ([2, 151, 58, 11, 88, 34, 4, 83, 18, 207], [19998]),
 ([5, 10, 67, 201, 153, 4, 29, 64, 364, 79], [56086]),
 ([65, 26, 8, 51, 117, 23, 25, 7, 19, 13], [370]),
 ([30, 39, 22, 20, 3, 80, 16, 8, 75, 153], [9436]),
 ([7, 26, 25, 117, 16, 3, 23, 14, 702, 49], [18799]),
 ([1, 7, 29, 49, 13, 2, 65, 32, 3, 25], [7370]),
 ([145, 10, 0, 16, 13, 107, 99, 30, 39, 130], [412]),
 ([0, 4, 40, 18, 151, 80, 76, 351, 8603, 22981], [75104]),
 ([5, 26, 122, 96, 13, 25, 6, 51, 9, 11], [26177]),
 ([145, 0, 10, 16, 13, 107, 130, 99, 56, 39], [435]),
 ([6, 13, 26, 103, 96, 0, 122, 114, 117, 24], [395]),
 ([16, 30, 7, 39, 117, 219, 56, 145, 10, 18], [43290]),
 ([4, 6, 76, 65, 51, 34, 98, 114, 26, 45], [22201]),
 ([8, 2, 88, 22, 34, 0, 1, 9, 52, 40], [508]),
 ([54, 83, 6, 22, 36, 4, 58, 2, 103, 782],