In [1]:
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.ml.feature import StringIndexer

from pyspark.sql.functions import col, explode
import pandas as pd
import pyspark.sql.functions as func
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics
from pyspark import SparkContext, SQLContext
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Import the requisite items
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
! java -version

java version "1.8.0_231"
Java(TM) SE Runtime Environment (build 1.8.0_231-b11)
Java HotSpot(TM) 64-Bit Server VM (build 25.231-b11, mixed mode)


In [3]:
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.config("spark.driver.memory", "16G").getOrCreate()

sc = spark._sc

In [4]:
trainSample = spark.read.option("inferSchema", True).parquet('train_sample1.parquet')
testSample = spark.read.option("inferSchema", True).parquet('test_sample1.parquet')
trainSample.createOrReplaceTempView('trainSample')
testSample.createOrReplaceTempView('testSample')

In [5]:
valSample = spark.read.option("inferSchema", True).parquet('val_sample1.parquet')
valSample.createOrReplaceTempView('valSample')

In [6]:
indexer_obj_1 = StringIndexer(inputCol="user_id", outputCol="user_id_numer").setHandleInvalid("keep")
indexer_model_1 = indexer_obj_1.fit(trainSample)
indexer_df_1 = indexer_model_1.transform(trainSample)

indexer_obj_2 = StringIndexer(inputCol="track_id", outputCol="track_id_numer").setHandleInvalid("keep")
indexer_model_2 = indexer_obj_2.fit(indexer_df_1)
indexer_df_2 = indexer_model_2.transform(indexer_df_1)

train_df = indexer_df_2.drop('user_id')
train_df = train_df.drop('track_id')

In [7]:
val_df_1 = indexer_model_1.transform(valSample)
val_df_2 = indexer_model_2.transform(val_df_1)

val_df = val_df_2.drop('user_id')
val_df = val_df.drop('track_id')

In [8]:
train_df.show()

+-----+-----------------+-------------+--------------+
|count|__index_level_0__|user_id_numer|track_id_numer|
+-----+-----------------+-------------+--------------+
|    2|              107|     222596.0|        3947.0|
|    1|              156|     238051.0|        4071.0|
|    1|              342|     267078.0|        2582.0|
|    1|              406|      41261.0|        5217.0|
|    1|              443|      41261.0|        2556.0|
|    2|              574|       4240.0|         731.0|
|    1|              610|       4240.0|        8026.0|
|    5|              630|       4240.0|        3878.0|
|    6|              956|       4240.0|         573.0|
|    1|             1019|       4240.0|        9901.0|
|    1|             1081|      12196.0|       82088.0|
|    1|             1108|      12196.0|       67786.0|
|    1|             1213|      12196.0|       67307.0|
|    1|             1278|      12196.0|        5306.0|
|    2|             1327|     254074.0|       16847.0|
|    1|   

In [9]:
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=5, regParam=0.01, userCol="user_id_numer", itemCol="track_id_numer", ratingCol= "count",
          coldStartStrategy="drop", implicitPrefs = True)

In [10]:
# Add hyperparameters and their respective values to param_grid
#param_grid = ParamGridBuilder().addGrid(als.rank, [10, 50, 100, 150]).addGrid(als.regParam, [.01, .05, .1, .15]).build()

# Define evaluator as RMSE and print length of evaluator
#evaluator = RegressionEvaluator(metricName="rmse", labelCol="count", predictionCol="prediction") 
#print ("Num models to be tested: ", len(param_grid))

In [11]:
# Build cross validation using CrossValidator
#cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5)

# Confirm cv was built
#print(cv)

In [12]:
#Fit cross validator to the 'train' dataset
#model = cv.fit(train_df)

#Extract best model from the cv model above
#best_model = model.bestModel

In [13]:
model = als.fit(train_df)

In [14]:
val_transformed = model.transform(val_df)

In [15]:
val_transformed.show()

+-----+-----------------+-------------+--------------+--------------+
|count|__index_level_0__|user_id_numer|track_id_numer|    prediction|
+-----+-----------------+-------------+--------------+--------------+
|    3|            54181|     193744.0|        5300.0|  1.5363752E-7|
|    2|            41307|     171297.0|       37768.0|-2.8077768E-14|
|    2|           104577|     280417.0|       15382.0| -7.5688654E-8|
|    2|           130462|     326013.0|       14874.0| -1.3598846E-9|
|    1|            11298|      40677.0|       19998.0|   4.256871E-8|
|    1|           111471|     292829.0|       15983.0|  3.1739255E-8|
|    1|            65462|     213052.0|       12727.0|  -5.541946E-6|
|    1|            91268|     257834.0|        5251.0|   5.473293E-9|
|    1|            59503|     202200.0|       34735.0| 1.7099427E-10|
|    1|            82500|     242760.0|          34.0|   0.058092523|
|    5|            94847|     263791.0|       26258.0|-5.6779747E-23|
|    1|           12

In [16]:
# Print best_model
#print(type(best_model))

# Complete the code below to extract the ALS model parameters
#print("**Best Model**")

# # Print "Rank"
#print("  Rank:", best_model._java_obj.parent().getRank())

# Print "MaxIter"
#print("  MaxIter:", best_model._java_obj.parent().getMaxIter())

# Print "RegParam"
#print("  RegParam:", best_model._java_obj.parent().getRegParam())

In [17]:
# for each user, sort track ids by count
val_true = val_df.orderBy('count')

# flatten to group by user id and get list of true track ids
val_true_flatten = val_true.groupby('user_id_numer').agg(func.collect_list('track_id_numer').alias("track_id_numer"))

# add to dictionary
val_true_dict = val_true_flatten.collect()
val_true_dict = [{r['user_id_numer']: r['track_id_numer']} for r in val_true_dict]
val_true_dict = dict((key,d[key]) for d in val_true_dict for key in d)

In [18]:
val_true_dict

{144345.0: [2576.0],
 257834.0: [5251.0, 1748.0],
 303341.0: [106998.0],
 279263.0: [5665.0],
 259239.0: [6919.0],
 231190.0: [106998.0],
 293951.0: [2662.0, 85544.0],
 159194.0: [4265.0],
 220558.0: [106998.0],
 321044.0: [18028.0],
 289159.0: [508.0],
 120784.0: [64175.0],
 314898.0: [21924.0, 28231.0],
 216365.0: [23.0, 69.0, 18.0],
 123501.0: [53918.0],
 223776.0: [71.0],
 147473.0: [7539.0, 25750.0],
 75511.0: [37293.0],
 304923.0: [22201.0],
 261625.0: [7574.0],
 295799.0: [40.0],
 157069.0: [106998.0],
 230320.0: [16226.0],
 292319.0: [9474.0],
 129140.0: [106998.0],
 111524.0: [9113.0],
 202200.0: [34735.0],
 265663.0: [975.0],
 290401.0: [29.0, 1.0, 9636.0],
 103678.0: [85936.0],
 213052.0: [12727.0],
 112524.0: [4.0],
 324257.0: [171.0],
 326013.0: [14874.0],
 309446.0: [63374.0],
 133623.0: [1013.0],
 270912.0: [22723.0],
 252645.0: [9436.0],
 76493.0: [5219.0],
 105880.0: [7848.0],
 305804.0: [11.0],
 193744.0: [5300.0],
 168872.0: [4099.0],
 320425.0: [5439.0],
 198823.0: 

In [19]:
#https://stackoverflow.com/questions/59390481/how-to-implement-ranking-metrics-of-pyspark
#https://stackoverflow.com/questions/67345691/apply-stringindexer-to-several-columns-in-multiple-dataset

In [20]:
### model transform before recommend for UserSubset
### recommend for distinct users in validation
### implicit prefs = true ???
users = val_transformed.select(als.getUserCol()).distinct()

In [21]:
val_preds = model.recommendForUserSubset(users, 10)
val_preds_explode = val_preds.select(val_preds.user_id_numer,explode(val_preds.recommendations.track_id_numer))

val_preds_flatten = val_preds_explode.groupby('user_id_numer').agg(func.collect_list('col').alias("col"))

val_preds_dict = val_preds_flatten.collect()
val_preds_dict = [{r['user_id_numer']: r['col']} for r in val_preds_dict]
val_preds_dict = dict((key,d[key]) for d in val_preds_dict for key in d)

In [22]:
dictcon= list(map(list, val_preds_dict.items()))
dfpreds = spark.createDataFrame(dictcon, ["user_id_numer", "tracks"])

dictcon2= list(map(list, val_true_dict.items()))
dftrue = spark.createDataFrame(dictcon2, ["user_id_numer", "tracks"])

rankingsRDD = (dfpreds.join(dftrue, 'user_id_numer')
               .rdd
               .map(lambda row: (row[1], row[2])))

In [28]:
metrics = RankingMetrics(rankingsRDD)

In [23]:
### OLD ### 

# labels_list = []

# for user in val_preds_dict.keys():
#     labels_list.append((val_preds_dict[user], [int(i) for i in val_true_dict[user]]))

# labels = sc.parallelize(labels_list)
# metrics = RankingMetrics(labels)
# #print(metrics.meanAveragePrecision)


In [29]:
metrics.meanAveragePrecision

0.00606909430438842