# Notes:
If you want to make big changes, like tuning parameters, please make a new notebook and rename to 'baseline_version_xx' where 'xx' is larger 1 than present version.

# Import model

In [None]:
# ! pip install pyspark

In [12]:
# import pyspark
# pyspark.__version__

In [6]:
import pyspark.sql.functions as func
from pyspark.sql import Window
from pyspark.sql import SparkSession

from pyspark import StorageLevel
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.feature import StringIndexer
# from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.evaluation import RankingMetrics

from operator import itemgetter

import getpass

# Define the Parameters

In [2]:
PATH = '/scratch/work/courses/DSGA1004-2021/MSD'
TOP=10
MODEL_PATH = 'saved_model'.format(getpass.getuser())
PREC_AT = 5

spark = SparkSession.builder.appName('quq').getOrCreate()

# Data Processing

In [3]:
# Load the Data
print('Begin loading data')

train = spark.read.parquet(PATH+'/cf_train.parquet')
val = spark.read.parquet(PATH+'/cf_validation.parquet')
test = spark.read.parquet(PATH+'/cf_test.parquet')
print('Successfully loaded the data')
print(test.show(1))

# Get Unique user and track
unique_user = train.select('user_id').distinct()
unique_track = ((train.select('track_id').distinct()) 
                .union(val.select('track_id').distinct())
                .union(test.select('track_id').distinct())).distinct()
print('Successfully get unique user/track')

# Encode string to index
user_to_index = unique_user.rdd.map(itemgetter(0)).zipWithIndex().toDF(['user_id', 'user_index'])
track_to_index = unique_track.rdd.map(itemgetter(0)).zipWithIndex().toDF(['track_id', 'track_index'])
print('Successfully get the encoding function')
train = train.join(user_to_index,['user_id'], how='left')
train = train.join(track_to_index,['track_id'], how='left')

val = val.join(user_to_index,['user_id'], how='left')
val = val.join(track_to_index,['track_id'], how='left')

test = test.join(user_to_index,['user_id'], how='left')
test = test.join(track_to_index,['track_id'], how='left')     
print('Successfully encoding the user and track')
print(test.show(1))

'''              
# Too slow to use StringIndexer() when model.fit()
# Encode string to index
indexer_user = StringIndexer(inputCol="user_id", outputCol="user_index")
tran_user = indexer_user.fit(unique_user)
indexer_track = StringIndexer(inputCol="track_id", outputCol="track_index")
tran_track = indexer_track.fit(unique_track)
print('Successfully get the encoding function')

train = tran_user.transform(train)
val = tran_user.transform(val)
test = tran_user.transform(test)
print(test.show(1))

train = tran_track.transform(train)
val = tran_track.transform(val)
test = tran_track.transform(test)   
print(test.show(1))

train = train.withColumn('user_index', train['user_index'].cast('int'))
val = val.withColumn('user_index', val['user_index'].cast('int'))
test = test.withColumn('user_index', test['user_index'].cast('int'))

train = train.withColumn('track_index', train['track_index'].cast('int'))
val = val.withColumn('track_index', val['track_index'].cast('int'))
test = test.withColumn('track_index', test['track_index'].cast('int'))
print('Successfully encoding the user and track')
print(test.show(1))
''' 

# Get Unique user index
unique_user_index_val, unique_user_index_test = val.select('user_index').distinct(), test.select('user_index').distinct()
# unique_track_index_val, unique_track_index_test = val.select('track_index').distinct(), test.select('track_index').distinct()
print('Successfully getting the unique user/track index of val/test')

Begin loading data
Successfully loaded the data
+--------------------+-----+------------------+-----------------+
|             user_id|count|          track_id|__index_level_0__|
+--------------------+-----+------------------+-----------------+
|00007a02388c208ea...|    1|TRXYDST128F92EC024|                0|
+--------------------+-----+------------------+-----------------+
only showing top 1 row

None
Successfully get unique user/track
Successfully get the encoding function
Successfully encoding the user and track
+------------------+--------------------+-----+-----------------+----------+-----------+
|          track_id|             user_id|count|__index_level_0__|user_index|track_index|
+------------------+--------------------+-----+-----------------+----------+-----------+
|TRAADQX128F422B4CF|7d2b99addaa0a1e2b...|    1|           667178|    198059|        343|
+------------------+--------------------+-----+-----------------+----------+-----------+
only showing top 1 row

None
Succ

# Train and save the model -- Sample

In [8]:
try:
    model = ALSModel.load(MODEL_PATH)
    print('Successfully load trained model')
except:
    als = ALS(userCol='user_index', itemCol='track_index', ratingCol='count', 
              implicitPrefs=True, coldStartStrategy="drop", 
              rank=10, alpha=0.1, regParam = 0.01)
    model = als.fit(train)
    model.write().overwrite().save(MODEL_PATH)
    print('Successfully train the model')

Successfully train the model


# Evaluate the model -- Sample

In [11]:
userRecs = model.recommendForUserSubset(unique_user_index_val, TOP)
print('Successfully get the recommendations')
# trackRecsl = model.recommendForItemSubset(unique_track_index_val, TOP)

pred_tracks = userRecs.rdd.map(lambda row: (row['user_index'], 
                                            [track_pred.track_index for track_pred in row['recommendations']]))
'''
# Use for loop is slowly
pred_tracks = []
for user, tracks in userRecs.collect():
    predict_tracks = [i[0] for i in tracks]
    pred_tracks.append((user, predict_tracks))
pred_tracks_rdd = spark.sparkContext.parallelize(pred_tracks)
'''

print('Successfully transform the recommendations')
print(pred_tracks_rdd.take(1))

w = Window.partitionBy('user_index').orderBy('count')
true_tracks = val.withColumn(
    'tracks', func.collect_list('track_index').over(w))\
    .groupBy('user_index')\
    .agg(func.max('tracks').alias('tracks'))
true_tracks_rdd = true_tracks.rdd.map(tuple)
print('Successfully transform the true values')
print(true_tracks_rdd.take(1))

pred_and_true_tracks = pred_tracks_rdd.join(true_tracks_rdd)
pred_and_true_tracks = pred_and_true_tracks.map(lambda tup: tup[1])
print('Successfully put recommendations and true value together')
print(pred_and_true_tracks.take(1))

metrics = RankingMetrics(pred_and_true_tracks)
print(metrics.precisionAt(PREC_AT))

Successfully get the recommendations
Successfully transform the recommendations
[(942750, [253051, 84666, 164141, 212289, 49856, 158493, 301157, 343609, 80730, 341667])]
Successfully transform the true values
[(16530, [26615, 164181, 258741, 268265, 279858, 145044, 193045, 245111, 270113, 299298, 326364, 349302])]
Successfully put recommendations and true value together
[([299298, 335900, 349302, 326364, 44097, 47952, 245111, 306974, 193045, 270114], [32498, 241350, 111950, 362720, 30578, 189388, 328315, 17120, 47952, 34331, 345537, 380358, 3837])]
0.058799999999999984


# To Do: Parameter Tuning
Begin your code here