In [1]:
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.ml.feature import StringIndexer
import pyspark.sql.functions as func
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.mllib.evaluation import RankingMetrics
from pyspark import SparkContext
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np
import sys
import getpass

In [2]:
# Create (or reuse) the Spark session for this notebook with 16G of driver and
# executor memory, and keep a handle on its SparkContext.
spark = (
    SparkSession.builder
    .appName('sampler')
    .config("spark.driver.memory", "16G")
    .config('spark.executor.memory', '16G')
    .getOrCreate()
)
sc = spark.sparkContext

In [3]:
# Read the three split files and expose each one as a temp view of the same name.
trainSample = spark.read.parquet('train_sample1.parquet')
trainSample.createOrReplaceTempView('trainSample')

valSample = spark.read.parquet('val_sample1.parquet')
valSample.createOrReplaceTempView('valSample')

testSample = spark.read.parquet('test_sample1.parquet')
testSample.createOrReplaceTempView('testSample')

In [4]:
# Index the `user_id` column into `user_id_numer`. With handleInvalid="keep",
# values not seen while fitting are kept at transform time instead of raising.
indexer_obj_1 = StringIndexer(
    inputCol="user_id",
    outputCol="user_id_numer",
    handleInvalid="keep",
)
# The indexer is fitted on the training split only and then applied to it.
indexer_model_1 = indexer_obj_1.fit(trainSample)
indexer_df_1 = indexer_model_1.transform(trainSample)

In [5]:
# Index the `track_id` column into `track_id_numer`, again keeping values that
# were not seen while fitting.
indexer_obj_2 = StringIndexer(
    inputCol="track_id",
    outputCol="track_id_numer",
    handleInvalid="keep",
)
# Fitted on (and applied to) the training frame that already carries user indices.
indexer_model_2 = indexer_obj_2.fit(indexer_df_1)
indexer_df_2 = indexer_model_2.transform(indexer_df_1)

In [6]:
# The raw id columns are no longer needed once their numeric indices exist.
train_df = indexer_df_2.drop('user_id', 'track_id')

In [7]:
# Spread the training data over 2000 partitions before fitting.
train_df = train_df.repartition(numPartitions=2000)

In [8]:
# Apply the user indexer fitted on the training split to the validation split.
val_df_1 = indexer_model_1.transform(dataset=valSample)

In [9]:
# Apply the track indexer fitted on the training split to the validation split.
val_df_2 = indexer_model_2.transform(dataset=val_df_1)

In [10]:
# Drop the raw id columns from the indexed validation split and spread it over
# 5000 partitions.
val_df = val_df_2.drop('user_id', 'track_id').repartition(5000)

In [11]:
# Index the test split with the indexers fitted on the training split
# (users first, then tracks).
test_df_1 = indexer_model_1.transform(testSample)
test_df_2 = indexer_model_2.transform(test_df_1)

# Keep only the numeric ids and spread the result over 5000 partitions.
test_df = test_df_2.drop('user_id', 'track_id').repartition(5000)

In [12]:
# Implicit-feedback ALS on the indexed ids, using `count` as the rating column.
# Rows whose prediction would be NaN are dropped (coldStartStrategy="drop").
# No seed is set on the estimator here.
als = ALS(
    userCol="user_id_numer",
    itemCol="track_id_numer",
    ratingCol="count",
    implicitPrefs=True,
    rank=20,
    regParam=0.1,
    coldStartStrategy="drop",
)

In [13]:
# NOTE(review): this message is printed *before* fitting starts, so it does not
# indicate that training has finished; the "fitted" message below does.
print("model trained")
# Fit the ALS estimator on the indexed, repartitioned training data.
best_model = als.fit(train_df)
print("fitted")

model trained
fitted


In [None]:
# Score the rows of the test split with the fitted model.
predictions = best_model.transform(dataset=test_df)

In [None]:
# Peek at the first prediction row.
predictions.first()

In [None]:
# Fetch the first two prediction rows for inspection.
predictions.take(2)

In [None]:
# Highest predicted score first.
predictions = predictions.sort('prediction', ascending=False)

In [None]:
# Build a two-column DataFrame (User, Predictions) from positional fields 2 and 3
# of every prediction row.
# NOTE(review): the positional indices depend on the column order of
# `predictions`, which is not visible here — confirm that p[2] / p[3] are the
# intended columns (presumably the indexed user and track ids).
pred_df = predictions.rdd.map(lambda p: Row(User=p[2],Predictions=p[3])).toDF()

In [None]:
# Fetch the first two (User, Predictions) rows for inspection.
pred_df.take(2)

In [None]:
# One row per User holding the list of that user's `Predictions` values.
# NOTE(review): collect_list does not guarantee element order after the
# group-by shuffle, so the earlier sort by prediction may not be preserved.
pp = pred_df.groupBy("User").agg(func.collect_list("Predictions"))

In [None]:
# Display the first rows of the per-user lists.
pp.show()

In [None]:
# Sort the test split by `count` (ascending, the orderBy default).
test_true = test_df.orderBy('count')
# Build a (User, Predictions) DataFrame from positional fields 2 and 3 of every
# test row; rows are NOT grouped per user here.
# NOTE(review): the positional indices depend on test_df's column order, which is
# not visible here — confirm p[2] / p[3]. The `Predictions` name is reused for
# what is presumably the ground-truth item id.
test_true_df = test_true.rdd.map(lambda p: Row(User=p[2],Predictions=p[3])).toDF()

In [None]:
# Join the per-user lists with the test rows on `User` and emit
# (row[1], row[2]) pairs for RankingMetrics.
# NOTE(review): `test_true_df` is not aggregated per user, so row[2] is presumably
# a single value rather than a list, while RankingMetrics expects
# (predicted list, ground-truth list) pairs — confirm before relying on this.
rankingsRDD = (pp.join(test_true_df, 'User').rdd.map(lambda row: (row[1], row[2])))

In [None]:
# Wrap the (predicted, actual) pairs in a RankingMetrics evaluator.
metrics = RankingMetrics(rankingsRDD)

In [None]:
# Mean average precision over the pairs given to RankingMetrics.
metrics.meanAveragePrecision

In [14]:
# Distinct (indexed) users that occur in the test split; recommendations are
# requested only for them.
users = test_df.select(als.getUserCol()).distinct()

# Top-5 recommendations per user. Each row carries a `recommendations` array of
# (track_id_numer, rating) structs.
test_preds = best_model.recommendForUserSubset(users,5)

# One row per (user, recommended track). Kept so that later cells which may use
# this name keep working; it is no longer used to build `test_preds_flatten`.
test_preds_explode = test_preds.select(test_preds.user_id_numer,func.explode(test_preds.recommendations.track_id_numer))

# Per-user list of recommended track ids, in the order the model returned them.
# The ids are taken straight from the `recommendations` array instead of
# exploding and re-collecting with groupby + collect_list: collect_list does not
# guarantee element order after the shuffle, and ranking metrics depend on that
# order. The column keeps the name `col` and users with an empty list are
# filtered out, matching what explode + collect_list produced.
test_preds_flatten = (
    test_preds
    .select(
        test_preds.user_id_numer,
        test_preds.recommendations.track_id_numer.alias("col"),
    )
    .where(func.size("col") > 0)
)

In [15]:
# Display the first 20 rows (the show() default) of the per-user recommendation lists.
test_preds_flatten.show()

+-------------+--------------------+
|user_id_numer|                 col|
+-------------+--------------------+
|       323090|[153, 8, 103, 36,...|
|       314281|[3, 7, 29, 141, 117]|
|       227152|   [0, 6, 14, 10, 2]|
|       133153|  [0, 6, 22, 38, 11]|
|        35694|  [0, 10, 11, 6, 13]|
|        92644| [4, 80, 28, 34, 44]|
|       165914| [26, 11, 2, 22, 13]|
|       136625|  [23, 9, 8, 4, 151]|
|       225755|[5, 13, 18, 11, 130]|
|       287645|   [1, 3, 6, 8, 126]|
|       208696|[104, 14, 12, 24, 0]|
|       275698|   [25, 6, 7, 21, 4]|
|       167242|[15, 76, 117, 96,...|
|       118605|[25, 28, 103, 88,...|
|       258065|[975, 34, 51, 0, 11]|
|        23136|  [11, 32, 2, 0, 42]|
|       157866| [4, 5, 207, 18, 71]|
|       274716|[4, 25, 115, 182,...|
|       229338|  [5, 0, 6, 107, 75]|
|       245018|  [3, 5, 2, 217, 16]|
+-------------+--------------------+
only showing top 20 rows



In [17]:
# Ground truth: for every user in the test split, the list of track indices that
# appear with that user, spread over 5000 partitions.
test_true_flatten = (
    test_df
    .groupBy('user_id_numer')
    .agg(func.collect_list('track_id_numer').alias("track_id_numer"))
    .repartition(5000)
)

In [18]:
# Pair each user's recommended list with that user's ground-truth list.
# After the join on `user_id_numer`, field 1 comes from test_preds_flatten and
# field 2 from test_true_flatten.
rankingsRDD = (
    test_preds_flatten
    .join(test_true_flatten, 'user_id_numer')
    .rdd
    .map(lambda rec: (rec[1], rec[2]))
)

In [19]:
# Evaluate the (recommended list, ground-truth list) pairs with RankingMetrics.
metrics = RankingMetrics(rankingsRDD)

print("Ranking Metrics called")
    
# Mean average precision of the top-5 recommendations over the test users.
MAP = metrics.meanAveragePrecision
print(MAP)

Ranking Metrics called
0.0023275145469659184
