In [47]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.ml.feature import StringIndexer

from pyspark.sql.functions import col, explode
import pandas as pd

In [2]:
import pyspark
from pyspark.sql import SparkSession
# Obtain the SparkSession used by every later cell (getOrCreate reuses an
# existing session when one is already active instead of building a second one).
spark = SparkSession.builder.getOrCreate()

In [3]:
# Load the sampled training interactions from Parquet. Judging by the preview
# printed further down, the columns are user_id, count, track_id and a leftover
# pandas index column (__index_level_0__).
ratings = spark.read.parquet('train_sample1.parquet')
# Also register the DataFrame as the temp view `ratings` so it can be queried with Spark SQL.
ratings.createOrReplaceTempView('ratings')

In [4]:
# ALS is fed numeric id columns below, while the raw ids are strings, so each
# id column is encoded into a numeric index column first.

# Step 1: user_id -> user_id_numer
indexer = StringIndexer(
    inputCol="user_id",
    outputCol="user_id_numer",
)
indexed_prelim = indexer.fit(ratings).transform(ratings)

# Step 2: track_id -> track_id_numer, applied on top of the step-1 result
indexer_2 = StringIndexer(
    inputCol="track_id",
    outputCol="track_id_numer",
)
indexed = indexer_2.fit(indexed_prelim).transform(indexed_prelim)

# Preview the frame with both the original and the encoded id columns.
indexed.show()


+--------------------+-----+------------------+-----------------+-------------+--------------+
|             user_id|count|          track_id|__index_level_0__|user_id_numer|track_id_numer|
+--------------------+-----+------------------+-----------------+-------------+--------------+
|85c1f87fea955d09b...|    2|TRHEQEQ12903CDA12D|              107|     222596.0|        3947.0|
|969cc6fb74e076a68...|    1|TRTTSRS12903CA00FF|              156|     238051.0|        4071.0|
|b64cdd1a0bd907e5e...|    1|TRFMCVT128F92F7083|              342|     267078.0|        2582.0|
|17aa9f6dbdf753831...|    1|TRXTEJZ128F422BC35|              406|      41261.0|        5217.0|
|17aa9f6dbdf753831...|    1|TRCLBXD128F932472C|              443|      41261.0|        2556.0|
|5a905f000fc1ff3df...|    2|TRGAOLV128E0789D40|              574|       4240.0|         731.0|
|5a905f000fc1ff3df...|    1|TRUWESF128F1452600|              610|       4240.0|        8026.0|
|5a905f000fc1ff3df...|    5|TRALZBV128F4250E60|   

In [5]:
# The string ids are no longer needed once their numeric encodings exist;
# remove both columns in a single drop call and preview the result.
df_dropped = indexed.drop('user_id', 'track_id')
df_dropped.show()

+-----+-----------------+-------------+--------------+
|count|__index_level_0__|user_id_numer|track_id_numer|
+-----+-----------------+-------------+--------------+
|    2|              107|     222596.0|        3947.0|
|    1|              156|     238051.0|        4071.0|
|    1|              342|     267078.0|        2582.0|
|    1|              406|      41261.0|        5217.0|
|    1|              443|      41261.0|        2556.0|
|    2|              574|       4240.0|         731.0|
|    1|              610|       4240.0|        8026.0|
|    5|              630|       4240.0|        3878.0|
|    6|              956|       4240.0|         573.0|
|    1|             1019|       4240.0|        9901.0|
|    1|             1081|      12196.0|       82088.0|
|    1|             1108|      12196.0|       67786.0|
|    1|             1213|      12196.0|       67307.0|
|    1|             1278|      12196.0|        5306.0|
|    2|             1327|     254074.0|       16847.0|
|    1|   

In [6]:
# Random 80/20 split into training and held-out test rows. The weights are
# sampling probabilities, so the resulting sizes are only approximately 80/20.
# A fixed seed keeps the split (and therefore the reported RMSE) repeatable
# across kernel restarts.
# NOTE(review): repeatability also assumes df_dropped is read and partitioned
# the same way on every run - confirm for the cluster setup in use.
(training, test) = df_dropped.randomSplit([0.8, 0.2], seed=42)

In [7]:
#training.show()

In [8]:
### Drop the old columns at some point

In [9]:
# Build the recommendation model using ALS on the training data.
# Cold start strategy is set to 'drop' to ensure we don't get NaN evaluation metrics.
# A fixed seed makes the factor initialisation, and with it the RMSE and the
# recommendation lists below, repeatable from run to run.
# NOTE(review): `count` is a play count that is being fitted as an explicit
# rating here; consider whether implicitPrefs=True is the intended setup.
als = ALS(maxIter=5, regParam=0.01, userCol="user_id_numer", itemCol="track_id_numer", ratingCol="count",
          coldStartStrategy="drop", seed=42)
model = als.fit(training)

# Evaluate the model by computing the RMSE between the predicted and the
# observed `count` on the held-out test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="count",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)

print("Root-mean-square error = " + str(rmse))

# Generate top 10 song recommendations for each user

# TODO: adjust the number of recommendations per user to 500 songs

userRecs = model.recommendForAllUsers(10)
# Generate top 10 user recommendations for each song
songRecs = model.recommendForAllItems(10)

Root-mean-square error = 6.7769339557419075


In [11]:
#### THIS WILL BE REMOVED ####

# Scratch cell: recommendations for a small hand-picked subset, used by the
# inspection cells that follow.

# Generate top 10 song recommendations for a specified set of users
# (at most 3 distinct values taken from the ALS user column)
users = df_dropped.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)
# Generate top 10 user recommendations for a specified set of songs
# (at most 3 distinct values taken from the ALS item column)
songs = df_dropped.select(als.getItemCol()).distinct().limit(3)
songSubSetRecs = model.recommendForItemSubset(songs, 10)

In [12]:
# Preview the per-user recommendation lists (one row per user, with a
# `recommendations` column holding that user's recommendation entries).
userSubsetRecs.show()

+-------------+--------------------+
|user_id_numer|     recommendations|
+-------------+--------------------+
|       262469|[{100229, 51.4293...|
|        45953|[{9831, 20.80738}...|
+-------------+--------------------+



In [53]:
# Pull the (small) subset result to the driver as a pandas DataFrame so the
# nested recommendation entries can be inspected with plain Python.
userSubsetRecs_col = userSubsetRecs.toPandas()

In [54]:
# Sanity check: the converted object should be a pandas DataFrame.
type(userSubsetRecs_col)

pandas.core.frame.DataFrame

In [83]:
# Recommendation entries stored at index 0 of the pandas frame. Each entry is
# read below as (track index, predicted rating) via positions [0] and [1].
noop = userSubsetRecs_col['recommendations'][0]


In [99]:
# Unpack the recommendation entries into two parallel lists.
# Position 0 of an entry is the numeric track index (track_id_numer) and
# position 1 is the model's predicted rating for that track.
song_ids = [entry[0] for entry in noop]
play_counts = [entry[1] for entry in noop]



In [100]:
# Show the recommended track indices. These are the numeric track_id_numer
# values, not the original string track_id values.
song_ids

[100229, 3595, 36349, 48884, 14546, 67491, 58736, 36489, 22981, 92294]

In [14]:
# Sanity check: the un-converted recommendations are still a Spark DataFrame.
type(userSubsetRecs)

pyspark.sql.dataframe.DataFrame

In [None]:
#songSubSetRecs.show()

In [None]:
#songRecs.show()

In [72]:
import pyspark.sql.functions as f

# Gather every row's `recommendations` array into one list on the driver:
# collect_list aggregates the whole column into a single row, first() fetches
# that row, and [0] extracts its only field.
my_list = (
    userSubsetRecs
    .select(f.collect_list('recommendations'))
    .first()[0]
)

In [74]:
# Dump the collected recommendations: one inner list per user, each element a
# Row with fields track_id_numer and rating.
print(my_list)

[[Row(track_id_numer=100229, rating=51.42936325073242), Row(track_id_numer=3595, rating=49.366703033447266), Row(track_id_numer=36349, rating=30.645719528198242), Row(track_id_numer=48884, rating=29.265501022338867), Row(track_id_numer=14546, rating=29.14731788635254), Row(track_id_numer=67491, rating=26.684131622314453), Row(track_id_numer=58736, rating=26.053049087524414), Row(track_id_numer=36489, rating=26.046415328979492), Row(track_id_numer=22981, rating=25.817214965820312), Row(track_id_numer=92294, rating=25.41570281982422)], [Row(track_id_numer=9831, rating=20.80738067626953), Row(track_id_numer=15646, rating=14.361817359924316), Row(track_id_numer=54627, rating=13.229878425598145), Row(track_id_numer=3595, rating=13.107110023498535), Row(track_id_numer=4576, rating=11.423210144042969), Row(track_id_numer=41339, rating=9.573042869567871), Row(track_id_numer=34034, rating=9.477357864379883), Row(track_id_numer=14267, rating=9.032364845275879), Row(track_id_numer=9244, rating=8.

In [73]:
# nrecommendations = userSubsetRecs\
#     .withColumn("rec_exp", explode("recommendations"))\
#     .select('user_id_numer', col(0), col(1))
# nrecommendations.limit(10).show()

In [16]:
# split_col = pyspark.sql.functions.split(userSubsetRecs['recommendations'], '{')
# userSubsetRecs = userSubsetRecs.withColumn('rec1', split_col.getItem(0))
# userSubsetRecs = userSubsetRecs.withColumn('rec2', split_col.getItem(1))
# userSubsetRecs = userSubsetRecs.withColumn('rec3', split_col.getItem(2))

In [28]:
# userSubsetRecs.select("recommendations").rdd.flatMap(lambda x: x).collect()