In [1]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.ml.feature import StringIndexer

from pyspark.sql.functions import col, explode
import pandas as pd

In [2]:
import pyspark
from pyspark.sql import SparkSession
# Obtain the notebook's SparkSession: getOrCreate() reuses an already-active session
# if there is one and otherwise starts a new one with default settings.
spark = SparkSession.builder.getOrCreate()

In [3]:
# Read train_sample1.parquet into a DataFrame and register it as the temp view
# `ratings` so it can also be queried through spark.sql.
ratings = spark.read.format('parquet').load('train_sample1.parquet')
ratings.createOrReplaceTempView('ratings')

In [4]:
# Index the string identifiers: ALS is given the *_numer columns as its
# user and item columns further down.
indexer = StringIndexer(inputCol="user_id", outputCol="user_id_numer")
indexer_2 = StringIndexer(inputCol="track_id", outputCol="track_id_numer")

# Apply the user indexer first, then fit/apply the track indexer on that result.
indexed_prelim = indexer.fit(ratings).transform(ratings)
indexed = indexer_2.fit(indexed_prelim).transform(indexed_prelim)

# The original string ids are no longer needed once the numeric indices exist.
df_dropped = indexed.drop('user_id', 'track_id')
df_dropped.show()

+-----+-----------------+-------------+--------------+
|count|__index_level_0__|user_id_numer|track_id_numer|
+-----+-----------------+-------------+--------------+
|    2|              107|     222596.0|        3947.0|
|    1|              156|     238051.0|        4071.0|
|    1|              342|     267078.0|        2582.0|
|    1|              406|      41261.0|        5217.0|
|    1|              443|      41261.0|        2556.0|
|    2|              574|       4240.0|         731.0|
|    1|              610|       4240.0|        8026.0|
|    5|              630|       4240.0|        3878.0|
|    6|              956|       4240.0|         573.0|
|    1|             1019|       4240.0|        9901.0|
|    1|             1081|      12196.0|       82088.0|
|    1|             1108|      12196.0|       67786.0|
|    1|             1213|      12196.0|       67307.0|
|    1|             1278|      12196.0|        5306.0|
|    2|             1327|     254074.0|       16847.0|
|    1|   

In [5]:
# Hold out 20% of the rows for evaluation. A fixed seed makes the split repeatable
# for the same input, so the RMSE reported below is comparable between runs.
(training, test) = df_dropped.randomSplit([0.8, 0.2], seed=42)

In [6]:
#training.show()

In [6]:
# Build the recommendation model using ALS on the training data.
# coldStartStrategy="drop" discards predictions for users/tracks that were not seen
# during training, so the evaluation metric does not come out as NaN.
# seed pins the random initialisation of the factor matrices so reruns are reproducible.
# NOTE(review): "count" is fitted as an explicit rating; if it is a play count,
# implicitPrefs=True may be the more appropriate formulation — confirm.
als = ALS(maxIter=5, regParam=0.01, userCol="user_id_numer", itemCol="track_id_numer", ratingCol="count",
          coldStartStrategy="drop", seed=42)
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="count",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)

print("Root-mean-square error = " + str(rmse))

# Generate top 10 song recommendations for each user

### ADJUST FOR 500 SONGS 

userRecs = model.recommendForAllUsers(10)
# Generate top 10 user recommendations for each song
songRecs = model.recommendForAllItems(10)

Root-mean-square error = 6.9732870424197895


In [8]:
#### THIS WILL BE REMOVED ####

# Take three distinct users and three distinct songs to spot-check the model.
users = (
    df_dropped
    .select(als.getUserCol())
    .distinct()
    .limit(3)
)
songs = (
    df_dropped
    .select(als.getItemCol())
    .distinct()
    .limit(3)
)

# Top 10 song recommendations for each of the selected users
userSubsetRecs = model.recommendForUserSubset(users, 10)
# Top 10 user recommendations for each of the selected songs
songSubSetRecs = model.recommendForItemSubset(songs, 10)

In [9]:
# Print the per-user recommendation rows (one array of recommendations per user).
userSubsetRecs.show()

+-------------+--------------------+
|user_id_numer|     recommendations|
+-------------+--------------------+
|       262469|[{28588, 57.25669...|
|        45953|[{50608, 246.8295...|
|       278371|[{54627, 279.1140...|
+-------------+--------------------+



In [10]:
# Collect the recommendations for the selected users to the driver as a pandas DataFrame.
userSubsetRecs_col = userSubsetRecs.toPandas()

In [11]:
# Sanity check: confirm the conversion produced a pandas DataFrame.
type(userSubsetRecs_col)

pandas.core.frame.DataFrame

In [12]:
# Recommendation list of the first user (row label 0) in the pandas frame.
noop = userSubsetRecs_col.loc[0, 'recommendations']


In [13]:
# Split each recommendation entry into two parallel lists:
# position 0 of an entry goes to song_ids, position 1 to play_counts.
song_ids = [entry[0] for entry in noop]
play_counts = [entry[1] for entry in noop]



In [14]:
# Display the track indices recommended to the first user.
song_ids

[28588, 50608, 62245, 40636, 22981, 19750, 3595, 1757, 60450, 2930]

In [18]:
# Inspect the list built above. The original referenced `my_list`, which is not
# defined anywhere in this notebook and fails on a fresh kernel; `song_ids` is the
# list produced by the preceding cells.
print(type(song_ids))

# Show only the recommendations column. show() prints the table and returns None,
# so its result is deliberately not assigned: binding it to `test` would replace
# the held-out test split with None.
userSubsetRecs.select('recommendations').show()

+--------------------+
|     recommendations|
+--------------------+
|[{28588, 57.25669...|
|[{50608, 246.8295...|
|[{54627, 279.1140...|
+--------------------+

None


In [27]:
# One row per (user, recommended track): take the track_id_numer field of every
# recommendation entry and explode that array. The exploded column is left unnamed
# here, so it shows up as `col`.
userSubsetRecs2 = userSubsetRecs.select(
    userSubsetRecs["user_id_numer"],
    explode(userSubsetRecs["recommendations"]["track_id_numer"]),
)

In [36]:
# Print the exploded (user, recommended track) rows; show() displays the first 20.
userSubsetRecs2.show()

+-------------+-----+
|user_id_numer|  col|
+-------------+-----+
|       262469|28588|
|       262469|50608|
|       262469|62245|
|       262469|40636|
|       262469|22981|
|       262469|19750|
|       262469| 3595|
|       262469| 1757|
|       262469|60450|
|       262469| 2930|
|        45953|50608|
|        45953|48884|
|        45953|16999|
|        45953|12756|
|        45953| 5314|
|        45953|15646|
|        45953|67294|
|        45953|31449|
|        45953|67491|
|        45953|46876|
+-------------+-----+
only showing top 20 rows



In [106]:
import pyspark.sql.functions as func
#userSubsetRecs2.toPandas().set_index('user_id_numer').T.to_dict('list')
#usesrSubsetRecs2 = userSubsetRecs2.toPandas()
#userSubsetRecs2.groupby('user_id_numer')['col'].apply(lambda g: g.values.tolist()).to_dict()

#userSubsetRecs2.groupBy("user_id_numer").agg(first("age", ignoreNulls = true) as "age".orderBy("id")

test = userSubsetRecs2.groupby('user_id_numer').agg(func.collect_list('col').alias("col"))

In [107]:
df_dict = test.collect()
df_dict = [{r['user_id_numer']: r['col']} for r in df_dict]
dict((key,d[key]) for d in df_dict for key in d)

{262469: [28588, 50608, 62245, 40636, 22981, 19750, 3595, 1757, 60450, 2930],
 45953: [50608, 48884, 16999, 12756, 5314, 15646, 67294, 31449, 67491, 46876],
 278371: [54627, 26092, 49471, 15646, 13067, 22906, 10400, 5314, 2360, 95422]}

Evaluation Metric

In [None]:
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics
# (prediction, observed count) pairs in the form RegressionMetrics consumes.
# `predictions` comes from model.transform(test) and already carries both columns,
# so no join is required. The previous version joined against an undefined
# `ratingsTuple` and called .map on a DataFrame, which only exists on its .rdd.
# Fields are read by key because `row.count` resolves to the tuple method, not the column.
scoreAndLabels = predictions.select("prediction", "count").rdd.map(
    lambda row: (float(row["prediction"]), float(row["count"]))
)
