In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import roc_curve, auc

data_dir = "../data/proccessed/dump.csv"
spark = SparkSession.builder \
    .appName("recommend-ML") \
    .config("spark.executor.memory", "12g") \
    .config("spark.driver.memory", "8g") \
    .getOrCreate()


df = spark.read.csv(data_dir, header=True, inferSchema=True)
cols = df.columns

df = df.withColumn("albumId", col("albumId").cast("integer"))
df = df.withColumn("rating", col("rating").cast("integer"))
df = df.withColumn("userId", col("userId").cast("integer"))

df.printSchema()
df.show(5)


your 131072x1 screen size is bogus. expect trouble
23/05/10 16:43:02 WARN Utils: Your hostname, BryanDesktop resolves to a loopback address: 127.0.1.1; using 172.30.8.97 instead (on interface eth0)
23/05/10 16:43:02 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/10 16:43:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

root
 |-- trackId: integer (nullable = true)
 |-- albumId: integer (nullable = true)
 |-- artistId: integer (nullable = true)
 |-- userId: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- genres: string (nullable = true)

+-------+-------+--------+------+------+--------------------+
|trackId|albumId|artistId|userId|rating|              genres|
+-------+-------+--------+------+------+--------------------+
| 204650| 177418|  131552|199810|    50|                  []|
|   9774|  79500|  158282|199810|    50|['242383', '207648']|
|   9774|  79500|  158282|199810|    50|['242383', '20764...|
|  26374| 153568|  158282|199810|    50| ['81520', '242383']|
| 271229| 293464|  279143|199811|    70| ['173655', '98154']|
+-------+-------+--------+------+------+--------------------+
only showing top 5 rows



In [2]:
from pyspark.ml.clustering import LDA
from pyspark.ml.feature import CountVectorizer
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType


def parse_genres(genres_str):
    return genres_str.strip('[]').replace(' ', '').split(',')


parse_genres_udf = udf(parse_genres, ArrayType(StringType()))

df = df.withColumn('genre_ids', parse_genres_udf(df['genres']))

cv = CountVectorizer(inputCol="genre_ids", outputCol="ohe_features")
cv_model = cv.fit(df)
df_ohe = cv_model.transform(df)


num_topics = 5  # Choose the number of topics based on the desired lower dimensionality
lda = LDA(k=num_topics, featuresCol="ohe_features")
lda_model = lda.fit(df_ohe)

# Get the genre score for each row
genre_scores = lda_model.transform(df_ohe)


23/05/10 16:44:33 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
                                                                                

In [3]:
print(genre_scores.printSchema())
print(genre_scores.show(100))

df = genre_scores


root
 |-- trackId: integer (nullable = true)
 |-- albumId: integer (nullable = true)
 |-- artistId: integer (nullable = true)
 |-- userId: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- genres: string (nullable = true)
 |-- genre_ids: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- ohe_features: vector (nullable = true)
 |-- topicDistribution: vector (nullable = true)

None
+-------+-------+--------+------+------+--------------------+--------------------+--------------------+--------------------+
|trackId|albumId|artistId|userId|rating|              genres|           genre_ids|        ohe_features|   topicDistribution|
+-------+-------+--------+------+------+--------------------+--------------------+--------------------+--------------------+
| 204650| 177418|  131552|199810|    50|                  []|                  []|     (202,[0],[1.0])|[0.09460871167530...|
|   9774|  79500|  158282|199810|    50|['242383', '207648']|['242383',

In [4]:
stages = []
numericCols = ['albumId', 'trackId', 'artistId']
assemblerInputs = numericCols + ['topicDistribution']
assembler = VectorAssembler(
    inputCols=assemblerInputs, outputCol="features", handleInvalid="skip")
stages += [assembler]


In [5]:
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(df)
df = pipelineModel.transform(df)
selectedCols = ['features'] + cols
df = df.select(selectedCols)
df.printSchema()


root
 |-- features: vector (nullable = true)
 |-- trackId: integer (nullable = true)
 |-- albumId: integer (nullable = true)
 |-- artistId: integer (nullable = true)
 |-- userId: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- genres: string (nullable = true)



In [6]:
pd.DataFrame(df.take(5), columns=df.columns).transpose()


Unnamed: 0,0,1,2,3,4
features,"[177418.0, 204650.0, 131552.0, 0.0946087116753...","[79500.0, 9774.0, 158282.0, 0.0628382951765337...","[79500.0, 9774.0, 158282.0, 0.0475864076789650...","[153568.0, 26374.0, 158282.0, 0.06282562035546...","[293464.0, 271229.0, 279143.0, 0.4027227286107..."
trackId,204650,9774,9774,26374,271229
albumId,177418,79500,79500,153568,293464
artistId,131552,158282,158282,158282,279143
userId,199810,199810,199810,199810,199811
rating,50,50,50,50,70
genres,[],"['242383', '207648']","['242383', '207648', '47898']","['81520', '242383']","['173655', '98154']"


In [7]:
train, test = df.randomSplit([0.7, 0.3], seed=2018)
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

                                                                                

Training Dataset Count: 14908520




Test Dataset Count: 6389279


                                                                                

In [8]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import BinaryClassificationEvaluator

train = train.repartition(50)

# Fit the ALS model
als = ALS(userCol="userId", itemCol="trackId", ratingCol="rating",
          coldStartStrategy="nan", nonnegative=True)

# Define the hyperparameter search grid
param_grid = ParamGridBuilder() \
    .addGrid(als.maxIter, [15]) \
    .addGrid(als.regParam, [0.1]) \
    .build()

# Define the evaluation metric
evaluator = RegressionEvaluator(
    metricName="rmse", labelCol="rating", predictionCol="prediction")

# Define the cross-validator
cross_validator = CrossValidator(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=3
)

# Run the hyperparameter search
cv_model = cross_validator.fit(train)

# Get the best model
best_model = cv_model.bestModel

# Print the optimal hyperparameters
print("Optimal maxIter: ", best_model._java_obj.parent().getMaxIter())
print("Optimal regParam: ", best_model._java_obj.parent().getRegParam())


                                                                                

Optimal maxIter:  15
Optimal regParam:  0.1


                                                                                

In [9]:
# Set the path where you want to save the model
model_path = "../models/ALS"

# Save the trained ALS model to the specified path, overwriting if the file already exists
best_model.write().overwrite().save(model_path)


                                                                                

In [10]:
# Make predictions on the test set
predictions = best_model.transform(test)



predictions.printSchema()

root
 |-- features: vector (nullable = true)
 |-- trackId: integer (nullable = true)
 |-- albumId: integer (nullable = true)
 |-- artistId: integer (nullable = true)
 |-- userId: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- genres: string (nullable = true)
 |-- prediction: float (nullable = false)



In [11]:
# Convert the predicted ratings to binary values (0 or 1) based on a threshold
threshold = 50
predictions = predictions.withColumn(
    "final_prediction", (predictions["prediction"] >= threshold).cast("double"))

predictions = predictions.withColumn(
    "recommendation", (predictions["rating"] >= threshold).cast("double"))
    
predictions.show(5)

predictions.printSchema()




+--------------------+-------+-------+--------+------+------+------+----------+----------------+--------------+
|            features|trackId|albumId|artistId|userId|rating|genres|prediction|final_prediction|recommendation|
+--------------------+-------+-------+--------+------+------+------+----------+----------------+--------------+
|[1068.0,101561.0,...| 101561|   1068|  131552|202641|     0|    []| 12.713423|             0.0|           0.0|
|[1842.0,4327.0,13...|   4327|   1842|  131552|199855|    50|    []| 26.405258|             0.0|           1.0|
|[2780.0,270330.0,...| 270330|   2780|   88853|200878|     0|    []| 0.7448565|             0.0|           0.0|
|[2780.0,270330.0,...| 270330|   2780|   88853|200878|     0|    []| 0.7448565|             0.0|           0.0|
|[2780.0,270330.0,...| 270330|   2780|   88853|200878|     0|    []| 0.7448565|             0.0|           0.0|
+--------------------+-------+-------+--------+------+------+------+----------+----------------+--------

                                                                                

In [12]:
# Evaluate the model using the area under the ROC curve
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="final_prediction", labelCol="recommendation", metricName="areaUnderROC")
area_under_roc = evaluator.evaluate(predictions)
print(f"Area under ROC curve: {area_under_roc:.4f}")



Area under ROC curve: 0.8495


                                                                                