In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import roc_curve, auc

als_model_dir = "../models/ALS"
test_hierarchy_dir = '../data/testTrack_hierarchy.txt'

spark = SparkSession.builder.appName('recommend-ML').getOrCreate()

# Parse the pipe-delimited hierarchy file. Fields are used positionally:
# userId | trackId | albumId | artistId | zero or more genre ids.
# Blank lines are skipped here (they carry no trackId and would be dropped
# further down anyway).
with open(test_hierarchy_dir, 'r') as file:
    track_data = [line.strip().split('|') for line in file if line.strip()]

# Rows have a variable number of genre fields; pandas pads the short rows
# with None, which is normalised to "" so every cell is a string.
track_data_df = pd.DataFrame(track_data).fillna("")

if track_data_df.shape[1] < 4:
    raise ValueError(
        f"{test_hierarchy_dir}: expected at least 4 '|'-separated fields per line")

# Every column after the four id columns is a genre slot
max_genres = track_data_df.shape[1] - 4
genre_cols = [f'genreId_{i}' for i in range(1, max_genres + 1)]
track_data_df.columns = ['userId', 'trackId', 'albumId', 'artistId'] + genre_cols

# Keep only rows whose trackId parses as a number, then store it as int.
# Done as one non-mutating chain so re-running the cell is safe.
track_data_df = (
    track_data_df
    .assign(trackId=pd.to_numeric(track_data_df['trackId'], errors='coerce'))
    .dropna(subset=['trackId'])
    .astype({'trackId': int})
)

# Collapse the padded genre columns into one list per row, discarding the
# "" padding, and drop the individual genreId_* columns.
genre_lists = [
    [genre for genre in row if genre != ""]
    for row in track_data_df[genre_cols].values.tolist()
]
merged_df = track_data_df.drop(columns=genre_cols).assign(genres=genre_lists)

# Convert the Pandas DataFrame to a Spark DataFrame
predict_df = spark.createDataFrame(merged_df)

# Cast the id columns to integer. Values that are not numeric strings show
# up as null after the cast (see the albumId/artistId nulls in the output).
for id_col in ("albumId", "trackId", "userId", "artistId"):
    predict_df = predict_df.withColumn(id_col, col(id_col).cast("integer"))

print("loaded:", predict_df.count())
predict_df.printSchema()
# Show the Spark DataFrame
predict_df.show(5)


your 131072x1 screen size is bogus. expect trouble
23/05/04 17:25:37 WARN Utils: Your hostname, BryanDesktop resolves to a loopback address: 127.0.1.1; using 172.23.54.125 instead (on interface eth0)
23/05/04 17:25:37 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/04 17:25:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/05/04 17:25:39 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/05/04 17:25:48 WARN TaskSetManager: Stage 0 contains a task of very large size (1009 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

loaded: 120000
root
 |-- userId: integer (nullable = true)
 |-- trackId: integer (nullable = true)
 |-- albumId: integer (nullable = true)
 |-- artistId: integer (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: string (containsNull = true)

+------+-------+-------+--------+--------------------+
|userId|trackId|albumId|artistId|              genres|
+------+-------+-------+--------+--------------------+
|199810| 208019| 209288|    null|                  []|
|199810|  74139| 277282|  271146|[113360, 173467, ...|
|199810|   9903|   null|    null|[33722, 123396, 7...|
|199810| 242681| 190640|  244574|[61215, 17453, 27...|
|199810|  18515| 146344|   33168|[19913, 48505, 15...|
+------+-------+-------+--------+--------------------+
only showing top 5 rows



In [2]:
from pyspark.ml.clustering import LDA
from pyspark.ml.feature import CountVectorizer
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType


# Turn each row's genre list into a sparse term-count vector over the genre
# vocabulary learned from predict_df. The output column keeps its original
# name "ohe_features" even though CountVectorizer emits counts.
cv = CountVectorizer(inputCol="genres", outputCol="ohe_features")
cv_model = cv.fit(predict_df)
df_ohe = cv_model.transform(predict_df)


num_topics = 5  # Choose the number of topics based on the desired lower dimensionality
# Fixed seed: without it the topic weights change from run to run, so the
# downstream feature vectors are not reproducible.
lda_seed = 42
lda = LDA(k=num_topics, featuresCol="ohe_features", seed=lda_seed)
lda_model = lda.fit(df_ohe)

# Get the genre score for each row (adds the "topicDistribution" column)
genre_scores = lda_model.transform(df_ohe)
print("loaded:", predict_df.count())
# printSchema() and show() print by themselves and return None; wrapping
# them in print() only emitted a stray "None".
genre_scores.printSchema()
genre_scores.show(10)




23/05/04 17:25:59 WARN TaskSetManager: Stage 4 contains a task of very large size (1009 KiB). The maximum recommended task size is 1000 KiB.
23/05/04 17:26:01 WARN TaskSetManager: Stage 8 contains a task of very large size (1009 KiB). The maximum recommended task size is 1000 KiB.
23/05/04 17:26:04 WARN TaskSetManager: Stage 11 contains a task of very large size (1009 KiB). The maximum recommended task size is 1000 KiB.
23/05/04 17:26:05 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/05/04 17:26:05 WARN TaskSetManager: Stage 14 contains a task of very large size (1009 KiB). The maximum recommended task size is 1000 KiB.
23/05/04 17:26:06 WARN TaskSetManager: Stage 17 contains a task of very large size (1009 KiB). The maximum recommended task size is 1000 KiB.
23/05/04 17:26:06 WARN TaskSetManager: Stage 20 contains a task of very large size (1009 KiB). The maximum recommended task size is 1000 KiB.
23/05/04 17:26:06 WARN TaskSetManager: Stag

loaded: 120000
root
 |-- userId: integer (nullable = true)
 |-- trackId: integer (nullable = true)
 |-- albumId: integer (nullable = true)
 |-- artistId: integer (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- ohe_features: vector (nullable = true)
 |-- topicDistribution: vector (nullable = true)

None
+------+-------+-------+--------+--------------------+--------------------+--------------------+
|userId|trackId|albumId|artistId|              genres|        ohe_features|   topicDistribution|
+------+-------+-------+--------+--------------------+--------------------+--------------------+
|199810| 208019| 209288|    null|                  []|         (204,[],[])|[0.0,0.0,0.0,0.0,...|
|199810|  74139| 277282|  271146|[113360, 173467, ...|(204,[3,4,10,22,5...|[0.17042887369312...|
|199810|   9903|   null|    null|[33722, 123396, 7...|(204,[13,46,55,90...|[0.03771475026650...|
|199810| 242681| 190640|  244574|[61215, 17453, 27...|

In [43]:
# Build a single "features" vector per row from the three id columns plus
# the LDA topic distribution.
id_columns = ['albumId', 'trackId', 'artistId']

# Null ids are replaced with 0 before assembling
filled = genre_scores.na.fill(0, subset=id_columns)

assembler = VectorAssembler(
    inputCols=[*id_columns, 'topicDistribution'], outputCol="features")

# Single-stage pipeline: fit on the filled frame, then apply it
pipelineModel = Pipeline(stages=[assembler]).fit(filled)

# Put "features" first, followed by every pre-existing column in order
df = pipelineModel.transform(filled).select('features', *filled.columns)

print("loaded:", df.count())
df.printSchema()
df.show()


23/04/26 21:35:26 WARN TaskSetManager: Stage 422 contains a task of very large size (1009 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

loaded: 120000
root
 |-- features: vector (nullable = true)
 |-- userId: integer (nullable = true)
 |-- trackId: integer (nullable = true)
 |-- albumId: integer (nullable = true)
 |-- artistId: integer (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- ohe_features: vector (nullable = true)
 |-- topicDistribution: vector (nullable = true)

+--------------------+------+-------+-------+--------+--------------------+--------------------+--------------------+
|            features|userId|trackId|albumId|artistId|              genres|        ohe_features|   topicDistribution|
+--------------------+------+-------+-------+--------+--------------------+--------------------+--------------------+
|(8,[0,1],[209288....|199810| 208019| 209288|       0|                  []|         (204,[],[])|[0.0,0.0,0.0,0.0,...|
|[277282.0,74139.0...|199810|  74139| 277282|  271146|[113360, 173467, ...|(204,[3,4,10,22,5...|[0.84018376599927...|
|[0.0,9903.

In [45]:
from pyspark.ml.recommendation import ALSModel

# (userId, trackId) pairs from the hierarchy-derived frame, shown for reference.
# NOTE(review): this frame is not what gets scored below; the model is
# applied to the "test" CSV instead. Confirm that is intended.
df_user_item = df.select("userId", "trackId")

df_user_item.show(5)


# Load the saved ALS model from the specified path
loaded_als_model = ALSModel.load(als_model_dir)


# Load the CSV file into a DataFrame (every column is read as a string)
oldtest = spark.read.format("csv").option("header", "true").load("test")

# Only the two id columns are kept, so only those are cast; the remaining
# columns are removed in a single drop() call (names not present in the
# CSV are ignored by drop()).
oldtest = (
    oldtest
    .withColumn("trackId", col("trackId").cast("integer"))
    .withColumn("userId", col("userId").cast("integer"))
    .drop("rating", "albumId", "artistId", "genres", "features_str")
)

oldtest.printSchema()


# Make predictions on the test set; transform() appends a "prediction" column.
# Presumably the model was trained with userCol="userId" and
# itemCol="trackId" -- verify against the training notebook.
predictions = loaded_als_model.transform(oldtest)
predictions.show(10)


+------+-------+
|userId|trackId|
+------+-------+
|199810| 208019|
|199810|  74139|
|199810|   9903|
|199810| 242681|
|199810|  18515|
+------+-------+
only showing top 5 rows

root
 |-- trackId: integer (nullable = true)
 |-- userId: integer (nullable = true)

+-------+------+----------+
|trackId|userId|prediction|
+-------+------+----------+
| 246668|200283| 13.953331|
| 236263|202097| 10.934191|
| 236263|203637| 38.624283|
| 236263|204469|  33.57938|
|  11328|205082|  50.24372|
|  26940|200715|  6.178597|
|  26940|204409|  4.436098|
|  79025|200336| 0.8108509|
|  79025|201153| 14.228643|
|  79025|201380| 23.432201|
+-------+------+----------+
only showing top 10 rows



In [36]:
# Binarise the predicted ratings: a score at or above the threshold becomes
# 1.0, anything below becomes 0.0.
threshold = 50
meets_threshold = col("prediction") >= threshold
predictions = predictions.withColumn(
    "final_prediction", meets_threshold.cast("double"))

print("loaded:", predictions.count())
predictions.printSchema()
predictions.show()






loaded: 6388958
root
 |-- trackId: integer (nullable = true)
 |-- albumId: integer (nullable = true)
 |-- artistId: string (nullable = true)
 |-- userId: integer (nullable = true)
 |-- genres: string (nullable = true)
 |-- features_str: string (nullable = true)
 |-- prediction: float (nullable = false)
 |-- final_prediction: double (nullable = false)

+-------+-------+--------+------+--------------------+--------------------+----------+----------------+
|trackId|albumId|artistId|userId|              genres|        features_str|prediction|final_prediction|
+-------+-------+--------+------+--------------------+--------------------+----------+----------------+
| 246668|     35|  145948|200283|                  []|[35.0,246668.0,14...| 13.953331|             0.0|
| 236263|     49|   61215|202097|['172023', '18269...|[49.0,236263.0,61...| 10.934191|             0.0|
| 236263|     49|   61215|203637|['172023', '18269...|[49.0,236263.0,61...| 38.624283|             0.0|
| 236263|     49|   61

                                                                                

In [3]:
from pyspark.sql.functions import concat_ws, col
# Key each scored pair as "<userId>_<trackId>"
pair_key = concat_ws("_", col("userId"), col("trackId"))
predictions = predictions.withColumn("userId_trackId", pair_key)

# Keep just the key and the binary label, with the label stored as an integer
output_df = predictions.select(
    "userId_trackId",
    col("final_prediction").cast("integer").alias("final_prediction"),
)

# Collapse to a single partition so the result is written as one part file
output_df = output_df.coalesce(1)

# Write the result as CSV with a header row, replacing any earlier output
output_df.write.csv("../results/als_output", mode="overwrite", header=True)


NameError: name 'predictions' is not defined