In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import roc_curve, auc

als_model_dir = "../models/ALS"

spark = SparkSession.builder.appName('recommend-ML').getOrCreate()

In [3]:
data_dir = "../data/proccessed/dump.csv"

df = spark.read.csv(data_dir, header=True, inferSchema=True)
cols = df.columns

df = df.withColumn("albumId", col("albumId").cast("integer"))
df = df.withColumn("rating", col("rating").cast("integer"))
df = df.withColumn("userId", col("userId").cast("integer"))

df.printSchema()
df.show(5)


                                                                                

root
 |-- trackId: integer (nullable = true)
 |-- albumId: integer (nullable = true)
 |-- artistId: integer (nullable = true)
 |-- userId: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- genres: string (nullable = true)

+-------+-------+--------+------+------+--------------------+
|trackId|albumId|artistId|userId|rating|              genres|
+-------+-------+--------+------+------+--------------------+
| 204650| 177418|  131552|199810|    50|                  []|
|   9774|  79500|  158282|199810|    50|['242383', '207648']|
|   9774|  79500|  158282|199810|    50|['242383', '20764...|
|  26374| 153568|  158282|199810|    50| ['81520', '242383']|
| 271229| 293464|  279143|199811|    70| ['173655', '98154']|
+-------+-------+--------+------+------+--------------------+
only showing top 5 rows



In [5]:
from pyspark.ml.clustering import LDA
from pyspark.ml.feature import CountVectorizer
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

split_genres = udf(lambda x: x.split(','), ArrayType(StringType()))
df = df.withColumn("genres_array", split_genres("genres"))

cv = CountVectorizer(inputCol="genres_array", outputCol="ohe_features")
cv_model = cv.fit(df)
df_ohe = cv_model.transform(df)


num_topics = 5  # Choose the number of topics based on the desired lower dimensionality
lda = LDA(k=num_topics, featuresCol="ohe_features")
lda_model = lda.fit(df_ohe)

# Get the genre score for each row
genre_scores = lda_model.transform(df_ohe)
print("loaded:", df.count())
print(genre_scores.printSchema())
print(genre_scores.show(10))




23/05/07 21:13:25 WARN MemoryStore: Not enough space to cache rdd_50_6 in memory! (computed 28.3 MiB so far)
23/05/07 21:13:25 WARN BlockManager: Persisting block rdd_50_6 to disk instead.
23/05/07 21:13:25 WARN MemoryStore: Not enough space to cache rdd_50_7 in memory! (computed 28.5 MiB so far)
23/05/07 21:13:25 WARN BlockManager: Persisting block rdd_50_7 to disk instead.
23/05/07 21:13:26 WARN MemoryStore: Not enough space to cache rdd_50_1 in memory! (computed 43.4 MiB so far)
23/05/07 21:13:26 WARN BlockManager: Persisting block rdd_50_1 to disk instead.
23/05/07 21:13:26 WARN MemoryStore: Not enough space to cache rdd_50_3 in memory! (computed 42.0 MiB so far)
23/05/07 21:13:26 WARN BlockManager: Persisting block rdd_50_3 to disk instead.
23/05/07 21:13:26 WARN MemoryStore: Not enough space to cache rdd_50_2 in memory! (computed 43.2 MiB so far)
23/05/07 21:13:26 WARN BlockManager: Persisting block rdd_50_2 to disk instead.
23/05/07 21:13:27 WARN MemoryStore: Not enough space to

loaded: 22128002
root
 |-- trackId: integer (nullable = true)
 |-- albumId: integer (nullable = true)
 |-- artistId: integer (nullable = true)
 |-- userId: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- genres: string (nullable = true)
 |-- genres_array: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- ohe_features: vector (nullable = true)
 |-- topicDistribution: vector (nullable = true)

None
+-------+-------+--------+------+------+--------------------+--------------------+--------------------+--------------------+
|trackId|albumId|artistId|userId|rating|              genres|        genres_array|        ohe_features|   topicDistribution|
+-------+-------+--------+------+------+--------------------+--------------------+--------------------+--------------------+
| 204650| 177418|  131552|199810|    50|                  []|                [[]]|     (733,[0],[1.0])|[0.09673169041557...|
|   9774|  79500|  158282|199810|    50|['242383', 

                                                                                

In [19]:
df = genre_scores

# Fill missing values with 0
df = df.fillna(0, subset=['albumId', 'trackId', 'artistId'])

stages = []
numericCols = ['trackId', 'albumId', 'artistId', "rating"]
assemblerInputs = numericCols + ['topicDistribution']
assembler = VectorAssembler(
    inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

#apply
cols = df.columns
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(df)
df = pipelineModel.transform(df)
selectedCols = ['features'] + cols
df = df.select(selectedCols)
print("loaded:", df.count())
df.printSchema()
df.show()




loaded: 22128002
root
 |-- features: vector (nullable = true)
 |-- trackId: integer (nullable = true)
 |-- albumId: integer (nullable = true)
 |-- artistId: integer (nullable = true)
 |-- userId: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- genres: string (nullable = true)
 |-- genres_array: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- ohe_features: vector (nullable = true)
 |-- topicDistribution: vector (nullable = true)

+--------------------+-------+-------+--------+------+------+--------------------+--------------------+--------------------+--------------------+
|            features|trackId|albumId|artistId|userId|rating|              genres|        genres_array|        ohe_features|   topicDistribution|
+--------------------+-------+-------+--------+------+------+--------------------+--------------------+--------------------+--------------------+
|[204650.0,177418....| 204650| 177418|  131552|199810|    50|                 

                                                                                

In [21]:
test2 = spark.read.csv('../data/test2_new.txt', sep='|', header=True, inferSchema=True)
test2 = test2.withColumnRenamed('Recommendation', 'recommendation')
test2.printSchema()
test2.show()

root
 |-- userId: integer (nullable = true)
 |-- trackId: integer (nullable = true)
 |-- recommendation: integer (nullable = true)

+------+-------+--------------+
|userId|trackId|recommendation|
+------+-------+--------------+
|200031|  30877|             1|
|200031|   8244|             1|
|200031| 130183|             0|
|200031| 198762|             0|
|200031|  34503|             1|
|200031| 227283|             0|
|200032| 218377|             0|
|200032| 110262|             0|
|200032|  18681|             1|
|200032| 138493|             1|
|200032|  64167|             1|
|200032|  22820|             0|
|200055|  52198|             1|
|200055| 233815|             0|
|200055| 175557|             0|
|200055|  59101|             0|
|200055|  56695|             1|
|200055| 134398|             1|
|200065| 179571|             0|
|200065| 196286|             0|
+------+-------+--------------+
only showing top 20 rows



In [23]:
result = test2.filter(test2.userId == 200031)

# Show the filtered result
result.show()


+------+-------+--------------+
|userId|trackId|recommendation|
+------+-------+--------------+
|200031|  30877|             1|
|200031|   8244|             1|
|200031| 130183|             0|
|200031| 198762|             0|
|200031|  34503|             1|
|200031| 227283|             0|
+------+-------+--------------+



In [26]:
result = df.join(test2.select("userId", "trackId", "recommendation"), on=[
    "userId", "trackId"], how="left")


result = result.filter(result.userId == 200031)


result.show()


[Stage 125:>                                                        (0 + 1) / 1]

+------+-------+--------------------+-------+--------+------+--------------------+--------------------+--------------------+--------------------+--------------+
|userId|trackId|            features|albumId|artistId|rating|              genres|        genres_array|        ohe_features|   topicDistribution|recommendation|
+------+-------+--------------------+-------+--------+------+--------------------+--------------------+--------------------+--------------------+--------------+
|200031|  25703|[25703.0,1589.0,1...|   1589|  131552|    50|          ['214110']|        [['214110']]|    (733,[62],[1.0])|[0.09673194620325...|          null|
|200031| 103229|[103229.0,134869....| 134869|  131552|    90|          ['214110']|        [['214110']]|    (733,[62],[1.0])|[0.09673194620325...|          null|
|200031| 192723|[192723.0,132319....| 132319|  131552|    90|['176858', '25159...|[['176858',  '251...|(733,[7,26,81,97]...|[0.03836904545788...|          null|
|200031| 159984|[159984.0,164412..

23/05/07 21:39:41 WARN PythonUDFRunner: Detected deadlock while completing task 0.0 in stage 125 (TID 412): Attempting to kill Python Worker
                                                                                

In [8]:
from pyspark.sql.functions import max, min

# Calculate the highest and lowest user ID
max_user_id = test2.agg(max("userId")).collect()[0][0]
min_user_id = test2.agg(min("userId")).collect()[0][0]

print("Highest User ID:", max_user_id)
print("Lowest User ID:", min_user_id)


Highest User ID: 212234
Lowest User ID: 200031


In [9]:
train = df.where(col("userID").between(200596, 212234))
test = df.where(col("userID").between(200031, 200563))
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

                                                                                

Training Dataset Count: 5105188




Test Dataset Count: 242109


                                                                                