In [1]:
%%bash
# Intended to raise the JVM maximum heap to 128 GB through JAVA_OPTS.
# NOTE(review): %%bash runs this cell in its own bash subprocess, so the exported
# variable presumably disappears when the cell finishes and is not seen by the
# Python kernel or by the JVM that Spark starts later — confirm. If the setting
# is meant to persist, `%env JAVA_OPTS=-Xmx128g` (or os.environ) would do that.
   export JAVA_OPTS="-Xmx128g"

In [2]:
!pip install pyspark



In [3]:
# Imports
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, rand
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans

In [4]:
# This cell checks if the file merged_data.csv exists in the current directory.
# If it doesn't, it installs the kaggle package, downloads the Spotify dataset
# from Kaggle, and unzips the downloaded file.

file_exists = os.path.exists('merged_data.csv')

if not file_exists:
    !pip install kaggle
    !kaggle datasets download -d sunnykakar/spotify-charts-all-audio-data
    !unzip spotify-charts-all-audio-data.zip

In [5]:
# Initialize SparkSession
# getOrCreate() returns an already-running session when there is one; otherwise
# it starts a new session with the application name and memory settings below.
print("Initializing Spark Session...")
spark = (
    SparkSession.builder
    .appName("FeatureSelectionClustering")
    .config("spark.executor.memory", "16g")
    .config("spark.driver.memory", "16g")
    .config("spark.driver.maxResultSize", "4g")
    .getOrCreate()
)
print("Spark Session initialized.")

# Load dataset
# The first CSV line is used as the header and column types are inferred from
# the data rather than being read as plain strings.
print("Loading dataset...")
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv('merged_data.csv')
print("Dataset loaded.")

# Rename _c0 Column
# '_c0' is presumably the auto-generated name of an unnamed leading index column.
df = df.withColumnRenamed('_c0', 'ID')

# Audio features
# Names of the dataset columns used as clustering inputs (one per line so that
# adding or removing a feature shows up as a single-line diff).
audio_features = [
    'af_danceability',
    'af_energy',
    'af_key',
    'af_loudness',
    'af_mode',
    'af_speechiness',
    'af_acousticness',
    'af_instrumentalness',
    'af_liveness',
    'af_valence',
    'af_tempo',
    'af_time_signature',
]

# Handle missing values and cast columns to float
print("Handling missing values and casting columns to float...")

# Compute every column mean in ONE aggregation job. Computing them one at a time
# inside the loop launched a separate Spark job (a full pass over the data) for
# each of the features, while producing exactly the same means.
feature_means = (
    df.agg(*[avg(col(feature)).alias(feature) for feature in audio_features])
    .first()
    .asDict()
)

# A mean of None means the column holds no non-null values at all, so there is
# nothing to impute with. Fail early with a readable message instead of letting
# fillna() be called with a null replacement value.
empty_features = [feature for feature, mean in feature_means.items() if mean is None]
if empty_features:
    raise ValueError(
        f"Cannot impute missing values; no non-null data in column(s): {empty_features}"
    )

# Impute first (while columns still have their original types), then cast,
# which is the same order of operations as filling and casting per column.
df = df.fillna(feature_means)
for feature in audio_features:
    df = df.withColumn(feature, col(feature).cast('float'))

# dropna() with no arguments removes rows that have a null in ANY column, not
# only in the audio feature columns.
df = df.dropna()

# Assemble features into a single vector
# Combines all audio feature columns into one vector column named "features".
print("Assembling features into a single vector...")
assembler = VectorAssembler().setInputCols(audio_features).setOutputCol("features")
df = assembler.transform(df)

# Scale the features
# withMean/withStd are spelled out with their default values: each feature is
# divided by its standard deviation but is not centered on its mean.
print("Scaling the features...")
scaler = StandardScaler(
    withMean=False,
    withStd=True,
    inputCol="features",
    outputCol="scaledFeatures",
)
scaler_model = scaler.fit(df)
df = scaler_model.transform(df)


Initializing Spark Session...
Spark Session initialized.
Loading dataset...
Dataset loaded.
Handling missing values and casting columns to float...
Assembling features into a single vector...
Scaling the features...


In [6]:
# Train KMeans model with k=7
# Fits KMeans on the scaled feature vectors (fixed seed for repeatable results)
# and adds the assigned cluster id to df as the "prediction" column.
print("Training KMeans model with k=7...")
kmeans = KMeans(featuresCol="scaledFeatures", k=7, seed=1)

# Make the cell safe to re-run: after a first run df already carries a
# "prediction" column, and Spark ML refuses to write an output column that
# already exists. drop() is a no-op when the column is absent.
df = df.drop("prediction")

kmeans_model = kmeans.fit(df)
df = kmeans_model.transform(df)
print("model trained.")

Training KMeans model with k=7...
model trained.


In [7]:
# Show 5 random songs
print("Showing 5 random songs:")
# Cache the 5-row sample: show() and collect() are two separate Spark actions,
# and without caching each one re-runs the full random sort over the dataset.
# Caching also ensures the table and the numbered list come from the same rows.
random_songs = df.select("title", "artist", "album").orderBy(rand()).limit(5).cache()
random_songs.show(truncate=False)

# Ask the user to choose a song
song_choices = random_songs.collect()
if not song_choices:
    # Nothing to choose from (e.g. every row was dropped during preprocessing).
    raise RuntimeError("No songs are available to choose from.")

print("Choose a song by entering the corresponding number:")
for i, row in enumerate(song_choices):
    print(f"{i+1}: {row['title']} by {row['artist']} from the album {row['album']}")

# Keep prompting until the entry is a whole number between 1 and the number of
# listed songs. Without this check, entering 0 or a negative number would
# silently select a song counted from the END of the list (Python negative
# indexing), and non-numeric or too-large entries would crash the cell.
while True:
    raw_choice = input("Enter the number of the song you choose: ")
    try:
        choice = int(raw_choice) - 1
    except ValueError:
        print(f"'{raw_choice}' is not a whole number, please try again.")
        continue
    if 0 <= choice < len(song_choices):
        break
    print(f"Please enter a number between 1 and {len(song_choices)}.")

chosen_song = song_choices[choice]

Showing 5 random songs:
+----------------------------------------------+-------------------+------------------------+
|title                                         |artist             |album                   |
+----------------------------------------------+-------------------+------------------------+
|Старшие (feat. Polyana)                       |OG Buda            |FREERIO                 |
|Freaky                                        |Tory Lanez         |Freaky                  |
|Meant to Be (feat. Florida Georgia Line)      |Bebe Rexha         |All Your Fault: Pt. 2   |
|Leave Before You Love Me (with Jonas Brothers)|Marshmello         |Leave Before You Love Me|
|CROWN                                         |TOMORROW X TOGETHER|The Dream Chapter: STAR |
+----------------------------------------------+-------------------+------------------------+

Choose a song by entering the corresponding number:
1: Старшие (feat. Polyana) by OG Buda from the album FREERIO
2: Freaky by Tor

In [8]:
# Get the cluster of the chosen song
# first() fetches a single matching row. collect() would ship EVERY row that
# matches the title/artist/album to the driver just to read the first one.
chosen_row = (
    df.filter(
        (col("title") == chosen_song["title"])
        & (col("artist") == chosen_song["artist"])
        & (col("album") == chosen_song["album"])
    )
    .select("prediction")
    .first()
)
if chosen_row is None:
    # Same exception type an empty collect()[0] would raise, but with a message
    # that says what went wrong.
    raise IndexError(
        f"Could not find '{chosen_song['title']}' by {chosen_song['artist']} in the clustered data."
    )
chosen_cluster = chosen_row[0]

# Show 12 songs from the same cluster
# NOTE(review): no de-duplication or ordering is applied, so the same track may
# be listed more than once if the dataset contains several rows per track — confirm.
print(f"Recommended playlist based on your choice of '{chosen_song['title']}' by {chosen_song['artist']}:")
recommended_songs = df.filter(col("prediction") == chosen_cluster).select("title", "artist", "album").limit(12)
recommended_songs.show(truncate=False)

Recommended playlist based on your choice of 'Leave Before You Love Me (with Jonas Brothers)' by Marshmello:
+--------------------------------------------------------------------------------+---------------------------+------------------------------------------------------------------+
|title                                                                           |artist                     |album                                                             |
+--------------------------------------------------------------------------------+---------------------------+------------------------------------------------------------------+
|Traicionera                                                                     |Sebastian Yatra            |Traicionera                                                       |
|Dile Que Tu Me Quieres                                                          |Ozuna                      |Dile Que Tu Me Quieres                                            |
|