## 1) Setup Spark + Drive

In [1]:
!pip -q install pyspark

from google.colab import drive
drive.mount('/content/drive')

from pyspark.sql import SparkSession, functions as F, types as T

spark = SparkSession.builder.appName("SpotifyPreprocessing").getOrCreate()

BRONZE = "file:///content/drive/MyDrive/data/spotify/bronze_parquet"  # from 01_Data_Ingestion.ipynb
SILVER = "file:///content/drive/MyDrive/data/spotify/silver_parquet"  # output of this notebook

df = spark.read.parquet(BRONZE)
print("Rows:", df.count())
df.printSchema()
df.show(5, truncate=False)

Mounted at /content/drive
Rows: 114000
root
 |-- Unnamed: 0: integer (nullable = true)
 |-- track_id: string (nullable = true)
 |-- artists: string (nullable = true)
 |-- album_name: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- duration_ms: string (nullable = true)
 |-- explicit: string (nullable = true)
 |-- danceability: string (nullable = true)
 |-- energy: string (nullable = true)
 |-- key: string (nullable = true)
 |-- loudness: string (nullable = true)
 |-- mode: string (nullable = true)
 |-- speechiness: string (nullable = true)
 |-- acousticness: string (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: string (nullable = true)
 |-- valence: string (nullable = true)
 |-- tempo: double (nullable = true)
 |-- time_signature: double (nullable = true)
 |-- track_genre: string (nullable = true)

+----------+----------------------+----------------------+---------------------------------

## 2) Column plan

In [2]:
# 1) Define column groups
id_cols = ["track_id"]
cat_cols = ["artists", "album_name", "track_name", "track_genre", "key"]  # 'key' is musical key 0-11; keep as string or cast to int below
bool_cols = ["explicit"]
num_cols_as_string = [
    "popularity","duration_ms","danceability","energy","loudness","mode",
    "speechiness","acousticness","instrumentalness","liveness","valence",
    "tempo","time_signature"
]
# present but not needed
drop_cols = ["Unnamed: 0"]  # index column from the source

# 2) Drop junk, trim strings, normalize empties to null
df1 = df.drop(*[c for c in drop_cols if c in df.columns])

def trim_all_strings(df_in):
    return df_in.select([
        F.trim(F.col(c)).alias(c) if t == T.StringType() else F.col(c)
        for c, t in df_in.dtypes
    ])

# Convert empty strings to nulls for all string columns
def empty_to_null(df_in):
    string_cols = [f.name for f in df_in.schema.fields if isinstance(f.dataType, T.StringType)]
    exprs = [F.when(F.length(F.col(c)) == 0, None).otherwise(F.col(c)).alias(c) if c in string_cols else F.col(c)
             for c in df_in.columns]
    return df_in.select(*exprs)

df1 = trim_all_strings(df1)
df1 = empty_to_null(df1)

# 3) Safe cast helpers (coerce bad values to null)
def safe_cast(col, new_type):
    return F.when(F.col(col).cast(new_type).isNotNull(), F.col(col).cast(new_type)).otherwise(None)

# 4) Booleans and numerics
df2 = df1

# explicit: string -> boolean (True/False)
if "explicit" in df2.columns:
    df2 = df2.withColumn("explicit",
                         F.when(F.lower(F.col("explicit")).isin("true","1","t","yes"), F.lit(True))
                          .when(F.lower(F.col("explicit")).isin("false","0","f","no"), F.lit(False))
                          .otherwise(None).cast(T.BooleanType()))

# Cast numeric-looking columns
for c in num_cols_as_string:
    if c in df2.columns:
        # pick integer types when appropriate
        if c in ["popularity", "duration_ms", "mode", "time_signature"]:
            df2 = df2.withColumn(c, safe_cast(c, T.IntegerType()))
        elif c in ["tempo", "loudness", "danceability","energy","speechiness","acousticness","instrumentalness","liveness","valence"]:
            df2 = df2.withColumn(c, safe_cast(c, T.DoubleType()))
        else:
            df2 = df2.withColumn(c, safe_cast(c, T.DoubleType()))

# 'key' & 'mode' are often integers; cast 'key' to int if present as string
if "key" in df2.columns and dict(df2.dtypes)["key"] == "string":
    df2 = df2.withColumn("key", safe_cast("key", T.IntegerType()))

df2.printSchema()
df2.show(5, truncate=False)

root
 |-- track_id: string (nullable = true)
 |-- artists: string (nullable = true)
 |-- album_name: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- popularity: integer (nullable = true)
 |-- duration_ms: integer (nullable = true)
 |-- explicit: boolean (nullable = true)
 |-- danceability: double (nullable = true)
 |-- energy: double (nullable = true)
 |-- key: integer (nullable = true)
 |-- loudness: double (nullable = true)
 |-- mode: integer (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- tempo: double (nullable = true)
 |-- time_signature: integer (nullable = true)
 |-- track_genre: string (nullable = true)

+----------------------+----------------------+------------------------------------------------------+--------------------------+----------+-----------+--------+------

## 3)  De-duplication & basic filters

In [3]:
# Drop exact duplicates
before = df2.count()
df3 = df2.dropDuplicates(id_cols + ["album_name", "track_name"]) if "track_id" in df2.columns else df2.dropDuplicates()
print("Dropped duplicates:", before - df3.count())

# Optional: remove rows missing essential identifiers or genre
essential = ["track_id", "track_name", "track_genre"]
for c in essential:
    if c not in df3.columns:
        print(f"Warning: missing expected column {c}")
df3 = df3.where(F.col("track_id").isNotNull() & F.col("track_name").isNotNull() & F.col("track_genre").isNotNull())

# Optional sanity filters on obvious bad values
df3 = df3.where((F.col("duration_ms").isNull()) | (F.col("duration_ms") > 0))

Dropped duplicates: 24259


## 4) Missing values — median imputation per genre

In [4]:
from functools import reduce

target_num_cols = [
    "popularity","duration_ms","danceability","energy","loudness","mode",
    "speechiness","acousticness","instrumentalness","liveness","valence",
    "tempo","time_signature","key"
]
target_num_cols = [c for c in target_num_cols if c in df3.columns]

# compute per-genre medians
per_genre_exprs = []
for c in target_num_cols:
    per_genre_exprs.append(F.expr(f"percentile_approx({c}, 0.5) as med_{c}"))

per_genre = df3.groupBy("track_genre").agg(*per_genre_exprs)

# compute global medians (fallbacks)
global_meds = df3.agg(*[F.expr(f"percentile_approx({c}, 0.5) as med_{c}") for c in target_num_cols]).collect()[0].asDict()

# join medians to rows
df4 = df3.join(per_genre, on="track_genre", how="left")

# fill null numeric values with per-genre median then with global median
for c in target_num_cols:
    med_col = f"med_{c}"
    if med_col in df4.columns:
        df4 = df4.withColumn(
            c,
            F.when(F.col(c).isNull(), F.col(med_col)).otherwise(F.col(c))
        )
        # global fallback
        df4 = df4.fillna({c: float(global_meds[med_col]) if global_meds[med_col] is not None else None})

# drop helper median columns
df4 = df4.drop(*[f"med_{c}" for c in target_num_cols if f"med_{c}" in df4.columns])

## 5) Outliers — IQR capping per genre

In [5]:
# compute per-genre Q1, Q3
iqr_stats = df4.groupBy("track_genre").agg(
    *[
        F.expr(f"percentile_approx({c}, 0.25) as q1_{c}") if c in target_num_cols else F.lit(None).alias(f"q1_{c}")
        for c in target_num_cols
    ],
    *[
        F.expr(f"percentile_approx({c}, 0.75) as q3_{c}") if c in target_num_cols else F.lit(None).alias(f"q3_{c}")
        for c in target_num_cols
    ]
)

df5 = df4.join(iqr_stats, on="track_genre", how="left")

for c in target_num_cols:
    q1c, q3c = f"q1_{c}", f"q3_{c}"
    if q1c in df5.columns and q3c in df5.columns:
        df5 = df5.withColumn(f"iqr_{c}", F.col(q3c) - F.col(q1c))
        low = F.col(q1c) - 1.5 * F.col(f"iqr_{c}")
        high = F.col(q3c) + 1.5 * F.col(f"iqr_{c}")
        df5 = df5.withColumn(c, F.when(F.col(c) < low, low)
                                   .when(F.col(c) > high, high)
                                   .otherwise(F.col(c)))

# drop helper columns
drop_helpers = []
for c in target_num_cols:
    drop_helpers += [f"q1_{c}", f"q3_{c}", f"iqr_{c}"]
df5 = df5.drop(*[h for h in drop_helpers if h in df5.columns])

## 6) Normalization — Z-score for numerics

In [6]:
cont_cols = [c for c in target_num_cols if c in df5.columns and c not in ["mode","time_signature","key"]]  # treat these as categorical/integer

# compute global means/stds (you can also do per-genre if needed)
stats = df5.agg(
    *[F.mean(c).alias(f"mean_{c}") for c in cont_cols],
    *[F.stddev_pop(c).alias(f"std_{c}") for c in cont_cols]
).collect()[0].asDict()

df6 = df5
for c in cont_cols:
    mean_c = stats.get(f"mean_{c}", None)
    std_c  = stats.get(f"std_{c}", None)
    if mean_c is not None and std_c is not None and std_c != 0:
        df6 = df6.withColumn(f"{c}_z", (F.col(c) - F.lit(float(mean_c))) / F.lit(float(std_c)))

df6.select(*(id_cols + ["track_name","track_genre"] + cont_cols + [f"{c}_z" for c in cont_cols])).show(5, truncate=False)

+----------------------+----------------------------------------+--------------+----------+-----------+------------+------+--------+-----------+------------+----------------+--------+-------+-------+-------------------+--------------------+---------------------+----------------------+----------------------+----------------------+---------------------+----------------------+----------------------+--------------------+-------------------+
|track_id              |track_name                              |track_genre   |popularity|duration_ms|danceability|energy|loudness|speechiness|acousticness|instrumentalness|liveness|valence|tempo  |popularity_z       |duration_ms_z       |danceability_z       |energy_z              |loudness_z            |speechiness_z         |acousticness_z       |instrumentalness_z    |liveness_z            |valence_z           |tempo_z            |
+----------------------+----------------------------------------+--------------+----------+-----------+------------+--

## 7) Save silver dataset to Drive (Parquet)

In [8]:
# Write cleaned & normalized dataset
df6.write.mode("overwrite").parquet(SILVER)

# Quick reload & peek to verify
df_silver = spark.read.parquet(SILVER)
print("Silver rows:", df_silver.count())
df_silver.printSchema()
df_silver.show(5, truncate=False)

Silver rows: 89720
root
 |-- track_genre: string (nullable = true)
 |-- track_id: string (nullable = true)
 |-- artists: string (nullable = true)
 |-- album_name: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- popularity: double (nullable = true)
 |-- duration_ms: double (nullable = true)
 |-- explicit: boolean (nullable = true)
 |-- danceability: double (nullable = true)
 |-- energy: double (nullable = true)
 |-- key: double (nullable = true)
 |-- loudness: double (nullable = true)
 |-- mode: double (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- tempo: double (nullable = true)
 |-- time_signature: double (nullable = true)
 |-- popularity_z: double (nullable = true)
 |-- duration_ms_z: double (nullable = true)
 |-- danceability_z: double (nullable = true)
 |-- energy_z: doub