# Spotify 2023 — Top 5 Tracks by Streams
This notebook reads `spotify-2023.csv` with robust CSV options, cleans the `streams` column (handles commas and K/M/B suffixes), and computes the top 5 tracks by total streams.

If you prefer a quick local run, the last cell provides a small pandas alternative.

In [None]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, regexp_extract, when, trim, desc, sum as spark_sum


spark = SparkSession.builder.appName("Spotify Data").getOrCreate()

spark.sparkContext.setLogLevel("WARN")


In [None]:
path = "../spotify-2023.csv"
df = (
    spark.read
         .option("header", "true")
         .option("sep", ",")
         .option("quote", '"')
         .option("escape", '"')
         .option("multiLine", "true")
         .csv(path)
)

print("Columns:", df.columns)
df.printSchema()
df.show(5, truncate=False)


Columns: ['track_name', 'artist(s)_name', 'artist_count', 'released_year', 'released_month', 'released_day', 'in_spotify_playlists', 'in_spotify_charts', 'streams', 'in_apple_playlists', 'in_apple_charts', 'in_deezer_playlists', 'in_deezer_charts', 'in_shazam_charts', 'bpm', 'key', 'mode', 'danceability_%', 'valence_%', 'energy_%', 'acousticness_%', 'instrumentalness_%', 'liveness_%', 'speechiness_%']
root
 |-- track_name: string (nullable = true)
 |-- artist(s)_name: string (nullable = true)
 |-- artist_count: string (nullable = true)
 |-- released_year: string (nullable = true)
 |-- released_month: string (nullable = true)
 |-- released_day: string (nullable = true)
 |-- in_spotify_playlists: string (nullable = true)
 |-- in_spotify_charts: string (nullable = true)
 |-- streams: string (nullable = true)
 |-- in_apple_playlists: string (nullable = true)
 |-- in_apple_charts: string (nullable = true)
 |-- in_deezer_playlists: string (nullable = true)
 |-- in_deezer_charts: string (null

In [None]:
from pyspark.sql.functions import col, regexp_replace, regexp_extract, when, trim

df_with_streams = (
    df
    .withColumn("streams_str", regexp_replace(col("streams"), ",", ""))
    .withColumn("streams_str", regexp_replace(col("streams_str"), " ", ""))
    .withColumn("streams_str", regexp_replace(col("streams_str"), r"[^0-9A-Za-z.]", ""))
    .withColumn("num_part", regexp_extract(col("streams_str"), r"([0-9]*\.?[0-9]+)", 1))
    .withColumn("suffix", regexp_extract(col("streams_str"), r"([KMB])", 1))
    .withColumn(
        "streams_num",
        when(col("suffix")=="K", col("num_part").cast("double")*1_000)
        .when(col("suffix")=="M", col("num_part").cast("double")*1_000_000)
        .when(col("suffix")=="B", col("num_part").cast("double")*1_000_000_000)
        .otherwise(col("num_part").cast("double"))
    )
    .withColumn("track_name", trim(col("track_name")))
)

print("Rows with NULL streams_num (parsing failures) — sample 20:")
df_with_streams.filter(col("streams_num").isNull()).select("track_name", "streams", "streams_str").show(20, truncate=False)

df_clean = (
    df_with_streams.filter(col("streams_num").isNotNull())
    .withColumnRenamed("streams_num", "streams")
    .drop("streams_str", "num_part", "suffix")
)

df_clean.printSchema()
df_clean.select("track_name", "artist(s)_name", "streams").show(10, truncate=False)


{"ts": "2025-10-28 15:36:49.288", "level": "ERROR", "logger": "DataFrameQueryContextLogger", "msg": "[AMBIGUOUS_REFERENCE] Reference `streams` is ambiguous, could be: [`streams`, `streams`]. SQLSTATE: 42704", "context": {"file": "jdk.internal.reflect.GeneratedMethodAccessor54.invoke(Unknown Source)", "line": "", "fragment": "col", "errorClass": "AMBIGUOUS_REFERENCE"}, "exception": {"class": "Py4JJavaError", "msg": "An error occurred while calling o981.select.\n: org.apache.spark.sql.AnalysisException: [AMBIGUOUS_REFERENCE] Reference `streams` is ambiguous, could be: [`streams`, `streams`]. SQLSTATE: 42704\n\tat org.apache.spark.sql.errors.QueryCompilationErrors$.ambiguousReferenceError(QueryCompilationErrors.scala:2163)\n\tat org.apache.spark.sql.catalyst.expressions.package$AttributeSeq.resolve(package.scala:356)\n\tat org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveChildren(LogicalPlan.scala:164)\n\tat org.apache.spark.sql.catalyst.analysis.ColumnResolutionHelper.$anon

Rows with NULL streams_num (parsing failures) — sample 20:
+----------+-------+-----------+
|track_name|streams|streams_str|
+----------+-------+-----------+
+----------+-------+-----------+

root
 |-- track_name: string (nullable = true)
 |-- artist(s)_name: string (nullable = true)
 |-- artist_count: string (nullable = true)
 |-- released_year: string (nullable = true)
 |-- released_month: string (nullable = true)
 |-- released_day: string (nullable = true)
 |-- in_spotify_playlists: string (nullable = true)
 |-- in_spotify_charts: string (nullable = true)
 |-- streams: string (nullable = true)
 |-- in_apple_playlists: string (nullable = true)
 |-- in_apple_charts: string (nullable = true)
 |-- in_deezer_playlists: string (nullable = true)
 |-- in_deezer_charts: string (nullable = true)
 |-- in_shazam_charts: string (nullable = true)
 |-- bpm: string (nullable = true)
 |-- key: string (nullable = true)
 |-- mode: string (nullable = true)
 |-- danceability_%: string (nullable = true)


AnalysisException: [AMBIGUOUS_REFERENCE] Reference `streams` is ambiguous, could be: [`streams`, `streams`]. SQLSTATE: 42704

In [None]:
# Compute top 5 tracks by total streams.
# Group by track_name and artist(s)_name to avoid merging different tracks with same name by different artists.
top5 = (
    df_clean.groupBy("track_name", "artist(s)_name")
            .agg(spark_sum("streams").alias("total_streams"))
            .orderBy(desc("total_streams"))
            .limit(5)
)

top5.show(truncate=False)

In [35]:
# Pandas fallback (quick local check if the file is small).
# Use this when you want faster iteration locally.
import pandas as pd
pdf = pd.read_csv(path)
print(pdf.columns.tolist())
pdf['streams_clean'] = (pdf['streams']
    .astype(str)
    .str.replace(',', '', regex=False)
    .str.replace(' ', '', regex=False)
    )
# try to handle K/M/B suffixes in pandas if present
def expand_stream(s):
    import re
    if pd.isna(s):
        return None
    m = re.match(r'([0-9]*\.?[0-9]+)([KMB]?)', str(s))
    if not m:
        return None
    num = float(m.group(1))
    suf = m.group(2)
    if suf == 'K':
        return int(num * 1_000)
    if suf == 'M':
        return int(num * 1_000_000)
    if suf == 'B':
        return int(num * 1_000_000_000)
    return int(num)

pdf['streams_num'] = pdf['streams_clean'].map(expand_stream)
top5_pdf = (pdf.groupby(['track_name', 'artist(s)_name'], as_index=False)['streams_num']
             .sum()
             .sort_values('streams_num', ascending=False)
             .head(5))
print(top5_pdf)

UnicodeDecodeError: 'utf-8' codec can't decode bytes in position 7250-7251: invalid continuation byte

25/10/28 19:21:14 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 903953 ms exceeds timeout 120000 ms
25/10/28 19:21:14 WARN SparkContext: Killing executors is not supported by current scheduler.
25/10/28 19:21:19 WARN Executor: Issue communicating with driver in heartbeater
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:53)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:342)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:101)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:85)
	at org.apache.spark.storage.BlockManagerMaster.registerBlockManager(BlockManagerMaster.scala:81)
	at org.apache.spark.storage.BlockManager.reregister(BlockManager.scala:669)
	at org.apache.spark.executor.Executor.reportHeartBeat(Executor.scala:1296)
	at o