In [2]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, regexp_extract, when, trim, desc, sum as spark_sum, expr, split, explode
import pandas as pd
import numpy as np
from pyspark.sql.types import IntegerType,LongType
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("Spotify Data").getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/29 11:24:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Instructions:

### Big Task 1
Task 1 - top 5 tracks by streams
 - Run import cell(1)
 - Run Read CSV cell(2)
 - Run Streams_int column cell(4)
 - Run Sort and output cell(5)

Task 2 - Top 5 songs across all platforms
 - Run imports cell(1)
 - Run CSV Read cell(2)
 - Run integer conversion cell(3)
 - Run Sort and output Cell(6)

For this i used a potentially crude method of finding out the rating across the charts. A lot of the songs have a rank of 0, which im assuming means it didn't reach the chart. These 0s will throw an average if included as its incorrect and invalid value lowering the avg by a large percentage per 0, (25, 33, 50, 100 for 1-4 0s respectivley). Because of this I have to find a way to remove this 0s from the value and the divisor or create a way to acknowledge something didn't chart. Instead, I used the chart position of 1000 as a replacement - as we are only looking at the top 1% of the table, this will remove the effect of 0s and replace it with the lowest possible ranking on the chart to signal it didn't chart. This removed our 0 values for records, preventing uncharted songs from ranking top here.

I don't see any crossover between the songs between the outpusts - which is suprising as you would assume highest streaming = highest charting.

### Big Task 2

Task - Artist with highest number of spotify charts
 - Run imports cell(1)
 - Run CSV read cell(2)
 - Run DF clean cell(3)
 - Run Artist DF clean(8)
 - Run print artists(9)

### Big Task 3

Task - Find the tracks with danceability over 80
 - Run imports cell(1)
 - Run CSV read cell(2)
 - Run SQL query cell(10)

We can see that Higher BPMs have a strong correlation with danceability

In [6]:
path = "../spotify-2023.csv"
df = (
    spark.read
         .option("header", "true")
         .option("sep", ",")
         .option("quote", '"')
         .option("escape", '"')
         .option("multiLine", "true")
         .csv(path)
)

df =df.withColumnRenamed("artist(s)_name", "artists")
df =df.withColumnRenamed("danceability_%", "danceability")

In [28]:
def to_long_safe(x):
    try:
        return int(x)
    except (ValueError, TypeError):
        return 0
    
def uncharted(x):
    if int(x) == 0:
        return 1000
    else:
        return int(x)

    

df = (df
      .withColumn("in_spotify_charts", expr("try_cast(in_spotify_charts as bigint)"))
      .withColumn("in_apple_charts", expr("try_cast(in_apple_charts as bigint)"))
      .withColumn("in_deezer_charts", expr("try_cast(in_deezer_charts as bigint)"))

)

to_int_udf = F.udf(to_long_safe, LongType())
uncharted_udf = F.udf(uncharted, LongType())


df = df.withColumn("in_apple_charts", uncharted_udf(F.col("in_apple_charts")))
df = df.withColumn("in_deezer_charts", uncharted_udf(F.col("in_deezer_charts")))
df = df.withColumn("in_spotify_charts", uncharted_udf(F.col("in_spotify_charts")))

df = df.withColumn("streams_int", to_int_udf(F.col("streams")))


df = df.withColumn("overall_chart_rating", 
                   col("in_spotify_charts")
                   + col("in_apple_charts")
                   + col("in_deezer_charts")
                   )

In [13]:
df = df.withColumn("streams_int", to_int_udf(F.col("streams")))

In [None]:
df_tracks = df.sort(df.streams_int.desc())
df_tracks.show(5)

In [None]:
df_charts = df.sort(df.overall_chart_rating.asc())
df_charts.show(5)


+--------------------+--------------------+------------+-------------+--------------+------------+--------------------+-----------------+---------+------------------+---------------+-------------------+----------------+----------------+---+---+-----+--------------+---------+--------+--------------+------------------+----------+-------------+-----------+----------------+
|          track_name|      artist(s)_name|artist_count|released_year|released_month|released_day|in_spotify_playlists|in_spotify_charts|  streams|in_apple_playlists|in_apple_charts|in_deezer_playlists|in_deezer_charts|in_shazam_charts|bpm|key| mode|danceability_%|valence_%|energy_%|acousticness_%|instrumentalness_%|liveness_%|speechiness_%|streams_int|avg_chart_rating|
+--------------------+--------------------+------------+-------------+--------------+------------+--------------------+-----------------+---------+------------------+---------------+-------------------+----------------+----------------+---+---+-----+----

In [None]:
## This doesnt work, only counts for number of songs(not chart placements) as doesnt take into account featuring artists

df.createOrReplaceTempView("people")
result = spark.sql("SELECT artists, count(*) as total from people GROUP BY artists ORDER BY total DESC;")
result.show()

+----------------+-----+
|         artists|total|
+----------------+-----+
|    Taylor Swift|   34|
|      The Weeknd|   22|
|       Bad Bunny|   19|
|             SZA|   19|
|    Harry Styles|   17|
|  Kendrick Lamar|   12|
|   Morgan Wallen|   11|
|      Ed Sheeran|    9|
|            Feid|    8|
|Drake, 21 Savage|    8|
|             BTS|    8|
|        Labrinth|    7|
|  Olivia Rodrigo|    7|
|        Doja Cat|    6|
|        NewJeans|    6|
|   Billie Eilish|    5|
|         Quevedo|    4|
|         Karol G|    4|
|             IVE|    4|
|           Adele|    4|
+----------------+-----+
only showing top 20 rows


In [4]:
df_artists = (
    df
    .withColumn("in_spotify_charts_i", expr("try_cast(in_spotify_charts as int)"))
    .filter((col("in_spotify_charts_i") >= 1) & (col("in_spotify_charts_i") <= 1000))
    .withColumn("artist", explode(split(col("artists"), ",")))
    .withColumn("artist", trim(col("artist")))
)

artist_counts = (
    df_artists
    .groupBy("artist")
    .agg(F.count("*").alias("chart_appearances"))
    .orderBy(desc("chart_appearances"))
)

In [5]:
artist_counts.show(10)

+--------------+-----------------+
|        artist|chart_appearances|
+--------------+-----------------+
|     Bad Bunny|               30|
|  Taylor Swift|               23|
|    The Weeknd|               18|
|    Peso Pluma|               16|
|          Feid|               15|
|       Quevedo|               11|
|Rauw Alejandro|               11|
|       Karol G|               10|
|  Harry Styles|               10|
|  Metro Boomin|                9|
+--------------+-----------------+
only showing top 10 rows


In [None]:
df.createOrReplaceTempView("people")
result = spark.sql("SELECT track_name, danceability, bpm FROM people WHERE danceability > 79 Order by bpm DESC")
result.show()

+--------------------+------------+---+
|          track_name|danceability|bpm|
+--------------------+------------+---+
|Popular (with Pla...|          85| 99|
|             Hey Mor|          90| 98|
|              Efecto|          80| 98|
|        Dance Monkey|          82| 98|
|           Pantysito|          83| 98|
|               Party|          83| 97|
|                POP!|          80| 97|
|       En La De Ella|          82| 97|
|            Gasolina|          86| 96|
|        Shape of You|          83| 96|
|        Shorty Party|          93| 96|
|Una Noche en Medell�|          87| 96|
|         Rich Spirit|          85| 96|
|Yo Voy (feat. Dad...|          81| 95|
|       Es un Secreto|          84| 95|
|    The Next Episode|          92| 95|
|             Excuses|          84| 95|
|Feliz Cumplea��os Fe|          87| 95|
|       Gato de Noche|          89| 94|
|   Mas Rica Que Ayer|          82| 94|
+--------------------+------------+---+
only showing top 20 rows


25/10/29 13:20:34 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 478672 ms exceeds timeout 120000 ms
25/10/29 13:20:34 WARN SparkContext: Killing executors is not supported by current scheduler.
25/10/29 13:20:36 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:53)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:342)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:132)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$