In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Remove the column cap so wide DataFrames are displayed with every column
pd.set_option('display.max_columns', None)

In [2]:
# Load the three input tables (streams, tracks, albums) in one pass
df_streaming, df_tracks, df_albums = (
    pd.read_csv(csv_path)
    for csv_path in ("streams_data.csv", "tracks_data.csv", "albums_data.csv")
)

# Data processing

## Remove skips

We define a "skip" as:
- Song played for less than 30 seconds (based on `ms_played`), AND
- Stream ended due to a deliberate action (based on `reason_end`)

In [3]:
# Keep a stream only when it ran for more than 30 seconds AND did not end through
# one of the listed user actions.
# NOTE(review): this drops a stream when EITHER condition fails, which is broader
# than the "AND" wording of the skip definition in the text above - confirm intent.
played_long_enough = df_streaming['ms_played'] > 30_000
ended_by_user = df_streaming['reason_end'].isin(['fwdbtn', 'backbtn', 'logout'])
df_streaming = df_streaming.loc[played_long_enough & ~ended_by_user].reset_index(drop=True)

## Get additional fields

We combine the data on streams with track- and album-specific data obtained from the Spotify API. We also create some intermediate datetime-specific fields.

This additional information will be useful later.

In [4]:
# Attach track metadata (by track_id), then album metadata (by album_id), to every stream
df = df_streaming.merge(df_tracks, on='track_id', how='left')
df = df.merge(df_albums, on='album_id', how='left')

# Parse the timestamp, convert it to Singapore time and derive calendar fields from it
local_ts = pd.to_datetime(df['ts']).dt.tz_convert('Singapore')
df['datetime'] = local_ts
df['date'] = local_ts.dt.date
df['year'] = local_ts.dt.year
# 'YYYY-MM' label, zero-padded so that it sorts chronologically as a string
df['month'] = local_ts.dt.year.astype(str) + '-' + local_ts.dt.month.astype(str).str.zfill(2)
df['hour'] = local_ts.dt.hour

## Streaming data example (5 most recent streams)

In [5]:
# Last 5 rows of the merged table (the most recent streams, assuming the
# streams file is in chronological order - TODO confirm)
df.tail()

Unnamed: 0,ts,master_metadata_track_name,master_metadata_album_artist_name,master_metadata_album_album_name,ms_played,reason_start,reason_end,skipped,track_id,album_id,duration_ms,track_popularity,release_date,release_date_precision,album_popularity,image_url,datetime,date,year,month,hour
33484,2025-01-08T08:59:27Z,Supernova,aespa,Armageddon - The 1st Album,178880,trackdone,trackdone,False,2VdSktBqFfkW7y6q5Ik4Z4,3gHhPm8z8tid1kvpniUKuK,178880,70,2024-05-27,day,66,https://i.scdn.co/image/ab67616d0000b2730fc598...,2025-01-08 16:59:27+08:00,2025-01-08,2025,2025-01,16
33485,2025-01-08T09:02:21Z,1-800-hot-n-fun,LE SSERAFIM,CRAZY,173431,fwdbtn,trackdone,False,7vjfnsnDXZGK4PSq54ISjc,538vEfAgLJ6g2I8ubuOlap,173431,73,2024-08-30,day,71,https://i.scdn.co/image/ab67616d0000b273485623...,2025-01-08 17:02:21+08:00,2025-01-08,2025,2025-01,17
33486,2025-01-08T09:12:26Z,Imaginary Friend,ITZY,GOLD,604306,trackdone,trackdone,False,06BeeZaS4YArThfTMu80QS,4CRfobFgSG0GOzplgTI79s,202429,66,2024-10-15,day,62,https://i.scdn.co/image/ab67616d0000b273dd7346...,2025-01-08 17:12:26+08:00,2025-01-08,2025,2025-01,17
33487,2025-01-08T09:15:16Z,Magnetic,ILLIT,SUPER REAL ME,160688,backbtn,trackdone,False,1aKvZDoLGkNMxoRYgkckZG,6irebIc6UO8fN0jl4UlzBS,160688,82,2024-03-25,day,73,https://i.scdn.co/image/ab67616d0000b273f037c5...,2025-01-08 17:15:16+08:00,2025-01-08,2025,2025-01,17
33488,2025-01-08T09:17:34Z,Igloo,KISS OF LIFE,Lose Yourself,131786,fwdbtn,trackdone,False,2DbDefRFJ5YOfXCKOeCJJh,4eguh1dJUXRh0IMiLKRwab,131786,82,2024-10-15,day,71,https://i.scdn.co/image/ab67616d0000b27315175a...,2025-01-08 17:17:34+08:00,2025-01-08,2025,2025-01,17


In [6]:
# Album-art URL of the first stream matching aespa's "Drama"
is_drama = df['master_metadata_track_name'] == 'Drama'
is_aespa = df['master_metadata_album_artist_name'] == 'aespa'
df.loc[is_drama & is_aespa, 'image_url'].reset_index(drop=True)[0]

'https://i.scdn.co/image/ab67616d0000b273c54e39f2ae0dd10731f93c08'

## Establish time period
2019 and 2025 have data but are incomplete years, so we remove them for this analysis.


In [42]:
# Keep only the complete calendar years, 2020 through 2024 inclusive
df = df[df['year'].between(2020, 2024)].reset_index(drop=True)

# Music insights

## Overall

### Total listening time
You spent X minutes with us in 2024.
There were ups and downs over the past 5 years, but we're glad to have you with us!

In [49]:
# Total playtime per year, in milliseconds and in minutes
df_listening_time = (
    df.groupby('year', as_index=False)['ms_played']
    .sum()
    .assign(mins_played=lambda totals: totals['ms_played'] / 1_000 / 60)
)
df_listening_time

Unnamed: 0,year,ms_played,mins_played
0,2020,821825676,13697.0946
1,2021,1210489960,20174.832667
2,2022,641149087,10685.818117
3,2023,1429597995,23826.63325
4,2024,1433743521,23895.72535


### Track and Artist count
This year, you enjoyed X songs from Y artists.

In [52]:
# Number of distinct tracks and distinct artists streamed in each year
counted_columns = ['track_id', 'master_metadata_album_artist_name']
df_track_artist_count = df.groupby('year')[counted_columns].nunique().reset_index()
df_track_artist_count

Unnamed: 0,year,track_id,master_metadata_album_artist_name
0,2020,785,251
1,2021,1597,467
2,2022,817,298
3,2023,1116,263
4,2024,1278,322


## Tracks

### Top 5 tracks
Your top 5 songs of 2024 are all recent jams! But there's so much more than just your top 5...

In [132]:
# Total appearances of each track per year (the counts land in a column named 'count')
df_track_counts_yearly = df.groupby('year')['track_id'].value_counts().reset_index()

# Total playtime divided by track duration: an estimate of the number of full plays
df_track_plays_yearly = df.groupby(['year', 'track_id']).agg({
    'ms_played': 'sum',
    'duration_ms': 'mean'
}).reset_index()
df_track_plays_yearly['times_played'] = df_track_plays_yearly['ms_played'] / df_track_plays_yearly['duration_ms']

# Exactly one name per track_id (the first one encountered). De-duplicating on
# track_id alone, rather than on the (track_id, name) pair, guarantees the merge
# below cannot fan out and list a track twice if the same id ever appears with
# more than one spelling of its name.
df_track_names = df[['track_id', 'master_metadata_track_name']].drop_duplicates(subset='track_id')

# Merge data to get track names; validate= makes pandas raise if the lookup is not unique
df_track_stats = pd.merge(df_track_counts_yearly, df_track_plays_yearly, on=['year', 'track_id'], how='left').merge(
    df_track_names, on='track_id', how='left', validate='many_to_one'
)

df_track_stats = df_track_stats[['year', 'master_metadata_track_name', 'count', 'times_played']]

# Get the top 5 tracks of each year (ranked by number of streams), then show 2024
df_top_tracks = df_track_stats.sort_values(['year', 'count'], ascending=[True, False]).groupby('year').head(5).reset_index(drop=True)
df_top_tracks = df_top_tracks[df_top_tracks['year'] == 2024].reset_index(drop=True)
df_top_tracks

Unnamed: 0,year,master_metadata_track_name,count,times_played
0,2024,Drama,94,93.203137
1,2024,UNTOUCHABLE,93,92.021016
2,2024,WOKE UP,89,103.734727
3,2024,SHOOTING STAR,81,83.37598
4,2024,Supernova,71,76.154372


### Most played tracks from the 2000s
This year, you revisited some 2000s classics...

In [61]:
# Get tracks played in 2024 that were released in the 2000s (2000-2009).
# release_date is compared as a string, which assumes ISO-style values
# ('YYYY', 'YYYY-MM' or 'YYYY-MM-DD') so that text order equals date order.
# The lower bound keeps pre-2000 releases out of a ranking presented as "2000s".
released_in_2000s = (df['release_date'] >= '2000') & (df['release_date'] < '2010')
df_top_early_tracks = df[(df['year'] == 2024) & released_in_2000s]
df_top_early_tracks = df_top_early_tracks['track_id'].value_counts().reset_index()

# Merge data to get track names and release dates. The lookup keeps one row per
# track_id so the merge cannot duplicate a track that has several name spellings.
df_top_early_tracks = df_top_early_tracks.merge(
    df[['track_id', 'master_metadata_track_name', 'release_date']].drop_duplicates(subset='track_id'),
    on='track_id',
    how='left',
    validate='many_to_one'
)

# Most streamed first; ties are broken by the oldest release date
df_top_early_tracks.sort_values(['count', 'release_date'], ascending=[False, True]).head(5)

Unnamed: 0,track_id,count,master_metadata_track_name,release_date
0,1ErdaM9N7EJ7trhXFnDECg,20,I Don't Care,2009-07-08
1,5w18nowVMRZrC5Na9Vxoth,15,"쏘리 쏘리 Sorry, Sorry",2009-03-11
2,62bOmKYxYg7dhrC6gH9vFn,12,Bye Bye Bye - From Deadpool and Wolverine Soun...,2000-03-21
3,3IrkbGQCoEPAkzJ0Tkv8nm,10,Dirt Off Your Shoulder,2003-11-14
4,1mea3bSkSGXuIRvnydlB5b,10,Viva La Vida,2008-05-26


### Tracks first streamed on release day
While also welcoming the new! You joined 8.5 million people streaming 2024's biggest song on release day.

In [76]:
# Get the first streaming date for every song. Only tracks whose release date is
# precise to the day are used, since the comparison below is made in days.
df_track_first_listen = df[df['release_date_precision'] == 'day'].groupby('track_id')['date'].min().reset_index()

# Merge data to get track names and release dates. The lookup keeps one row per
# track_id so the merge cannot duplicate a track that has several name spellings.
df_track_first_listen = df_track_first_listen.merge(
    df[['track_id', 'master_metadata_track_name', 'master_metadata_album_artist_name', 'release_date']].drop_duplicates(subset='track_id'),
    on='track_id',
    how='left',
    validate='one_to_one'
)

df_track_first_listen['date'] = pd.to_datetime(df_track_first_listen['date'])
df_track_first_listen['release_date'] = pd.to_datetime(df_track_first_listen['release_date'])
df_track_first_listen['release_year'] = df_track_first_listen['release_date'].dt.year

# Keep only tracks released in 2020 or later (start of tracking period).
# .copy() makes the filtered frame independent, so adding the 'diff' column
# below does not trigger SettingWithCopyWarning.
df_track_first_listen = df_track_first_listen[df_track_first_listen['release_year'] >= 2020].copy()

# Get difference between first streaming date and release date in days
df_track_first_listen['diff'] = (df_track_first_listen['date'] - df_track_first_listen['release_date']).dt.days

# Display tracks first streamed on release day or the day after (0 <= diff <= 1).
# A negative diff means the first stream predates the recorded release date;
# such rows are not release-day listens and are excluded.
df_track_first_listen[
    (df_track_first_listen['release_year'] == 2024) &
    (df_track_first_listen['diff'].between(0, 1))
].sort_values(['master_metadata_track_name']).reset_index(drop=True).head()

Unnamed: 0,track_id,date,master_metadata_track_name,master_metadata_album_artist_name,release_date,release_year,diff
0,5vNRhkKd0yEAg8suGBpjeY,2024-10-19,APT.,ROSÉ,2024-10-18,2024,1
1,3OL3ZJ6YEJpTTxFENVK3L8,2024-03-08,Abracadabra,(G)I-DLE,2024-03-08,2024,0
2,3w9EpBR3YB4McNuyQ8wzMA,2024-03-08,Abracadabra - Instrumental,(G)I-DLE,2024-03-08,2024,0
3,5eWcGfUCrVFMoYskyfkEPE,2024-05-28,Armageddon,aespa,2024-05-27,2024,1
4,2jAE0rQ3K4dBa4fBjR4GcU,2024-10-16,BORN TO BE (Final Ver.),ITZY,2024-10-15,2024,1


## Artists

### Top 5 artists
Speaking of biggest... here are your top 5 artists of 2024!

In [112]:
# Total playtime per artist per year
artist_totals = df.groupby(['year', 'master_metadata_album_artist_name'], as_index=False)['ms_played'].sum()

# Rank artists within each year by playtime and keep the five highest
artist_totals = artist_totals.sort_values(['year', 'ms_played'], ascending=[True, False])
top5_per_year = artist_totals.groupby('year').head(5).reset_index(drop=True)

# Narrow down to 2024 and add the playtime in minutes
df_top_artists = top5_per_year[top5_per_year['year'] == 2024].reset_index(drop=True)
df_top_artists['mins_played'] = df_top_artists['ms_played'] / 1_000 / 60
df_top_artists

Unnamed: 0,year,master_metadata_album_artist_name,ms_played,mins_played
0,2024,ITZY,155774325,2596.23875
1,2024,IVE,108382764,1806.3794
2,2024,aespa,97865378,1631.089633
3,2024,XG,91817409,1530.29015
4,2024,IU,83410784,1390.179733


### Total artist listening time
You've been spending much more time with your favorite artist's bops. Your love for them has definitely grown!

In [93]:
# The artist in first place of the 2024 ranking
top_artist = df_top_artists.loc[0, 'master_metadata_album_artist_name']

# That artist's total playtime for every year, in milliseconds and minutes
top_artist_streams = df[df['master_metadata_album_artist_name'] == top_artist]
df_top_artist_listening_time = (
    top_artist_streams.groupby('year', as_index=False)['ms_played']
    .sum()
    .assign(mins_played=lambda totals: totals['ms_played'] / 1_000 / 60)
)
df_top_artist_listening_time

Unnamed: 0,year,ms_played,mins_played
0,2020,36432115,607.201917
1,2021,57341054,955.684233
2,2022,21106776,351.7796
3,2023,113717498,1895.291633
4,2024,155774325,2596.23875


### Concentration of listening time
37% of your total listening time is with your top 5 artists. Definitely significant, but you still left room for others!

In [104]:
# Names of the 2024 top artists
top_artists = df_top_artists['master_metadata_album_artist_name'].unique()

df_temp = df[df['year'] == 2024].copy()

# Split the 2024 playtime into "top artists" and "everyone else"
by_top_artist = df_temp['master_metadata_album_artist_name'].isin(top_artists)
df_top_artists_listening_time = pd.DataFrame(
    {
        'ms_played': [
            df_temp.loc[by_top_artist, 'ms_played'].sum(),
            df_temp.loc[~by_top_artist, 'ms_played'].sum(),
        ]
    },
    index=['top', 'others'],
)

# Share of the year's total playtime held by each group
year_total_ms = df_temp['ms_played'].sum()
df_top_artists_listening_time['proportion'] = df_top_artists_listening_time['ms_played'] / year_total_ms

df_top_artists_listening_time

Unnamed: 0,ms_played,proportion
top,537250660,0.374719
others,896492861,0.625281


## Albums

### Top 5 albums
While we're still on the subject of top 5s, let's not forget your top 5 albums of 2024!

In [121]:
# Total playtime per album per year
df_album_stats = df.groupby(['year', 'album_id'])['ms_played'].sum().reset_index()

# Merge data to get album and artist names. The lookup keeps exactly one row per
# album_id (the first album/artist name encountered): an album whose streams carry
# more than one artist name would otherwise match several lookup rows and show up
# several times in the ranking with the same total playtime.
df_album_names = df[
    ['album_id', 'master_metadata_album_album_name', 'master_metadata_album_artist_name']
].drop_duplicates(subset='album_id')
df_album_stats = df_album_stats.merge(
    df_album_names, on='album_id', how='left', validate='many_to_one'
)

df_album_stats = df_album_stats[['year', 'master_metadata_album_album_name', 'master_metadata_album_artist_name', 'ms_played']]

# Get the top 5 albums of each year (ranked by playtime), then show 2024 with minutes
df_top_albums = df_album_stats.sort_values(['year', 'ms_played'], ascending=[True, False]).groupby('year').head(5).reset_index(drop=True)
df_top_albums = df_top_albums[df_top_albums['year'] == 2024].reset_index(drop=True)
df_top_albums['mins_played'] = df_top_albums['ms_played'] / 1_000 / 60
df_top_albums

Unnamed: 0,year,master_metadata_album_album_name,master_metadata_album_artist_name,ms_played,mins_played
0,2024,BORN TO BE,ITZY,47090349,784.83915
1,2024,Armageddon - The 1st Album,aespa,29529942,492.1657
2,2024,I've IVE,IVE,28683152,478.052533
3,2024,SHOOTING STAR,XG,26576968,442.949467
4,2024,GOLD,ITZY,26153550,435.8925


### Concentration of plays across album tracks
In these albums, your listening is definitely carried by specific bops.

In [None]:
# Select top album of 2024 (actually second place for more diversity)
top_album = df_top_albums['master_metadata_album_album_name'][1]

# Get total listening time of every track on the selected album.
# The playtime must be summed per track: value_counts() would count how often each
# distinct ms_played value occurs instead, and returns a Series that cannot be
# sorted by column name or given a new column.
df_top_album_listening_time = (
    df[df['master_metadata_album_album_name'] == top_album]
    .groupby('master_metadata_track_name')['ms_played']
    .sum()
    .reset_index()
)

# Longest-played track first; the original row labels are kept after sorting
df_top_album_listening_time = df_top_album_listening_time.sort_values('ms_played', ascending=False)
df_top_album_listening_time['mins_played'] = df_top_album_listening_time['ms_played'] / 1_000 / 60
df_top_album_listening_time

Unnamed: 0,master_metadata_track_name,ms_played,mins_played
8,Supernova,13622494,227.041567
0,Armageddon,10939965,182.33275
5,Mine,1352771,22.546183
2,Licorice,952227,15.87045
7,Set The Tone,607599,10.12665
4,Long Chat (#♥),587358,9.7893
1,BAHAMA,570446,9.507433
3,Live My Life,474048,7.9008
6,Prologue,423034,7.050567


## Listening patterns

### Listening peaks
Your listening peaks around concert months. Definitely getting into the concert mood, it seems!

In [155]:
def get_artist_listening_time(selected_artist: str, year: int = 2024, streams: "pd.DataFrame | None" = None) -> pd.DataFrame:
    """Return the monthly listening time of one artist for one calendar year.

    Parameters
    ----------
    selected_artist : str
        Artist name, matched exactly against `master_metadata_album_artist_name`.
    year : int, default 2024
        Calendar year to report on.
    streams : pandas.DataFrame, optional
        Streams to aggregate. Must contain the columns
        `master_metadata_album_artist_name`, `year`, `month` ('YYYY-MM' strings)
        and `ms_played`. Defaults to the notebook-level `df`.

    Returns
    -------
    pandas.DataFrame
        Twelve rows, one per month of `year` in calendar order, with columns
        `month`, `ms_played` (integer, 0 for months without plays) and
        `mins_played`.
    """
    if streams is None:
        streams = df  # fall back to the notebook-level streams table

    # Streams of the selected artist within the selected year
    is_selected = (streams['master_metadata_album_artist_name'] == selected_artist) & (streams['year'] == year)

    # Monthly playtime for the months that have at least one play
    monthly_ms = streams[is_selected].groupby('month')['ms_played'].sum()

    # Fill in missing months where there are no plays, in calendar order
    all_months = [str(year) + '-' + str(i).zfill(2) for i in range(1, 12 + 1)]
    monthly_ms = monthly_ms.reindex(all_months, fill_value=0).astype(int)

    df_artist_listening_time = monthly_ms.rename_axis('month').reset_index()
    df_artist_listening_time['mins_played'] = df_artist_listening_time['ms_played'] / 1_000 / 60

    return df_artist_listening_time

In [156]:
# Monthly 2024 listening time for IU
get_artist_listening_time(selected_artist='IU')

Unnamed: 0,month,ms_played,mins_played
0,2024-01,0,0.0
1,2024-02,0,0.0
2,2024-03,2976096,49.6016
3,2024-04,39719499,661.99165
4,2024-05,17373116,289.551933
5,2024-06,3732773,62.212883
6,2024-07,437385,7.28975
7,2024-08,1794875,29.914583
8,2024-09,10619167,176.986117
9,2024-10,2888051,48.134183


In [157]:
# Monthly 2024 listening time for aespa
get_artist_listening_time(selected_artist='aespa')

Unnamed: 0,month,ms_played,mins_played
0,2024-01,2447643,40.79405
1,2024-02,1863799,31.063317
2,2024-03,4138807,68.980117
3,2024-04,1655301,27.58835
4,2024-05,4450764,74.1794
5,2024-06,9880426,164.673767
6,2024-07,24302697,405.04495
7,2024-08,14416673,240.277883
8,2024-09,3800157,63.33595
9,2024-10,6483978,108.0663


### Most played artists by time of day
Your days are definitely energized by ITZY's and IVE's lively bops... While IU's ballads are your late night and early morning companions.

In [163]:
# A quarter is defined as a period of 6 hours: hour // 6 + 1 maps hours 0-5 to
# quarter 1, 6-11 to 2, 12-17 to 3 and 18-23 to 4
df_temp = df[['year', 'master_metadata_album_artist_name', 'hour']].copy()
df_temp['quarter'] = df_temp['hour'] // 6 + 1

# Get top artists per quarter: count streams per artist within each (year, quarter)
# and keep the two artists with the most streams
df_top_artists_timeofday = df_temp.groupby(['year', 'quarter'])['master_metadata_album_artist_name'].value_counts().reset_index()
df_top_artists_timeofday = df_top_artists_timeofday.sort_values(['year', 'quarter', 'count'], ascending=[True, True, False])
df_top_artists_timeofday = df_top_artists_timeofday.groupby(['year', 'quarter']).head(2)

# Show explanation for each quarter (first and last hour it covers).
# The 'quarter' column exists only on df_temp, not on df, so df_temp must be used
# here for the cell to run on a fresh kernel.
display(df_temp.groupby('quarter')['hour'].agg(['min', 'max']))

# Get top artists per quarter for 2024
df_top_artists_timeofday = df_top_artists_timeofday[df_top_artists_timeofday['year'] == 2024].reset_index(drop=True)
df_top_artists_timeofday

Unnamed: 0_level_0,min,max
quarter,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,5
2,6,11
3,12,17
4,18,23


Unnamed: 0,year,quarter,master_metadata_album_artist_name,count
0,2024,1,Taylor Swift,53
1,2024,1,IU,37
2,2024,2,IU,62
3,2024,2,IVE,58
4,2024,3,ITZY,315
5,2024,3,aespa,132
6,2024,4,ITZY,459
7,2024,4,IVE,446


Thank you for an amazing 2024.

2025 is waiting - with more comebacks, more fandom moments, and hopefully even more concerts!