### Import packages

In [1]:
import os
import itertools

import pandas as pd

import plotly.io as pio
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

pio.renderers.default = 'notebook_connected'    # other options include 'browser'
pio.templates.default = "plotly_dark"

### Import data
- library data from `YourLibrary.json` as one file
- streaming history from `StreamingHistory{i}.json` as multiple files
    - there are 11 `StreamingHistory{i}.json` files in this example (i = 0 to 10)
    - together they contain data from late December 2020 to late December 2024

In [2]:
# Read the exported library. With orient='index' every top-level JSON key becomes one row:
# Index(['tracks', 'albums', 'shows', 'episodes', 'bannedTracks', 'artists', 'bannedArtists', 'other'])
# Each cell holds one entry of that section; shorter sections are padded with NaN.
your_library = pd.read_json('YourLibrary.json', orient='index')
your_library

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2445,2446,2447,2448,2449,2450,2451,2452,2453,2454
tracks,"{'artist': 'SCANDAL', 'album': 'SAKURA グッバイ', ...","{'artist': 'Kinokoteikoku', 'album': 'フェイクワールド...","{'artist': 'L'Arc-en-Ciel', 'album': 'GOOD LUC...","{'artist': 'Gackt', 'album': 'LAST MOON', 'tra...","{'artist': 'The Burning Deadwoods', 'album': '...","{'artist': 'Apink', 'album': 'ONE & SIX', 'tra...","{'artist': 'Aimyon', 'album': '青春のエキサイトメント', '...","{'artist': 'Kay Tse', 'album': '靜夜歌', 'track':...","{'artist': 'Yan Ting', 'album': 'Hands Up', 't...","{'artist': 'LENI', 'album': 'Not Over', 'track...",...,"{'artist': 'My Hair is Bad', 'album': 'woman’s...","{'artist': 'SCANDAL', 'album': 'マスターピース / まばたき...","{'artist': 'CHANYEOL', 'album': 'Guardian (Ori...","{'artist': 'BLACKPINK', 'album': 'SQUARE UP', ...","{'artist': 'OFFICIAL HIGE DANDISM', 'album': '...","{'artist': 'TWICE', 'album': 'Taste of Love', ...","{'artist': 'L'Arc-en-Ciel', 'album': 'DAYBREAK...","{'artist': 'CHANMINA', 'album': 'Mirror', 'tra...","{'artist': 'SID', 'album': '嘘', 'track': '日傘',...","{'artist': 'SCANDAL', 'album': 'Encore Show', ..."
albums,"{'artist': 'Mr.', 'album': 'We R Mr.', 'uri': ...","{'artist': 'SCANDAL', 'album': 'Kiss from the ...","{'artist': 'DIR EN GREY', 'album': 'The Insula...","{'artist': 'SCANDAL', 'album': 'R-GIRL's ROCK!...","{'artist': 'SCANDAL', 'album': 'TEMPTATION　BOX...","{'artist': 'AKAIKO-EN', 'album': 'The Park', '...","{'artist': 'Yerin Baek', 'album': 'Pisces', 'u...","{'artist': 'Choco', 'album': 'Everybody Listen...","{'artist': 'Sakura Fujiwara', 'album': 'SUPERM...","{'artist': 'Hump Back', 'album': '大阪城ホール単独公演”拝...",...,,,,,,,,,,
shows,"{'name': '9号酒馆', 'publisher': '9号酒馆', 'uri': '...","{'name': 'Allgemein gebildet ', 'publisher': '...","{'name': 'Auf Deutsch gesagt!', 'publisher': '...","{'name': 'Auf den Punkt', 'publisher': 'Süddeu...","{'name': 'Bauerfeind + Kuttner', 'publisher': ...","{'name': 'DW Langsam Gesprochene Nachrichten',...","{'name': 'Deutsch Denken: Learn German Easy', ...","{'name': 'Deutsch Podcast - Deutsch lernen', '...",{'name': 'Deutsche im Alltag – Alltagsdeutsch ...,"{'name': 'Dick & Doof', 'publisher': 'RTL+ / l...",...,,,,,,,,,,
episodes,{'name': 'Zeit zu kündigen! Wenn der Traumjob ...,{'name': 'EP3 / 其實我是個自私的人 | 不知不覺成為了自己不想成為的那個樣子...,{'name': '3 Hacks For Rapid Reading (How To Re...,"{'name': 'Angel Of Love - Perfect Blue', 'show...",,,,,,,...,,,,,,,,,,
bannedTracks,,,,,,,,,,,...,,,,,,,,,,
artists,"{'name': '(G)I-DLE', 'uri': 'spotify:artist:2A...","{'name': '015B', 'uri': 'spotify:artist:4uU7Kf...","{'name': '3rd Line Butterfly', 'uri': 'spotify...","{'name': 'A9', 'uri': 'spotify:artist:6CY2I2M5...","{'name': 'AKAIKO-EN', 'uri': 'spotify:artist:5...","{'name': 'AKB48', 'uri': 'spotify:artist:01wau...","{'name': 'ASIAN KUNG-FU GENERATION', 'uri': 's...","{'name': 'Acid Black Cherry', 'uri': 'spotify:...","{'name': 'Ai Higuchi', 'uri': 'spotify:artist:...","{'name': 'Aimer', 'uri': 'spotify:artist:0bAsR...",...,,,,,,,,,,
bannedArtists,,,,,,,,,,,...,,,,,,,,,,
other,,,,,,,,,,,...,,,,,,,,,,


In [3]:
# select only 'tracks' (liked songs)
# Every cell of the 'tracks' row is a dict describing one liked song. Rows of `your_library`
# are padded with NaN up to the longest section, so drop the padding before expanding.
liked_tracks = your_library.loc['tracks'].dropna()

# Build the dataframe straight from the dicts (one row per song, one column per key).
# This replaces a round trip through `to_json` / `pd.read_json(<str>)`; passing a literal
# JSON string to `read_json` is deprecated since pandas 2.1.
liked_songs = pd.DataFrame(liked_tracks.tolist())

liked_songs

Unnamed: 0,artist,album,track,uri
0,SCANDAL,SAKURA グッバイ,SAKURAグッバイ,spotify:track:5KiNYsESpC8edKzvbkXb8q
1,Kinokoteikoku,フェイクワールドワンダーランド,クロノスタシス,spotify:track:6Mavd9uk89Y9OTdvcDTfGG
2,L'Arc-en-Ciel,GOOD LUCK MY WAY,GOOD LUCK MY WAY,spotify:track:3s03DDJuyw1EYX9a93LvQY
3,Gackt,LAST MOON,暁月夜 -DAY BREAKERS-,spotify:track:3eoO9fZMgyxpft3GdgrQWR
4,The Burning Deadwoods,Turn Me On,Turn Me On,spotify:track:3K91YYcl61HLsEy6M4YJgJ
...,...,...,...,...
2450,TWICE,Taste of Love,Alcohol-Free,spotify:track:0BTaaKT4RMbs5M73tOHX5Y
2451,L'Arc-en-Ciel,DAYBREAK'S BELL,DAYBREAK'S BELL,spotify:track:5qQcSJw6K9ax9CN7z4EBYJ
2452,CHANMINA,Mirror,Mirror,spotify:track:6tF5SVu63mcy3bxzqpTiap
2453,SID,嘘,日傘,spotify:track:5hIL2QM51QXQyV4voiqsvx


In [4]:
# dict of dataframes, keyed 'df_<i>' where <i> is the index in the file name
streaming_dict = {}

# Find the "StreamingHistory<i>.json" parts that are actually present.
# The whole name is matched (prefix + decimal index + '.json') so that unrelated files
# sharing the prefix (backups, other export flavours) are neither counted nor read.
prefix, suffix = 'StreamingHistory', '.json'
history_files = {}
for file in os.listdir():
    index_part = file[len(prefix):-len(suffix)]
    if file.startswith(prefix) and file.endswith(suffix) and index_part.isdecimal():
        history_files[int(index_part)] = file

parts = len(history_files)
if parts == 0:
    raise FileNotFoundError(f'No {prefix}<i>{suffix} files found in {os.getcwd()}')

# read streaming history by parts, in numeric order of the file index
for i in sorted(history_files):
    streaming_dict[f'df_{i}'] = pd.read_json(history_files[i])

# combine dataframes
streaming_history = pd.concat(streaming_dict.values(), ignore_index=True)
print(f'Total: {len(streaming_history)}')

# remove duplicates (rebinding instead of inplace keeps the cell safe to re-run)
streaming_history = streaming_history.drop_duplicates()
print(f'Remove duplicates: {len(streaming_history)}')

# remove entries less than 30 seconds (spotify only counts streams longer than 30 seconds)
# `.copy()` makes the filtered frame independent, so the column assignment below does not
# raise SettingWithCopyWarning
streaming_history = streaming_history[streaming_history['msPlayed'] > 30000].copy()
print(f'Remove entries less than 30 seconds: {len(streaming_history)}')

# convert 'endTime' column from str to datetime
streaming_history['endTime'] = pd.to_datetime(streaming_history['endTime'], format='%Y-%m-%d %H:%M')

streaming_history

Total: 95110
Remove duplicates: 93184
Remove entries less than 30 seconds: 75065


Unnamed: 0,endTime,artistName,trackName,msPlayed
0,2020-12-24 21:05:00,IU,Dear Name,142145
1,2020-12-26 00:02:00,Royal Philharmonic Orchestra,"Part Of Your World - From ""The Little Mermaid""",252702
2,2020-12-26 00:06:00,Kaori Muraji,"Merry Go Round of Life (Arr. Koseki) - From ""H...",242935
3,2020-12-26 00:12:00,Yoko Kanno,Hana wa Saku,209247
5,2020-12-26 16:00:00,Ryuichi Sakamoto,Merry Christmas Mr. Lawrence,329212
...,...,...,...,...
95105,2024-12-30 05:44:00,my little airport,西湖沒有中秋,127760
95106,2024-12-30 05:49:00,Dear Jane,遠征,245632
95107,2024-12-30 05:52:00,JFFT,JFFSONG,178452
95108,2024-12-30 05:55:00,Kiri T,傷心的時候別說話,201800


In [5]:
# split the history into one dataframe per calendar year of `endTime`
play_year = streaming_history['endTime'].dt.year

df_2021 = streaming_history.loc[play_year == 2021]
df_2022 = streaming_history.loc[play_year == 2022]
df_2023 = streaming_history.loc[play_year == 2023]
df_2024 = streaming_history.loc[play_year == 2024]

df_2024

Unnamed: 0,endTime,artistName,trackName,msPlayed
75481,2024-01-01 14:36:00,Eason Chan,今天只做一件事,229906
75482,2024-01-01 14:38:00,COLLAR,Never-never Land,154436
75483,2024-01-01 14:42:00,[Alexandros],todayyyyy,228400
75484,2024-01-01 14:46:00,yonige,DRIVE,174133
75485,2024-01-01 14:50:00,[Alexandros],あまりにも素敵な夜だから,238066
...,...,...,...,...
95105,2024-12-30 05:44:00,my little airport,西湖沒有中秋,127760
95106,2024-12-30 05:49:00,Dear Jane,遠征,245632
95107,2024-12-30 05:52:00,JFFT,JFFSONG,178452
95108,2024-12-30 05:55:00,Kiri T,傷心的時候別說話,201800


### Compare top artists and tracks by *frequency* and *minutes*
- *frequency* seems to be the more sensible metric

In [6]:
def get_top_artists_tracks_frequency(df_year, top_num=10):
    """Rank the most played artists and tracks of one year by number of plays.

    Parameters
    ----------
    df_year : pandas.DataFrame
        Streaming history with a datetime `endTime` column and `artistName` /
        `trackName` columns; every row counts as one play.
    top_num : int, default 10
        Number of top artists/tracks to report.

    Returns
    -------
    pandas.DataFrame
        Four columns of `(name, play count)` tuples, best first: artists and tracks,
        each computed once on the rows whose month is before December ("up to Nov")
        and once on all rows ("up to Dec"). If a ranking has fewer than `top_num`
        entries the remaining cells are NaN.
    """
    # filter by time
    history_dec = df_year                                       # up to Dec
    history_nov = df_year[df_year['endTime'].dt.month < 12]     # up to Nov

    def ranked(history, column):
        # plays per name, most frequent first
        counts = history[column].value_counts().head(top_num)
        # Iterate over (label, value) pairs rather than `counts[i]`: an integer key on a
        # label-indexed Series is a deprecated positional fallback, and indexing up to
        # `top_num` fails when there are fewer distinct names than that.
        return pd.Series([(name, round(plays)) for name, plays in counts.items()], dtype=object)

    # Series of unequal length are aligned on their position index and padded with NaN
    return pd.DataFrame({
        'Favourite artists (up to Nov)': ranked(history_nov, 'artistName'),
        'Favourite artists (up to Dec)': ranked(history_dec, 'artistName'),
        'Favourite tracks (up to Nov)': ranked(history_nov, 'trackName'),
        'Favourite tracks (up to Dec)': ranked(history_dec, 'trackName'),
    })


def get_top_artists_tracks_minutes(df_year, top_num=10):
    """Rank the most played artists and tracks of one year by minutes listened.

    Parameters
    ----------
    df_year : pandas.DataFrame
        Streaming history with a datetime `endTime` column, `artistName` /
        `trackName` columns and `msPlayed` (milliseconds played per row).
    top_num : int, default 10
        Number of top artists/tracks to report.

    Returns
    -------
    pandas.DataFrame
        Four columns of `(name, rounded minutes)` tuples, best first: artists and
        tracks, each computed once on the rows whose month is before December
        ("up to Nov") and once on all rows ("up to Dec"). If a ranking has fewer than
        `top_num` entries the remaining cells are NaN.
    """
    # filter by time
    history_dec = df_year                                       # up to Dec
    history_nov = df_year[df_year['endTime'].dt.month < 12]     # up to Nov

    def ranked(history, column):
        # total minutes per name (msPlayed is in milliseconds), largest first
        minutes = history.groupby(column)['msPlayed'].sum() / 60000
        top = minutes.sort_values(ascending=False).head(top_num)
        # Iterate over (label, value) pairs rather than `top[i]`: an integer key on a
        # label-indexed Series is a deprecated positional fallback, and indexing up to
        # `top_num` fails when there are fewer distinct names than that.
        return pd.Series([(name, round(total)) for name, total in top.items()], dtype=object)

    # Series of unequal length are aligned on their position index and padded with NaN
    return pd.DataFrame({
        'Favourite artists (up to Nov)': ranked(history_nov, 'artistName'),
        'Favourite artists (up to Dec)': ranked(history_dec, 'artistName'),
        'Favourite tracks (up to Nov)': ranked(history_nov, 'trackName'),
        'Favourite tracks (up to Dec)': ranked(history_dec, 'trackName'),
    })

In [7]:
# yearly rankings, once by play count and once by minutes listened
yearly_frames = (df_2021, df_2022, df_2023, df_2024)

(top_artists_tracks_freq_2021,
 top_artists_tracks_freq_2022,
 top_artists_tracks_freq_2023,
 top_artists_tracks_freq_2024) = [get_top_artists_tracks_frequency(frame) for frame in yearly_frames]

(top_artists_tracks_min_2021,
 top_artists_tracks_min_2022,
 top_artists_tracks_min_2023,
 top_artists_tracks_min_2024) = [get_top_artists_tracks_minutes(frame) for frame in yearly_frames]

In [8]:
# 2024 top artists/tracks ranked by frequency (number of plays)
top_artists_tracks_freq_2024

Unnamed: 0,Favourite artists (up to Nov),Favourite artists (up to Dec),Favourite tracks (up to Nov),Favourite tracks (up to Dec)
0,"(SCANDAL, 376)","(SCANDAL, 380)","(COVER STORY, 65)","(COVER STORY, 67)"
1,"(my little airport, 266)","(my little airport, 282)","(米酒, 62)","(米酒, 62)"
2,"(Dear Jane, 196)","(Dear Jane, 215)","(今次動真格, 47)","(有些話要用英文說, 52)"
3,"(ONE OK ROCK, 190)","(ONE OK ROCK, 196)","(緊急聯絡人, 43)","(今次動真格, 51)"
4,"(Kiri T, 156)","(Kiri T, 192)","(有些話要用英文說, 43)","(踊り子, 48)"
5,"(Endy Chow, 155)","(Endy Chow, 164)","(至少做一件離譜的事, 42)","(緊急聯絡人, 47)"
6,"(Mr., 147)","(Lolly Talk, 160)","(每次你走的時分, 41)","(至少做一件離譜的事, 45)"
7,"(Lolly Talk, 145)","(Mr., 150)","(歌鼓勵人, 40)","(數到十, 43)"
8,"(Hitsujibungaku, 130)","(Hitsujibungaku, 135)","(Hands Up - JFYT Version, 39)","(歌鼓勵人, 43)"
9,"([Alexandros], 129)","([Alexandros], 131)","(踊り子, 38)","(每次你走的時分, 41)"


In [9]:
# 2024 top artists/tracks ranked by minutes listened
top_artists_tracks_min_2024

Unnamed: 0,Favourite artists (up to Nov),Favourite artists (up to Dec),Favourite tracks (up to Nov),Favourite tracks (up to Dec)
0,"(SCANDAL, 1529)","(SCANDAL, 1546)","(COVER STORY, 191)","(COVER STORY, 197)"
1,"(Dear Jane, 764)","(Dear Jane, 834)","(米酒, 168)","(踊り子, 185)"
2,"(ONE OK ROCK, 738)","(my little airport, 760)","(至少做一件離譜的事, 166)","(至少做一件離譜的事, 177)"
3,"(my little airport, 721)","(ONE OK ROCK, 760)","(緊急聯絡人, 152)","(數到十, 177)"
4,"(Endy Chow, 684)","(Endy Chow, 722)","(今次動真格, 150)","(米酒, 168)"
5,"(Mr., 616)","(Kiri T, 656)","(踊り子, 147)","(緊急聯絡人, 166)"
6,"(Kiri T, 538)","(Mr., 630)","(數到十, 144)","(有些話要用英文說, 164)"
7,"(Lolly Talk, 532)","(Lolly Talk, 589)","(六度相隔理論, 139)","(今次動真格, 163)"
8,"(Hitsujibungaku, 514)","(Hitsujibungaku, 534)","(有些話要用英文說, 137)","(六度相隔理論, 147)"
9,"([Alexandros], 506)","([Alexandros], 515)","(每次你走的時分, 124)","(每次你走的時分, 124)"


### Get top artists in the library
- based on *frequency*

In [10]:
# number of most frequent library artists to keep
artist_count = 30

# total number of liked songs
total_songs = liked_songs.shape[0]

# liked songs per artist, most frequent first, cut to the top `artist_count`
songs_per_artist = liked_songs['artist'].value_counts()
top_artists = songs_per_artist.head(artist_count)

top_artists

SCANDAL                     139
AKB48                        54
SID                          51
[Alexandros]                 45
Mr.                          42
my little airport            41
Apink                        40
the GazettE                  39
Dear Jane                    38
Leo Ieiri                    36
SILENT SIREN                 34
ONE OK ROCK                  34
Aimyon                       30
TWICE                        30
TAEYEON                      26
Endy Chow                    24
ASIAN KUNG-FU GENERATION     23
Hitsujibungaku               21
Eason Chan                   21
SPYAIR                       20
BLACKPINK                    18
YUI                          17
IU                           17
SUPER BEAVER                 16
THE ORAL CIGARETTES          15
L'Arc-en-Ciel                15
LiSA                         15
Supper Moment                15
Luli Lee                     15
back number                  15
Name: artist, dtype: int64

### Get time played each year

In [11]:
def get_time_played(df):
    """Sum the listening time of a streaming-history frame per calendar month.

    `df` needs a datetime `endTime` column and an `msPlayed` column in milliseconds.
    Returns `(minutes per month, total minutes)`: a Series indexed by the month
    numbers present in `df`, and the sum of that Series.
    """
    ms_per_minute = 60000

    # milliseconds played in each month that occurs in the data
    play_month = df['endTime'].dt.month
    monthly_ms = df.groupby(play_month)['msPlayed'].sum()

    monthly_minutes = monthly_ms / ms_per_minute
    return monthly_minutes, monthly_minutes.sum()

In [12]:
# minutes per month and total minutes for every year
((time_df_2021, total_time_2021),
 (time_df_2022, total_time_2022),
 (time_df_2023, total_time_2023),
 (time_df_2024, total_time_2024)) = map(get_time_played, (df_2021, df_2022, df_2023, df_2024))

print(f'Total time played in 2021: {total_time_2021:.0f} minutes')
print(f'Total time played in 2022: {total_time_2022:.0f} minutes')
print(f'Total time played in 2023: {total_time_2023:.0f} minutes')
print(f'Total time played in 2024: {total_time_2024:.0f} minutes')

Total time played in 2021: 70989 minutes
Total time played in 2022: 80060 minutes
Total time played in 2023: 74132 minutes
Total time played in 2024: 48726 minutes


In [13]:
# deduce the cutoff date of spotify wrapped using the total time reported there

def get_cutoff(df, total_min):
    """Find the date on which the accumulated listening time first reaches `total_min`.

    Rows are accumulated in the order they appear in `df` (columns `msPlayed` in
    milliseconds and datetime `endTime`). Returns the `endTime` of the first row at
    which the running total is at least `total_min` minutes, formatted 'YYYY-MM-DD',
    or None when the total is never reached.
    """
    ms_per_minute = 60000
    target_ms = total_min * ms_per_minute

    # running total of milliseconds played; skipna=False so that a missing value
    # poisons every later total, exactly like a manual running sum would
    running_ms = df['msPlayed'].cumsum(skipna=False)
    reached = (running_ms >= target_ms).to_numpy()

    if not reached.any():
        return None

    # argmax on a boolean array gives the position of the first True
    first_hit = reached.argmax()
    return df['endTime'].iloc[first_hit].strftime('%Y-%m-%d')

In [14]:
# total time played from spotify wrapped (minutes)
total_min_2021 = 62606
total_min_2022 = 69596
total_min_2023 = 71083
total_min_2024 = 45302

cutoff_2021 = get_cutoff(df_2021, total_min_2021)
cutoff_2022 = get_cutoff(df_2022, total_min_2022)
cutoff_2023 = get_cutoff(df_2023, total_min_2023)
cutoff_2024 = get_cutoff(df_2024, total_min_2024)

print(f'Estimated cutoff date for 2021: {cutoff_2021}')
print(f'Estimated cutoff date for 2022: {cutoff_2022}')
print(f'Estimated cutoff date for 2023: {cutoff_2023}')
print(f'Estimated cutoff date for 2024: {cutoff_2024}')

Estimated cutoff date for 2021: 2021-11-19
Estimated cutoff date for 2022: 2022-11-11
Estimated cutoff date for 2023: 2023-11-15
Estimated cutoff date for 2024: 2024-11-23


In [15]:
def get_top_artists_tracks_frequency_cutoff(df_year, cutoff, top_num=10):
    """Rank the most played artists and tracks by number of plays up to a cutoff date.

    Parameters
    ----------
    df_year : pandas.DataFrame
        Streaming history with a datetime `endTime` column and `artistName` /
        `trackName` columns; every row counts as one play.
    cutoff : str
        Last day to include, formatted 'YYYY-MM-DD' (the format `get_cutoff` returns).
    top_num : int, default 10
        Number of top artists/tracks to report.

    Returns
    -------
    pandas.DataFrame
        Two columns of `(name, play count)` tuples, best first. If a ranking has
        fewer than `top_num` entries the remaining cells are NaN.
    """
    # filter by time: ISO-formatted dates compare correctly as strings, and comparing
    # the date part only keeps every play of the cutoff day itself
    history_cutoff = df_year[df_year['endTime'].dt.strftime('%Y-%m-%d') <= cutoff]

    def ranked(column):
        # plays per name, most frequent first
        counts = history_cutoff[column].value_counts().head(top_num)
        # Iterate over (label, value) pairs rather than `counts[i]`: an integer key on a
        # label-indexed Series is a deprecated positional fallback, and indexing up to
        # `top_num` fails when there are fewer distinct names than that.
        return pd.Series([(name, round(plays)) for name, plays in counts.items()], dtype=object)

    # Series of unequal length are aligned on their position index and padded with NaN
    return pd.DataFrame({
        'Favourite artists (up to cutoff)': ranked('artistName'),
        'Favourite tracks (up to cutoff)': ranked('trackName'),
    })

In [16]:
# 2024 rankings by play count, restricted to plays on or before the estimated cutoff date
top_artists_tracks_cutoff_2024 = get_top_artists_tracks_frequency_cutoff(df_2024, cutoff_2024)
top_artists_tracks_cutoff_2024

Unnamed: 0,Favourite artists (up to cutoff),Favourite tracks (up to cutoff)
0,"(SCANDAL, 376)","(COVER STORY, 65)"
1,"(my little airport, 266)","(米酒, 62)"
2,"(Dear Jane, 196)","(今次動真格, 47)"
3,"(ONE OK ROCK, 190)","(緊急聯絡人, 43)"
4,"(Endy Chow, 155)","(至少做一件離譜的事, 42)"
5,"(Kiri T, 149)","(每次你走的時分, 41)"
6,"(Mr., 147)","(有些話要用英文說, 41)"
7,"(Lolly Talk, 145)","(歌鼓勵人, 40)"
8,"(Hitsujibungaku, 130)","(Hands Up - JFYT Version, 39)"
9,"([Alexandros], 127)","(六度相隔理論, 36)"


### Get favourite artists and tracks each year
- based on *frequency*

In [17]:
def get_favourite_artists(df, top_num=10):
    """Cumulative monthly play counts of the most played artists.

    Parameters
    ----------
    df : pandas.DataFrame
        Streaming history of one year with a datetime `endTime` column and an
        `artistName` column; every row counts as one play.
    top_num : int, default 10
        Number of most played artists to include.

    Returns
    -------
    pandas.DataFrame
        Indexed by month 1..12, with a `Month` column followed by one integer column
        per artist (most played first) holding the number of plays accumulated up to
        and including that month.
    """
    # names of the most played artists, most frequent first
    favourite_artist_names = df['artistName'].value_counts().head(top_num).index.tolist()

    # create dataframe with month as the first column
    artist_count = pd.DataFrame({'Month': range(1, 13)}, index=range(1, 13))

    play_month = df['endTime'].dt.month

    for artist in favourite_artist_names:
        # plays of this artist per month
        monthly_plays = play_month[df['artistName'] == artist].value_counts()

        # Months without a play get 0 before accumulating, so the running total simply
        # stays flat there. This replaces the per-iteration `fillna(method='ffill')`
        # on the whole frame, whose `method` argument is deprecated in pandas 2.1.
        artist_count[artist] = monthly_plays.reindex(artist_count.index, fill_value=0).cumsum().astype(int)

    return artist_count

def get_favourite_tracks(df, top_num=10):
    """Cumulative monthly play counts of the most played tracks.

    Parameters
    ----------
    df : pandas.DataFrame
        Streaming history of one year with a datetime `endTime` column and a
        `trackName` column; every row counts as one play.
    top_num : int, default 10
        Number of most played tracks to include.

    Returns
    -------
    pandas.DataFrame
        Indexed by month 1..12, with a `Month` column followed by one integer column
        per track (most played first) holding the number of plays accumulated up to
        and including that month.
    """
    # names of the most played tracks, most frequent first
    favourite_track_names = df['trackName'].value_counts().head(top_num).index.tolist()

    # create dataframe with month as the first column
    track_count = pd.DataFrame({'Month': range(1, 13)}, index=range(1, 13))

    play_month = df['endTime'].dt.month

    for track in favourite_track_names:
        # plays of this track per month
        monthly_plays = play_month[df['trackName'] == track].value_counts()

        # Months without a play get 0 before accumulating, so the running total simply
        # stays flat there. This replaces the per-iteration `fillna(method='ffill')`
        # on the whole frame, whose `method` argument is deprecated in pandas 2.1.
        track_count[track] = monthly_plays.reindex(track_count.index, fill_value=0).cumsum().astype(int)

    return track_count

In [18]:
# cumulative monthly play counts of each year's favourite artists and tracks
yearly_frames = (df_2021, df_2022, df_2023, df_2024)

(artist_count_2021,
 artist_count_2022,
 artist_count_2023,
 artist_count_2024) = [get_favourite_artists(frame) for frame in yearly_frames]

(track_count_2021,
 track_count_2022,
 track_count_2023,
 track_count_2024) = [get_favourite_tracks(frame) for frame in yearly_frames]

In [19]:
# cumulative monthly play counts of the 2024 favourite artists
artist_count_2024

Unnamed: 0,Month,SCANDAL,my little airport,Dear Jane,ONE OK ROCK,Kiri T,Endy Chow,Lolly Talk,Mr.,Hitsujibungaku,[Alexandros]
1,1,39,44,23,7,1,13,4,12,15,12
2,2,69,63,38,13,1,26,8,44,29,19
3,3,142,100,69,26,4,45,9,74,45,26
4,4,165,122,82,40,13,47,15,79,51,38
5,5,202,155,92,46,15,56,59,93,66,56
6,6,238,176,110,56,29,103,66,107,75,67
7,7,264,209,131,60,61,125,78,118,84,74
8,8,299,224,143,67,84,135,113,124,92,83
9,9,330,234,163,84,98,144,127,129,104,104
10,10,367,250,181,181,121,149,141,134,129,123


### Plot everything together
with the option to animate how the favourite artists and songs change with time
- Top artists in library (1,1)
- Time played (2,1)
- Favourite artists (1,2)
- Favourite songs (2,2)

In [20]:
# flag to turn on/off the animation
animation = True
#############################################################################
# define ranges
total_years = 4 # 2021 to 2024, change this if needed

total_months = 12   # 12 months
total_artists = 10 # top 10 artists
total_tracks = 10   # top 10 tracks

# default color palette, cycled so that any number of bars gets a colour
colors = px.colors.qualitative.Plotly
color_cycle = itertools.cycle(colors)
#############################################################################
# subplots titles
# (1,1): Top artists in library
# the number of artists is read from `top_artists` so the title cannot drift away from
# the data when `artist_count` is changed
title_1_1 = f"Top {len(top_artists)} artists in the library (out of {total_songs} songs)"
# (2,1): Time played
title_2_1 = "Time played"
# (1,2): Favourite artists
title_1_2 = "Favourite artists"
# (2,2): Favourite songs
title_2_2 = "Favourite songs"
# Use subplots to make a 2x2 plot; titles are passed row by row: (1,1), (1,2), (2,1), (2,2)
fig = make_subplots(rows=2, cols=2, specs=[[{'type': 'xy'}, {'type': 'xy'}], 
                                           [{'type': 'xy'}, {'type': 'xy'}]],
                    subplot_titles=(title_1_1, title_1_2, title_2_1, title_2_2))

# Overall title
fig.update_layout(title ={'text': "My Spotify Wrapped",
                          'font': {'size': 25},
                          'x': 0.45})
#############################################################################
# Add traces

# (1,1): Top artists in library
# shorten artist names longer than `truncate_length` characters so the tick labels stay readable
truncate_length = 20
top_artists_truncate = [
    name if len(name) <= truncate_length else name[:truncate_length] + '...'
    for name in top_artists.index
]

# one colour per bar, taken from the cycling palette
bar_colors = [next(color_cycle) for _ in top_artists.index]
library_bars = go.Bar(x=top_artists_truncate, y=top_artists.values, marker={'color': bar_colors})
fig.add_trace(library_bars, row=1, col=1)
fig.update_layout(xaxis_tickangle=-45)
# hide legend for this bar chart, only show legend for artists/songs
fig.update_traces(showlegend=False, row=1, col=1)

# (1,1): Top artists in library can also be plotted as a pie chart if you prefer
# but 'type' for `make_subplots` has to be set to 'domain' instead of 'xy': specs=[[{'type': 'domain'}, {'type': 'xy'}], [{'type': 'xy'}, {'type': 'xy'}]]
# fig.add_trace(go.Pie(labels=top_artists.index, values=top_artists.values), row=1, col=1)
# hide legend for this pie chart, only show legend for artists/songs
# fig.update_traces(textposition='inside', textinfo='percent+label', showlegend=False, row=1, col=1)
#############################################################################
# (2,1): Time played

color_2021 = '#1DB954'
color_2022 = '#1E6FBA'
color_2023 = '#445AE9'
color_2024 = '#FFC72C'

# One line per year, added in chronological order (the trace order matters for the
# visibility masks built further down). The last tuple item is the position of the
# data point the "total" annotation is attached to; the positions differ per year so
# the labels do not overlap.
time_played_by_year = [
    (2021, time_df_2021, total_time_2021, color_2021, 1),
    (2022, time_df_2022, total_time_2022, color_2022, 4),
    (2023, time_df_2023, total_time_2023, color_2023, 7),
    (2024, time_df_2024, total_time_2024, color_2024, 9),
]

for year_label, monthly_minutes, year_total, line_color, label_pos in time_played_by_year:
    fig.add_scatter(x = monthly_minutes.index, y = monthly_minutes.values, line=dict(color=line_color), row=2, col=1)

    # nothing to attach the label to when the year has no data at all
    if len(monthly_minutes) == 0:
        continue

    # clamp the label position: a year with fewer months of data than the preferred
    # position (e.g. an incomplete current year) would otherwise raise IndexError
    label_pos = min(label_pos, len(monthly_minutes) - 1)
    fig.add_annotation(x=monthly_minutes.index[label_pos], y=monthly_minutes.values[label_pos], text=f"{year_label} total: {year_total:.0f}", font=dict(color=line_color), arrowcolor = line_color, row = 2, col = 1)

# hide legend for this line chart, only show legend for artists/songs plots
fig.update_traces(showlegend=False, row=2, col=1)

fig.update_xaxes(title_text = "Month", dtick = 1, row=2, col=1)
fig.update_yaxes(title_text = "Minutes", row=2, col=1)
#############################################################################
# (1,2): Favourite artists
# One line per favourite artist and year, added year by year from 2021 to 2024.
# Only the first year's lines start visible; all lines share legend group '1'.
yearly_artist_counts = (artist_count_2021, artist_count_2022, artist_count_2023, artist_count_2024)

for year_offset, yearly_counts in enumerate(yearly_artist_counts):
    starts_visible = (year_offset == 0)     # 2021 only

    # the first column is 'Month', every other column is one artist
    for artist in yearly_counts.columns[1:]:
        artist_line = go.Scatter(x=yearly_counts['Month'],
                                 y=yearly_counts[artist],
                                 name=artist,
                                 legendgroup='1',
                                 visible=starts_visible,
                                 line={'dash': "solid"})
        fig.add_trace(artist_line, row=1, col=2)

fig.update_xaxes(title_text = "Month", dtick = 1, row=1, col=2)
fig.update_yaxes(title_text = "Accumulated frequency", row=1, col=2)
#############################################################################
# (2,2): Favourite songs
# One line per favourite track and year, added year by year from 2021 to 2024.
# Only the first year's lines start visible; all lines share legend group '2'.
yearly_track_counts = (track_count_2021, track_count_2022, track_count_2023, track_count_2024)

for year_offset, yearly_counts in enumerate(yearly_track_counts):
    starts_visible = (year_offset == 0)     # 2021 only

    # the first column is 'Month', every other column is one track
    for track in yearly_counts.columns[1:]:
        track_line = go.Scatter(x=yearly_counts['Month'],
                                y=yearly_counts[track],
                                name=track,
                                legendgroup='2',
                                visible=starts_visible,
                                line={'dash': "solid"})
        fig.add_trace(track_line, row=2, col=2)

fig.update_xaxes(title_text = "Month", dtick = 1, row=2, col=2)
fig.update_yaxes(title_text = "Accumulated frequency", row=2, col=2)
#############################################################################
# button settings

# button coordinates
button_x = 1.2
button_y = 1.0

# visible traces: control which plots are visible; use this to show/hide plots for a specific year
# total number of traces: top artists in library (1) + time played (n years) + favourite artists (10 x n years) + favourite songs (10 x n years)
# if n = 4 years, total_traces should be 1 + 4 + 40 + 40 = 85
total_traces = 1 + total_years + total_artists * total_years + total_tracks * total_years

# 0-4: top artists in library (1) and time played (4 years)
# 5-14: 2021 favourite artists
# 15-24: 2022 favourite artists
# 25-34: 2023 favourite artists
# 35-44: 2024 favourite artists
# 45-54: 2021 favourite songs
# 55-64: 2022 favourite songs
# 65-74: 2023 favourite songs
# 75-84: 2024 favourite songs
artist_start = 1 + total_years    # artist traces start at 5
track_start = 1 + total_years + total_artists * total_years    # track traces start at 45

# initialise all traces to be invisible, except for top artists in library (1) and time played (4 years) which are always visible
visible_all = [False] * total_traces
visible_all[0:1+total_years] = [True] * (1+total_years)

# build one visibility mask per year: start from a copy of the template (visible_all)
# and switch on that year's block of artist traces and that year's block of track traces
for year in range(total_years):
    year_visible = visible_all[:]    # copy, so the template itself is never modified

    # show artist traces
    artist_lo = artist_start + year * total_artists
    year_visible[artist_lo : artist_lo + total_artists] = [True] * total_artists

    # show track traces
    track_lo = track_start + year * total_tracks
    year_visible[track_lo : track_lo + total_tracks] = [True] * total_tracks

    # the year buttons further down refer to the masks by these names;
    # only the first four years get a name, any later index is computed but not kept
    if year == 0:
        visible_2021 = year_visible
    elif year == 1:
        visible_2022 = year_visible
    elif year == 2:
        visible_2023 = year_visible
    elif year == 3:
        visible_2024 = year_visible
#############################################################################
# plotly does not support an individual legend for each subplot,
# so a large gap between the legend groups is used to make the single legend look like 2 separate legends
fig.update_layout(legend=dict(tracegroupgap=300))
#############################################################################
#############################################################################
#############################################################################
# simpler version without animating the traces as a function of time
if not animation:
    # one "update" button per year: applies that year's visibility mask to the traces
    # and sets showlegend to True on the layout
    year_buttons = [
        dict(label=year_label,
             method="update",
             args=[{"visible": year_mask},
                   {"showlegend": True}])
        for year_label, year_mask in (
            ("2021", visible_2021),
            ("2022", visible_2022),
            ("2023", visible_2023),
            ("2024", visible_2024),
        )
    ]

    # a single column of buttons anchored at (button_x, button_y)
    fig.update_layout(
        updatemenus=[
            dict(
                x=button_x,
                y=button_y,
                type="buttons",
                direction="down",
                showactive=False,
                buttons=year_buttons,
            )])

    # export a standalone page (plotly.js loaded from the CDN), then render inline
    fig.write_html("My Spotify Wrapped_notanimated.html", include_plotlyjs = 'cdn', auto_play = False)
    fig.show()
#############################################################################
#############################################################################
#############################################################################
# complicated version animating the traces as a function of time
if animation:
    # duration: control the speed of the animation
    frame_duration = 250    # ms
    transition_duration = 100   # ms

    # per-year tables in the same order as the figure's traces:
    # favourite artists for every year first, then favourite songs for every year
    yearly_tables = [
        artist_count_2021, artist_count_2022, artist_count_2023, artist_count_2024,
        track_count_2021, track_count_2022, track_count_2023, track_count_2024,
    ]

    def _frame_at(k):
        """Build the animation frame that shows only the first k rows of every yearly table.

        The leading Bar (top artists in library) and the total_years Scatter entries
        (time played) carry no data, only visible=True. Every column after the first
        of each table becomes one Scatter plotted against that table's 'Month' column.
        """
        frame_data = [go.Bar(visible=True)] + [go.Scatter(visible=True)] * total_years
        for table in yearly_tables:
            frame_data += [go.Scatter(x=table['Month'][:k], y=table[name][:k])
                           for name in table.columns[1:]]
        # traces: indices of the figure traces that the entries of frame_data apply to
        return go.Frame(data=frame_data, traces=list(range(total_traces)))

    # add animation frames: k = 0 (nothing drawn) up to k = total_months (everything drawn)
    frames = [_frame_at(k) for k in range(total_months + 1)]

    fig.frames=frames
    #############################################################################
    # animate button: plays through the frames with the durations set above
    play_button = dict(label="Play",
                       method="animate",
                       args=[None, dict(frame=dict(duration=frame_duration, redraw=False),
                                        transition=dict(duration=transition_duration))])

    # one "update" button per year: applies that year's visibility mask to the traces
    # and sets showlegend to True on the layout
    year_buttons = [
        dict(label=year_label,
             method="update",
             args=[{"visible": year_mask},
                   {"showlegend": True}])
        for year_label, year_mask in (
            ("2021", visible_2021),
            ("2022", visible_2022),
            ("2023", visible_2023),
            ("2024", visible_2024),
        )
    ]

    # a single column of buttons anchored at (button_x, button_y), "Play" on top
    fig.update_layout(
        updatemenus=[
            dict(
                x=button_x,
                y=button_y,
                type="buttons",
                direction="down",
                showactive=False,
                buttons=[play_button] + year_buttons,
            )])

    # export a standalone page (plotly.js loaded from the CDN, animation not started on load), then render inline
    fig.write_html("My Spotify Wrapped_animated.html", include_plotlyjs = 'cdn', auto_play = False)
    fig.show()