In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
from utils import SpotifyExtractor
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

# Initialize Spotify API

In [2]:
# Build the extractor from credentials held in environment variables,
# so no secret is written into the notebook itself.
spotify_extractor = SpotifyExtractor(
    client_id=os.environ.get("SPOTIFY_CLIENT_ID"),
    client_secret=os.environ.get("SPOTIFY_CLIENT_SECRET"),
)

## Test endpoints

In [3]:
# Call use_api for the 'artists' service with one artist ID;
# the returned value is shown as the cell output.
spotify_extractor.use_api(
    service='artists',
    id='3HqSLMAZ3g3d5poNaI7GOU',
)

{'external_urls': {'spotify': 'https://open.spotify.com/artist/3HqSLMAZ3g3d5poNaI7GOU'},
 'followers': {'href': None, 'total': 9018664},
 'genres': ['k-pop', 'k-ballad'],
 'href': 'https://api.spotify.com/v1/artists/3HqSLMAZ3g3d5poNaI7GOU',
 'id': '3HqSLMAZ3g3d5poNaI7GOU',
 'images': [{'url': 'https://i.scdn.co/image/ab6761610000e5ebbd0642ff425698afac5caffd',
   'height': 640,
   'width': 640},
  {'url': 'https://i.scdn.co/image/ab67616100005174bd0642ff425698afac5caffd',
   'height': 320,
   'width': 320},
  {'url': 'https://i.scdn.co/image/ab6761610000f178bd0642ff425698afac5caffd',
   'height': 160,
   'width': 160}],
 'name': 'IU',
 'popularity': 68,
 'type': 'artist',
 'uri': 'spotify:artist:3HqSLMAZ3g3d5poNaI7GOU'}

In [4]:
# Look up a single track by ID; the result is shown as the cell output.
spotify_extractor.get_track(
    track_id='3AoEQRuFf8zVXWqSLo2UOi',
)

{'album_id': '538vEfAgLJ6g2I8ubuOlap',
 'duration_ms': 164554,
 'track_popularity': 79}

In [5]:
# Same lookup through the batch method, using a one-element ID list.
spotify_extractor.get_tracks(
    track_ids=['3AoEQRuFf8zVXWqSLo2UOi'],
)

[{'album_id': '538vEfAgLJ6g2I8ubuOlap',
  'duration_ms': 164554,
  'track_popularity': 79}]

In [6]:
# Look up a single album by ID; the result is shown as the cell output.
spotify_extractor.get_album(
    album_id='2IYQwwgxgOIn7t3iF6ufFD',
)

{'release_date': '2024-10-18',
 'album_popularity': 83,
 'image_url': 'https://i.scdn.co/image/ab67616d0000b27336032cb4acd9df050bc2e197'}

In [7]:
# Same lookup through the batch method, using a one-element ID list.
spotify_extractor.get_albums(
    album_ids=['2IYQwwgxgOIn7t3iF6ufFD'],
)

[{'release_date': '2024-10-18',
  'release_date_precision': 'day',
  'album_popularity': 83,
  'image_url': 'https://i.scdn.co/image/ab67616d00001e0236032cb4acd9df050bc2e197'}]

# Process streaming history data

In [8]:
# Read each streaming-history export and stack the frames in file order.
# No ignore_index is passed, so every frame keeps its own row labels.
data = pd.concat([
    pd.read_json(path)
    for path in (
        "my_spotify_data/Streaming_History_Audio_2019-2020_0.json",
        "my_spotify_data/Streaming_History_Audio_2020-2022_1.json",
        "my_spotify_data/Streaming_History_Audio_2022-2024_2.json",
        "my_spotify_data/Streaming_History_Audio_2024-2025_3.json",
    )
])

In [9]:
# The track ID is the text after the last ':' of the URI;
# rows without a URI stay missing.
data['track_id'] = data['spotify_track_uri'].str.rsplit(':', n=1).str[-1]

In [10]:
# Columns kept for the streams table, in output order.
columns = [
    'ts',
    # track / artist / album names
    'master_metadata_track_name',
    'master_metadata_album_artist_name',
    'master_metadata_album_album_name',
    # playback details
    'ms_played', 'reason_start', 'reason_end', 'skipped',
    # ID derived from spotify_track_uri
    'track_id',
]

In [11]:
# Keep only music streams, i.e. rows that have a spotify_track_uri,
# restricted to the columns we need and re-indexed from 0.
# The .loc selection returns a new frame, so `data` is left untouched.
df = (
    data
    .loc[data['spotify_track_uri'].notna(), columns]
    .reset_index(drop=True)
)

In [12]:
# Save the cleaned streams table; the row index is not written.
df.to_csv(path_or_buf="streams_data.csv", index=False)

# Get data for all tracks

In [13]:
# Distinct, non-missing track IDs in order of first appearance.
df_track_ids = data.loc[:, ['track_id']].copy(deep=True)
track_ids = df_track_ids['track_id'].dropna().drop_duplicates().tolist()

In [14]:
# Fetch metadata for every unique track, in batches to minimize API calls.
# Spotify's "Get Several Tracks" endpoint accepts at most 50 IDs per request.
TRACK_BATCH_SIZE = 50

track_frames = []

# Walk the whole ID list. The previous range(0, 50, 50) ran a single iteration,
# so only the first 50 tracks were ever fetched.
for i in range(0, len(track_ids), TRACK_BATCH_SIZE):
    track_ids_batch = track_ids[i:i + TRACK_BATCH_SIZE]
    df_tracks_temp = pd.DataFrame(spotify_extractor.get_tracks(track_ids=track_ids_batch))
    # Assumes get_tracks returns one record per requested ID, in request order
    # -- TODO confirm. A length mismatch raises ValueError here.
    df_tracks_temp['track_id'] = track_ids_batch
    track_frames.append(df_tracks_temp)
    time.sleep(1)  # short pause between consecutive requests

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# with the expected columns when there are no track IDs.
if track_frames:
    df_tracks = pd.concat(track_frames, ignore_index=True)
else:
    df_tracks = pd.DataFrame(columns=['album_id', 'duration_ms', 'track_popularity', 'track_id'])
df_tracks.head()

Unnamed: 0,album_id,duration_ms,track_popularity,track_id
0,677BXv3yxd2fhiekObHHiR,181520,42,2Qt3volK31SIYIgk4gfJdT
1,22F5ZYY1sxoJjk6HzZfmC1,210626,76,2EBCVPNAG46nbgs6jXPGvv
2,3pLdWdkj83EYfDN6H2N8MR,130613,50,5qN4HFkapdAOV94XPryVof
3,3pLdWdkj83EYfDN6H2N8MR,208760,52,0DJBgBiYeSn6n1AXAkFVE8
4,3pLdWdkj83EYfDN6H2N8MR,225266,57,1eLSF6HfrRA0AsNmTkUlKx


In [15]:
# Save the track table with track_id moved to the first column;
# the row index is not written.
(
    df_tracks
    .set_index('track_id')
    .reset_index()
    .to_csv("tracks_data.csv", index=False)
)

# Get data for all albums

In [16]:
# Unique album IDs referenced by the fetched tracks, in order of first appearance.
# Missing values are dropped (as is done when building track_ids) so that only
# real IDs are sent to the API.
album_ids = df_tracks['album_id'].dropna().unique().tolist()

In [17]:
# Fetch metadata for every unique album, in batches to minimize API calls.
# Spotify's "Get Several Albums" endpoint accepts at most 20 IDs per request.
ALBUM_BATCH_SIZE = 20

album_frames = []

# Walk the whole ID list. The previous range(0, 50, 20) was unrelated to the
# number of albums: it sent an empty batch when there were 40 or fewer albums
# and ignored everything past the 60th.
for i in range(0, len(album_ids), ALBUM_BATCH_SIZE):
    album_ids_batch = album_ids[i:i + ALBUM_BATCH_SIZE]
    df_albums_temp = pd.DataFrame(spotify_extractor.get_albums(album_ids=album_ids_batch))
    # Assumes get_albums returns one record per requested ID, in request order
    # -- TODO confirm. A length mismatch raises ValueError here.
    df_albums_temp['album_id'] = album_ids_batch
    album_frames.append(df_albums_temp)
    time.sleep(1)  # short pause between consecutive requests

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# with the expected columns when there are no album IDs.
if album_frames:
    df_albums = pd.concat(album_frames, ignore_index=True)
else:
    df_albums = pd.DataFrame(
        columns=['release_date', 'release_date_precision', 'album_popularity', 'image_url', 'album_id']
    )
df_albums.head()

Unnamed: 0,release_date,release_date_precision,album_popularity,image_url,album_id
0,2011-10-05,day,32.0,https://i.scdn.co/image/ab67616d00001e02d72168...,677BXv3yxd2fhiekObHHiR
1,2012-01-01,day,66.0,https://i.scdn.co/image/ab67616d00001e02ef9ad6...,22F5ZYY1sxoJjk6HzZfmC1
2,2018-02-09,day,76.0,https://i.scdn.co/image/ab67616d00001e02c027ad...,3pLdWdkj83EYfDN6H2N8MR
3,2012-10-22,day,67.0,https://i.scdn.co/image/ab67616d00001e0278de8b...,748dZDqSZy6aPXKcI9H80u
4,2018-05-06,day,53.0,https://i.scdn.co/image/ab67616d00001e02350466...,7arx9qPJexCsDz67El4qvk


In [18]:
# Save the album table with album_id moved to the first column; the row index
# is not written. df_albums is keyed by 'album_id' and has no 'track_id'
# column, so indexing on 'track_id' raised KeyError.
df_albums.set_index('album_id').reset_index().to_csv("albums_data.csv", index=False)