In [1]:
import pandas as pd
import numpy as np
import re
import json
import os
import requests
import random
from matplotlib import pyplot as plt

## <span style="color: #95a5a6"> Read Data</span>

In [3]:
streaming_files = [f for f in os.listdir('data') if '.json' in f]
streaming_data_list = []
for file_name in streaming_files:
    with open(f'data/{file_name}', encoding="utf8") as f:
        json_data = json.load(f)
    streaming_data_list.extend(json_data)

In [4]:
streaming_data = pd.DataFrame(streaming_data_list)
streaming_data.shape

(16859, 21)

In [5]:
streaming_data['track_id'] = streaming_data['spotify_track_uri'].str.split(':').str[-1]

In [6]:
pd.set_option('display.max_columns', None)

## <span style="color: #95a5a6">Prep</span>

In [7]:
# convert ts to date
streaming_data['ts'] = pd.to_datetime(streaming_data['ts'])
# remove incomplete month
streaming_data['y'] = streaming_data['ts'].dt.year
streaming_data['m'] = streaming_data['ts'].dt.month
incomplete_start_month = streaming_data[(streaming_data['y'] == 2015) & (streaming_data['m'] == 3)].index
incomplete_end_month = streaming_data[(streaming_data['y'] == 2024) & (streaming_data['m'] == 7)].index
streaming_data.drop(index=incomplete_start_month, inplace=True)
streaming_data.drop(index=incomplete_end_month, inplace=True)
streaming_data.reset_index(drop=True, inplace=True)

In [8]:
# convert ms to seconds and minutes
streaming_data['seconds_played'] = streaming_data['ms_played'] / 1_000
streaming_data['minutes_played'] = streaming_data['seconds_played'] / 60

In [9]:
# rename columns
col_map = {
    'master_metadata_track_name' : 'track_name',
    'master_metadata_album_artist_name' : 'artist_name',
    'master_metadata_album_album_name' : 'album_name',
}
streaming_data.rename(columns=col_map, inplace=True)

In [188]:
# get distinct tracks
distinct_tracks = streaming_data.drop_duplicates(['track_name'])
distinct_track_ids = distinct_tracks['spotify_track_uri'].str.split(':').str[-1]
distinct_track_ids.dropna(inplace=True)

In [206]:
distinct_track_ids

0         5ubHAQtKuFfiG4FXfLP804
1         7w9bgPAmPTtrkt2v16QWvQ
2         2Dr76N76UX0xtZoLvwe3WY
3         1h45qrpHTHLlbCGSr8QQEA
4         3uBeTrCu8B31thODnDjcat
                   ...          
184089    7JDtr9BcPgqYyDla3G7DI2
184110    1HEfXDxLCuIAOvNkYMK9pC
184112    3WF3B7Egz8HxQ1lVVJMh7u
184126    2sITbbWIOeg2Lwp4WN2jqr
184241    3NbQIxqkq36cy5a8Ub9vZ0
Name: spotify_track_uri, Length: 19085, dtype: object

In [201]:
# define endopint
url = f"https://api.spotify.com/v1/audio-features?"
# loop over chunks of 100 track ids
for start in range(0, len(distinct_track_ids), 100):
    end = start+100
    print(start, start+100)
    chunk_ids = distinct_track_ids.iloc[start:5].tolist()
    # headers['ids'] = ','.join(chunk_ids)
    params = {'ids':','.join(chunk_ids)}
    # format url
    # url_formatted = url + 'ids=' + '5ubHAQtKuFfiG4FXfLP804%2C7w9bgPAmPTtrkt2v16QWvQ'
    # get artist data
    response = requests.get(url, params=params, headers=headers)
    json_content = response.json()
    break

0 100


In [202]:
params

{'ids': '5ubHAQtKuFfiG4FXfLP804,7w9bgPAmPTtrkt2v16QWvQ,2Dr76N76UX0xtZoLvwe3WY,1h45qrpHTHLlbCGSr8QQEA,3uBeTrCu8B31thODnDjcat'}

In [203]:
response.url

'https://api.spotify.com/v1/audio-features?ids=5ubHAQtKuFfiG4FXfLP804%2C7w9bgPAmPTtrkt2v16QWvQ%2C2Dr76N76UX0xtZoLvwe3WY%2C1h45qrpHTHLlbCGSr8QQEA%2C3uBeTrCu8B31thODnDjcat'

In [204]:
response

<Response [403]>

In [205]:
json_content

{'error': {'status': 403}}

In [214]:
features = pd.read_csv('data/tracks_features.csv')
features.shape

(1204025, 24)

In [223]:
# get distinct tracks
distinct_track_ids = streaming_data['track_id'].unique().tolist()

In [225]:
# check viability
not_found = set(distinct_track_ids) - set(features['id'].tolist())
len(not_found)

16789

In [230]:
streaming_data[streaming_data['track_id'].isin(features['id'].tolist())]['master_metadata_track_name'].nunique()

2298

## <span style="color: #95a5a6">Output</span>

In [11]:
streaming_data.to_csv('output/streaming_history.csv', index=False)