In [2]:
import pandas as pd
import numpy as np
import re
import json
import os
import requests
import random
from matplotlib import pyplot as plt

## <span style="color: #95a5a6"> Read Data</span>

In [50]:
with open('data/account_data/SearchQueries.json', encoding='utf-8') as f:
    searches_json = json.load(f)

In [52]:
search_data = pd.DataFrame(searches_json)

## <span style="color: #95a5a6"> Prep</span>

In [60]:
search_data['searchTime'] = pd.to_datetime(search_data['searchTime'], format='%Y-%m-%dT%H:%M:%S.%fZ[UTC]')

## <span style="color: #95a5a6">Output</span>

In [61]:
search_data.to_csv('output/search_data.csv', index=False)

In [63]:
search_data.head()

Unnamed: 0,platform,searchTime,searchQuery,searchInteractionURIs
0,IPHONE_ARM64,2024-04-08 13:16:10.907,u,[]
1,IPHONE_ARM64,2024-04-08 13:16:10.937,ultr,[]
2,IPHONE,2024-04-08 13:16:29.923,ultrasound split,[spotify:search:ultrasound+test]
3,IPHONE_ARM64,2024-04-08 13:17:29.956,milk,[]
4,IPHONE,2024-04-08 13:17:30.640,milk kan,[spotify:artist:5zfqoDmcjGb85SgSFPg9sk]


## <span style="color: #95a5a6"> Read Data</span>

In [4]:
streaming_files = [f for f in os.listdir('data') if '.json' in f]
streaming_data_list = []
for file_name in streaming_files:
    with open(f'data/{file_name}', encoding="utf8") as f:
        json_data = json.load(f)
    streaming_data_list.extend(json_data)

In [6]:
streaming_data = pd.DataFrame(streaming_data_list)
streaming_data.shape

(193442, 19)

In [7]:
streaming_data['track_id'] = streaming_data['spotify_track_uri'].str.split(':').str[-1]

In [6]:
pd.set_option('display.max_columns', None)

## <span style="color: #95a5a6">Prep</span>

In [8]:
# convert ts to date
streaming_data['ts'] = pd.to_datetime(streaming_data['ts'])
# remove incomplete month
streaming_data['y'] = streaming_data['ts'].dt.year
streaming_data['m'] = streaming_data['ts'].dt.month
incomplete_start_month = streaming_data[(streaming_data['y'] == 2015) & (streaming_data['m'] == 3)].index
incomplete_end_month = streaming_data[(streaming_data['y'] == 2024) & (streaming_data['m'] == 7)].index
streaming_data.drop(index=incomplete_start_month, inplace=True)
streaming_data.drop(index=incomplete_end_month, inplace=True)
streaming_data.reset_index(drop=True, inplace=True)

In [9]:
# convert ms to seconds and minutes
streaming_data['seconds_played'] = streaming_data['ms_played'] / 1_000
streaming_data['minutes_played'] = streaming_data['seconds_played'] / 60

In [10]:
# rename columns
col_map = {
    'master_metadata_track_name' : 'track_name',
    'master_metadata_album_artist_name' : 'artist_name',
    'master_metadata_album_album_name' : 'album_name',
}
streaming_data.rename(columns=col_map, inplace=True)

#### <span style="color: #95a5a6">Generate sessions from streaming data</span>

In [409]:
# sort by datetime
streaming_data.sort_values('ts', inplace=True)
streaming_data.reset_index(drop=True, inplace=True)
# group data by time diff
streaming_data['ts_curr'] = streaming_data['ts']
streaming_data['ts_above'] = streaming_data['ts'].shift(1)
streaming_data['ts_diff'] = streaming_data['ts'] - streaming_data['ts_above']
streaming_data['ts_diff_seconds'] = streaming_data['ts_diff'].apply(lambda x: x.seconds)
streaming_data['ts_diff_mins'] = streaming_data['ts_diff_seconds'] / 60
# get session breakpoints
session_breakpoints = streaming_data[streaming_data['ts_diff_seconds'] > 3_600] # over a hour
streaming_data.loc[session_breakpoints.index, 'breakpoint'] = True
streaming_data['breakpoint'] = streaming_data['breakpoint'].fillna(False)

In [410]:
# iterate through breakpoints recording session start and end indicies
start_end_pairs = []
sessions_dict = {}
for i, idx in enumerate(session_breakpoints.index):
    # get session starting index
    if i == 0:
        start_idx = 0
    else:
        start_idx = session_breakpoints.index[i-1]
    # get session ending index
    if (i+1) == len(session_breakpoints):
        end_idx = streaming_data.index[-1]
    else:
        end_idx = idx
    # testing
    start_end_pairs.append((start_idx, end_idx))
    # get session rows
    session_data = streaming_data.iloc[start_idx:end_idx].copy()
    sessions_dict[i] = session_data

In [411]:
sessions_concat = pd.concat(sessions_dict.values(), keys=sessions_dict.keys())
session_data = sessions_concat.reset_index(level=0).rename(columns={'level_0':'session_number'})
session_data.reset_index(drop=True, inplace=True)

#### <span style="color: #95a5a6">Output Session Data</span>

In [412]:
streaming_data.shape

(184250, 32)

In [413]:
session_data.shape

(184249, 33)

In [414]:
session_data.to_csv('output/session_data.csv', index=False)

In [415]:
session_data

Unnamed: 0,session_number,ts,username,platform,ms_played,conn_country,ip_addr_decrypted,user_agent_decrypted,track_name,artist_name,album_name,spotify_track_uri,episode_name,episode_show_name,spotify_episode_uri,reason_start,reason_end,shuffle,skipped,offline,offline_timestamp,incognito_mode,track_id,y,m,ts_curr,ts_above,ts_diff,ts_diff_seconds,breakpoint,ts_diff_mins,seconds_played,minutes_played
0,0,2015-05-02 15:52:01+00:00,+==hi==+,Windows 7 (6.1.7601; x64; SP1; S),79980,GB,86.25.60.179,unknown,Lego House,Ed Sheeran,+,spotify:track:5ubHAQtKuFfiG4FXfLP804,,,,clickrow,endplay,False,True,False,0.000000e+00,False,5ubHAQtKuFfiG4FXfLP804,2015,5,2015-05-02 15:52:01+00:00,NaT,NaT,,False,,79.980,1.333000
1,0,2015-05-02 15:52:35+00:00,+==hi==+,Windows 7 (6.1.7601; x64; SP1; S),35390,GB,86.25.60.179,unknown,"Lose Yourself - From ""8 Mile"" Soundtrack",Eminem,Curtain Call,spotify:track:7w9bgPAmPTtrkt2v16QWvQ,,,,clickrow,endplay,False,True,False,0.000000e+00,False,7w9bgPAmPTtrkt2v16QWvQ,2015,5,2015-05-02 15:52:35+00:00,2015-05-02 15:52:01+00:00,0 days 00:00:34,34.0,False,0.566667,35.390,0.589833
2,0,2015-05-02 15:53:52+00:00,+==hi==+,Windows 7 (6.1.7601; x64; SP1; S),77159,GB,86.25.60.179,unknown,Hello,Eminem,Relapse: Refill,spotify:track:2Dr76N76UX0xtZoLvwe3WY,,,,clickrow,fwdbtn,False,True,False,0.000000e+00,False,2Dr76N76UX0xtZoLvwe3WY,2015,5,2015-05-02 15:53:52+00:00,2015-05-02 15:52:35+00:00,0 days 00:01:17,77.0,False,1.283333,77.159,1.285983
3,0,2015-05-02 15:53:54+00:00,+==hi==+,Windows 7 (6.1.7601; x64; SP1; S),2786,GB,86.25.60.179,unknown,Stan,Eminem,Curtain Call,spotify:track:1h45qrpHTHLlbCGSr8QQEA,,,,fwdbtn,fwdbtn,True,True,False,0.000000e+00,False,1h45qrpHTHLlbCGSr8QQEA,2015,5,2015-05-02 15:53:54+00:00,2015-05-02 15:53:52+00:00,0 days 00:00:02,2.0,False,0.033333,2.786,0.046433
4,0,2015-05-02 15:53:55+00:00,+==hi==+,Windows 7 (6.1.7601; x64; SP1; S),2043,GB,86.25.60.179,unknown,Never Enough,Eminem,Encore,spotify:track:3uBeTrCu8B31thODnDjcat,,,,fwdbtn,fwdbtn,True,True,False,0.000000e+00,False,3uBeTrCu8B31thODnDjcat,2015,5,2015-05-02 15:53:55+00:00,2015-05-02 15:53:54+00:00,0 days 00:00:01,1.0,False,0.016667,2.043,0.034050
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184244,6887,2024-06-28 21:02:55+00:00,+==hi==+,ios,1416,GB,2a02:c7c:5271:c300:4160:841c:4e78:256d,,Jigsaw Falling Into Place,Radiohead,In Rainbows,spotify:track:0YJ9FWWHn9EfnN0lHwbzvV,,,,playbtn,fwdbtn,True,True,False,1.719609e+09,False,0YJ9FWWHn9EfnN0lHwbzvV,2024,6,2024-06-28 21:02:55+00:00,2024-06-28 21:02:45+00:00,0 days 00:00:10,10.0,False,0.166667,1.416,0.023600
184245,6887,2024-06-28 21:02:58+00:00,+==hi==+,ios,534,GB,2a02:c7c:5271:c300:4160:841c:4e78:256d,,12:51,The Strokes,Room On Fire,spotify:track:6Yu1OL8I0D4vjOzYdsXYGK,,,,fwdbtn,backbtn,True,True,False,1.719609e+09,False,6Yu1OL8I0D4vjOzYdsXYGK,2024,6,2024-06-28 21:02:58+00:00,2024-06-28 21:02:55+00:00,0 days 00:00:03,3.0,False,0.050000,0.534,0.008900
184246,6887,2024-06-28 21:02:58+00:00,+==hi==+,ios,1996,GB,2a02:c7c:5271:c300:4160:841c:4e78:256d,,Do 2 Me (feat. Anderson .Paak & SiR),KAYTRANADA,TIMELESS,spotify:track:5J75NvHDMTbJtLqym6CXuI,,,,fwdbtn,fwdbtn,True,True,False,1.719609e+09,False,5J75NvHDMTbJtLqym6CXuI,2024,6,2024-06-28 21:02:58+00:00,2024-06-28 21:02:58+00:00,0 days 00:00:00,0.0,False,0.000000,1.996,0.033267
184247,6887,2024-06-28 21:03:07+00:00,+==hi==+,ios,8382,GB,2a02:c7c:5271:c300:4160:841c:4e78:256d,,Jigsaw Falling Into Place,Radiohead,In Rainbows,spotify:track:0YJ9FWWHn9EfnN0lHwbzvV,,,,backbtn,fwdbtn,True,True,False,1.719609e+09,False,0YJ9FWWHn9EfnN0lHwbzvV,2024,6,2024-06-28 21:03:07+00:00,2024-06-28 21:02:58+00:00,0 days 00:00:09,9.0,False,0.150000,8.382,0.139700


In [68]:
streaming_data.head(2)

Unnamed: 0,ts,username,platform,ms_played,conn_country,ip_addr_decrypted,user_agent_decrypted,master_metadata_track_name,master_metadata_album_artist_name,master_metadata_album_album_name,spotify_track_uri,episode_name,episode_show_name,spotify_episode_uri,reason_start,reason_end,shuffle,skipped,offline,offline_timestamp,incognito_mode,y,m
0,2015-05-02T15:52:01Z,+==hi==+,Windows 7 (6.1.7601; x64; SP1; S),79980,GB,86.25.60.179,unknown,Lego House,Ed Sheeran,+,spotify:track:5ubHAQtKuFfiG4FXfLP804,,,,clickrow,endplay,False,True,False,0.0,False,2015,5
1,2015-05-02T15:52:35Z,+==hi==+,Windows 7 (6.1.7601; x64; SP1; S),35390,GB,86.25.60.179,unknown,"Lose Yourself - From ""8 Mile"" Soundtrack",Eminem,Curtain Call,spotify:track:7w9bgPAmPTtrkt2v16QWvQ,,,,clickrow,endplay,False,True,False,0.0,False,2015,5


#### <span style="color: #95a5a6">Spotify API Credentials</span>

In [170]:
client_id = 'e548589a0f7c43d8a546de7f459b70d9'
client_secret = '0bbfaca1e8e949959c5856cab53b97fe'
auth_url = 'https://accounts.spotify.com/api/token'
auth_response = requests.post(auth_url, {
    'grant_type': 'client_credentials',
    'client_id': client_id,
    'client_secret': client_secret,
})
auth_response_data = auth_response.json()
access_token = auth_response_data['access_token']
# define headers using access token
headers = {
    'Authorization': f'Bearer {access_token}'
}

In [179]:
auth_response

<Response [200]>

In [180]:
auth_response.content

b'{"access_token":"BQCK9Hcm-Yikyd4dUbL5YisZfU1e3g4sTtJhmWUiC4XFYwgAKZQ9TkqBWW-53vs4ctbJfCv9NpXH1W2qdWtb_M24MLPn87w7Dql_BtMUp2agtYqDuZw","token_type":"Bearer","expires_in":3600}'

In [207]:
# testing
test_id = '5ubHAQtKuFfiG4FXfLP804'
test_url = f"https://api.spotify.com/v1/audio_features/{test_id}"
test_response = requests.get(test_url, headers=headers)

In [208]:
test_response.content

b'{\n  "error": {\n    "status": 404,\n    "message": "Service not found"\n  }\n}'

#### <span style="color: #95a5a6">Get Track Audio Features</span>

In [188]:
# get distinct tracks
distinct_tracks = streaming_data.drop_duplicates(['track_name'])
distinct_track_ids = distinct_tracks['spotify_track_uri'].str.split(':').str[-1]
distinct_track_ids.dropna(inplace=True)

In [206]:
distinct_track_ids

0         5ubHAQtKuFfiG4FXfLP804
1         7w9bgPAmPTtrkt2v16QWvQ
2         2Dr76N76UX0xtZoLvwe3WY
3         1h45qrpHTHLlbCGSr8QQEA
4         3uBeTrCu8B31thODnDjcat
                   ...          
184089    7JDtr9BcPgqYyDla3G7DI2
184110    1HEfXDxLCuIAOvNkYMK9pC
184112    3WF3B7Egz8HxQ1lVVJMh7u
184126    2sITbbWIOeg2Lwp4WN2jqr
184241    3NbQIxqkq36cy5a8Ub9vZ0
Name: spotify_track_uri, Length: 19085, dtype: object

In [201]:
# define endopint
url = f"https://api.spotify.com/v1/audio-features?"
# loop over chunks of 100 track ids
for start in range(0, len(distinct_track_ids), 100):
    end = start+100
    print(start, start+100)
    chunk_ids = distinct_track_ids.iloc[start:5].tolist()
    # headers['ids'] = ','.join(chunk_ids)
    params = {'ids':','.join(chunk_ids)}
    # format url
    # url_formatted = url + 'ids=' + '5ubHAQtKuFfiG4FXfLP804%2C7w9bgPAmPTtrkt2v16QWvQ'
    # get artist data
    response = requests.get(url, params=params, headers=headers)
    json_content = response.json()
    break

0 100


In [202]:
params

{'ids': '5ubHAQtKuFfiG4FXfLP804,7w9bgPAmPTtrkt2v16QWvQ,2Dr76N76UX0xtZoLvwe3WY,1h45qrpHTHLlbCGSr8QQEA,3uBeTrCu8B31thODnDjcat'}

In [203]:
response.url

'https://api.spotify.com/v1/audio-features?ids=5ubHAQtKuFfiG4FXfLP804%2C7w9bgPAmPTtrkt2v16QWvQ%2C2Dr76N76UX0xtZoLvwe3WY%2C1h45qrpHTHLlbCGSr8QQEA%2C3uBeTrCu8B31thODnDjcat'

In [204]:
response

<Response [403]>

In [205]:
json_content

{'error': {'status': 403}}

In [214]:
features = pd.read_csv('data/tracks_features.csv')
features.shape

(1204025, 24)

In [223]:
# get distinct tracks
distinct_track_ids = streaming_data['track_id'].unique().tolist()

In [225]:
# check viability
not_found = set(distinct_track_ids) - set(features['id'].tolist())
len(not_found)

16789

In [230]:
streaming_data[streaming_data['track_id'].isin(features['id'].tolist())]['master_metadata_track_name'].nunique()

2298

## <span style="color: #95a5a6">Output</span>

In [11]:
streaming_data.to_csv('output/streaming_history.csv', index=False)