In [None]:
import os
import json
import numpy as np
import pandas as pd

# Folder that holds the exported streaming-history JSON files.
DATA_DIR = 'MyData'

# Keep only the .json files. os.listdir returns names in arbitrary order, so
# sort them to make the row order of the resulting dataframe reproducible.
files = sorted(f for f in os.listdir(DATA_DIR) if f.endswith('.json'))

# Parse every file and accumulate its records into one flat list.
# NOTE(review): assumes each file's top-level JSON value is an array of
# record objects (extend() would otherwise add dict keys) -- confirm.
stream = []
for f in files:
    # JSON text is UTF-8; open() would otherwise fall back to the platform's
    # locale encoding and could fail or mangle non-ASCII track/artist names.
    with open(os.path.join(DATA_DIR, f), 'r', encoding='utf-8') as file:
        data = json.load(file)
    stream.extend(data)

# Build one dataframe with a row per record.
df = pd.DataFrame.from_records(stream)

In [84]:
# Show which fields are available in the loaded dataframe.
column_labels = df.columns
print(column_labels)

Index(['ts', 'username', 'platform', 'ms_played', 'conn_country',
       'ip_addr_decrypted', 'user_agent_decrypted',
       'master_metadata_track_name', 'master_metadata_album_artist_name',
       'master_metadata_album_album_name', 'spotify_track_uri', 'episode_name',
       'episode_show_name', 'spotify_episode_uri', 'reason_start',
       'reason_end', 'shuffle', 'skipped', 'offline', 'offline_timestamp',
       'incognito_mode'],
      dtype='object')


In [None]:
# Total listening time over all rows of the dataframe, in milliseconds.
total_ms_played = df['ms_played'].sum()

# Break the total down with integer divmod instead of repeated float
# division: each unit is derived once from the previous remainder, and the
# arithmetic stays exact regardless of magnitude.
total_seconds = int(total_ms_played) // 1000
total_minutes, seconds = divmod(total_seconds, 60)
total_hours, minutes = divmod(total_minutes, 60)
days, hours = divmod(total_hours, 24)

print('Total time listened: {} days, {} hours, {} minutes, {} seconds'.format(
    days, hours, minutes, seconds
))

In [None]:

# Per-(track, artist) summary: total milliseconds played and the number of
# non-null ms_played rows in each group, ordered from the largest total to the
# smallest. After sorting, the totals are replaced by "<h>h <m>m <s>s" strings.
def _format_play_time(ms):
    """Render a millisecond total as an 'Xh Ym Zs' string."""
    hours = int(ms / 3600000)
    minutes = int((ms % 3600000) / 60000)
    seconds = int((ms % 60000) / 1000)
    return f"{hours}h {minutes}m {seconds}s"


group_by_song = (
    df.groupby(['master_metadata_track_name', 'master_metadata_album_artist_name'])
    .agg(time_played=('ms_played', 'sum'), count=('ms_played', 'count'))
    .reset_index()
    .rename(columns={
        'master_metadata_track_name': 'song_name',
        'master_metadata_album_artist_name': 'artist_name',
    })
    .sort_values(by=['time_played'], ascending=False)
)

# Formatting happens only after the sort, because the strings no longer order
# numerically.
group_by_song['time_played'] = group_by_song['time_played'].apply(_format_play_time)
group_by_song.head(50)

In [None]:
# Keep only the rows of group_by_song whose artist_name is "Pink Floyd"
# (the existing row order is preserved) and preview the first 25 of them.
is_pink_floyd = group_by_song['artist_name'] == 'Pink Floyd'
top_pink_floyds = group_by_song.loc[is_pink_floyd]
top_pink_floyds.head(25)