In [11]:
import pandas as pd
import numpy as np
import re
import json
import os
import requests
import random
from matplotlib import pyplot as plt

## <span style="color: #95a5a6">Read Data</span>

In [12]:
# Gather the records from every JSON file in ./data into one flat list.
# (Presumably these are Spotify streaming-history exports - confirm against the data source.)
#  - endswith('.json') rather than a substring test, so names such as
#    'history.json.bak' are not picked up by accident.
#  - sorted() because os.listdir returns entries in arbitrary order; sorting
#    keeps the row order of the resulting frame reproducible between runs.
streaming_files = sorted(f for f in os.listdir('data') if f.endswith('.json'))
streaming_data_list = []
for file_name in streaming_files:
    with open(os.path.join('data', file_name), encoding="utf8") as f:
        # assumes each file holds a JSON array of records - TODO confirm
        json_data = json.load(f)
    streaming_data_list.extend(json_data)

In [13]:
# One row per loaded record; columns come from the records' keys.
streaming_data = pd.DataFrame(data=streaming_data_list)
# Bare last expression so the notebook echoes (n_rows, n_columns).
streaming_data.shape

(17241, 21)

In [14]:
# The track id is whatever follows the last ':' in the URI, so a single
# right-hand split is enough.
uri_parts = streaming_data['spotify_track_uri'].str.rsplit(':', n=1)
streaming_data['track_id'] = uri_parts.str[-1]

In [15]:
# Show every column when a frame is displayed (None removes the column limit).
pd.options.display.max_columns = None

## <span style="color: #95a5a6">Prep</span>

In [16]:
# Parse the raw `ts` values into pandas datetimes.
streaming_data['ts'] = pd.to_datetime(streaming_data['ts'])

# Year / month helper columns, used below to locate the partial months.
ts_parts = streaming_data['ts'].dt
streaming_data['y'] = ts_parts.year
streaming_data['m'] = ts_parts.month

# The first (2015-03) and last (2024-07) months are treated as incomplete
# and removed from the frame.
in_start_month = streaming_data['y'].eq(2015) & streaming_data['m'].eq(3)
in_end_month = streaming_data['y'].eq(2024) & streaming_data['m'].eq(7)
incomplete_start_month = streaming_data.loc[in_start_month].index
incomplete_end_month = streaming_data.loc[in_end_month].index

# Drop both sets of rows, then renumber the index from 0.
streaming_data = (
    streaming_data
    .drop(index=incomplete_start_month.append(incomplete_end_month))
    .reset_index(drop=True)
)

In [17]:
# Derive play time in seconds (ms / 1000) and in minutes (seconds / 60).
streaming_data = streaming_data.assign(
    seconds_played=lambda frame: frame['ms_played'] / 1_000,
    minutes_played=lambda frame: frame['seconds_played'] / 60,
)

In [18]:
# Replace the long `master_metadata_*` column names with short ones.
col_map = dict(
    master_metadata_track_name='track_name',
    master_metadata_album_artist_name='artist_name',
    master_metadata_album_album_name='album_name',
)
streaming_data = streaming_data.rename(columns=col_map)

## <span style="color: #95a5a6">Output</span>

In [11]:
# Write the prepared frame to CSV without the index column.
# Create the target directory first: to_csv does not create missing parent
# directories, so a fresh checkout without output/ would otherwise fail here.
os.makedirs('output', exist_ok=True)
streaming_data.to_csv('output/streaming_history.csv', index=False)