## Convert JSON Objects into Data Frames

The data was collected by CURLing the requests in my own terminal using the [Spotify Web API](https://developer.spotify.com/documentation/web-api/reference/):

### Read JSON Files

In [27]:
import json

In [40]:
time_range = ['short_term', 'medium_term', 'long_term']

##### Tracks

In [42]:
track_data_json = {}
for term in time_range:
    with open(f"data/top_tracks_{term}.json") as json_file:
        data = json.load(json_file)
    key = f'{term}'
    track_data_json[key] = data

##### Artists

In [48]:
artists_data_json = {}
for term in time_range:
    with open(f"data/top_artists_{term}.json") as json_file:
        data = json.load(json_file)
    key = f'{term}'
    artists_data_json[key] = data

### Organize Artists and Tracks Data

Turn the dictionaries into csv and pandas dataframe

In [53]:
for term in time_range:
    data = artists_data_json[term]
    df = pd.DataFrame(data['items'])
    df.to_csv(f'data/top_artists_{term}.csv')

In [57]:
for term in time_range:
    data = track_data_json[term]
    df = pd.DataFrame(data['items'])
    df.to_csv(f'data/top_tracks_{term}.csv')

In [58]:
top_artists_short_term_df = pd.read_csv('data/top_artists_short_term.csv')
top_artists_medium_term_df = pd.read_csv('data/top_artists_medium_term.csv')
top_artists_long_term_df = pd.read_csv('data/top_artists_long_term.csv')

In [59]:
top_tracks_short_term_df = pd.read_csv('data/top_tracks_short_term.csv')
top_tracks_medium_term_df = pd.read_csv('data/top_tracks_medium_term.csv')
top_tracks_long_term_df = pd.read_csv('data/top_tracks_long_term.csv')

### Keep Popularity and Genres for Artists

In [134]:
artist_features = ['genres', 'name', 'popularity', 'id']

In [135]:
top_artists_short_term_reduced = top_artists_short_term_df[artist_features]
top_artists_medium_term_reduced = top_artists_medium_term_df[artist_features]
top_artists_long_term_reduced = top_artists_long_term_df[artist_features]

In [138]:
top_artists_short_term_reduced.to_csv('data/artists_short_term_reduced.csv')
top_artists_medium_term_reduced.to_csv('data/artists_medium_term_reduced.csv')
top_artists_long_term_reduced.to_csv('data/artists_long_term_reduced.csv')

In [215]:
top_artists_short_term_reduced

Unnamed: 0,genres,name,popularity,id
0,"['contemporary jazz', 'contemporary post-bop',...",Brad Mehldau,55,2vI9KFm0fwSfPrpEgOeIbq
1,"['contemporary jazz', 'jazz', 'jazz drums', 's...",Ari Hoenig,39,1P6Llrp12ldpVbyC8gCHBz
2,"['contemporary jazz', 'contemporary post-bop',...",Kurt Rosenwinkel,41,253GMpCNwx1TJtASNAeDoP
3,"['contemporary jazz', 'straight-ahead jazz']",Dayna Stephens,16,3Y8rffZJZVJgNWMM6ZVGin
4,"['austrian orchestra', 'classical', 'classical...",Wiener Philharmoniker,70,003f4bk13c6Q3gAUXv7dGJ
5,"['contemporary jazz', 'jazz saxophone']",Mark Turner,31,36kfddkWcVc6XrzNN9BsTP
6,"['modern folk rock', 'modern rock', 'pop rock'...",Mumford & Sons,75,3gd8FJtBJtkRxdfbTu19U2
7,"['neo mellow', 'pop rock', 'singer-songwriter']",John Mayer,82,0hEurMDQu99nJRq8pTxO14
8,"['hip hop', 'pittsburgh rap', 'rap']",Mac Miller,87,4LLpKhyESsyAXpc4laK94U
9,[],Noam Wiesenberg,3,0IV9EI5sd2rlMEoAvDg70M


### Get Musical Features from Track IDs

We'll be using spotipy's library to do this

In [64]:
import spotipy
from config import get_spotipy_client

In [75]:
sp = get_spotipy_client()

In [82]:
short_term_audio_features = sp.audio_features(tracks=list(top_tracks_short_term_df['id']))
medium_term_audio_features = sp.audio_features(tracks=list(top_tracks_medium_term_df['id']))
long_term_audio_features = sp.audio_features(tracks=list(top_tracks_long_term_df['id']))

In [84]:
short_term_audio_features_df = pd.DataFrame(short_term_audio_features)
medium_term_audio_features_df = pd.DataFrame(medium_term_audio_features)
long_term_audio_features_df = pd.DataFrame(long_term_audio_features)

#### Add name and release_date column for audio features

In [200]:
import ast

In [88]:
short_term_audio_features_df['name'] = top_tracks_short_term_df['name']
medium_term_audio_features_df['name'] = top_tracks_medium_term_df['name']
long_term_audio_features_df['name'] = top_tracks_long_term_df['name']

In [196]:
release_dates = []
for i in top_tracks_short_term_df['album']:
    date = ast.literal_eval(i)['release_date']
    release_dates.append(date) 
short_term_audio_features_df['release_date'] = release_dates

In [201]:
release_dates = []
for i in top_tracks_medium_term_df['album']:
    date = ast.literal_eval(i)['release_date']
    release_dates.append(date) 
medium_term_audio_features_df['release_date'] = release_dates

In [202]:
release_dates = []
for i in top_tracks_long_term_df['album']:
    date = ast.literal_eval(i)['release_date']
    release_dates.append(date) 
long_term_audio_features_df['release_date'] = release_dates

In [203]:
short_term_audio_features_df.to_csv('data/audio_features_short.csv')
medium_term_audio_features_df.to_csv('data/audio_features_medium.csv')
long_term_audio_features_df.to_csv('data/audio_features_long.csv')

#### All Musical Features
Remove duplicates

In [204]:
all_audio_features_df = pd.concat([short_term_audio_features_df, medium_term_audio_features_df, long_term_audio_features_df])

In [205]:
all_audio_features_df = all_audio_features_df.drop_duplicates(subset='name')

#### Get Averages of Features

Do this for each time ranges

In [206]:
audio_features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'tempo', 'liveness', 'valence']

In [207]:
audio_features_averages_short = short_term_audio_features_df[audio_features].mean()
audio_features_averages_medium = medium_term_audio_features_df[audio_features].mean()
audio_features_averages_long = long_term_audio_features_df[audio_features].mean()
audio_features_averages_all = all_audio_features_df[audio_features].mean()

In [208]:
audio_features_averages_df = pd.DataFrame([dict(audio_features_averages_short), dict(audio_features_averages_medium), dict(audio_features_averages_long), dict(audio_features_averages_all)])
audio_features_averages_df['time_frame'] = ['short_term', 'medium_term', 'long_term', 'all']

In [209]:
audio_features_averages_df.to_csv('data/audio_features_averages.csv')

### Data So Far...

Now we have audio features for each track, the averages of those features over time and in total, track release dates, my top artists that I listen to, the artist's popularity index, and the genres of the artist's

### Some More Cleaning

#### Grouped Dot Viz
Need top genres (colour), artists (dots), separated by time range. This is for the d3.js visualizations

In [221]:
top_genres = ['jazz', 'rock', 'classical', 'pop', 'rap', 'hip hop', 'pop rock', 'r&b', 'electro', 'tibetan traditional']

In [245]:
current_genres = []
for i in top_artists_short_term_reduced['genres'].values:
    op = i.strip("[").strip("]").replace("'","").split(", ")
    genre = ""
    for q in op:
        if q in top_genres:
            genre = q
            break
    current_genres.append(genre)
top_artists_short_term_reduced['genre'] = current_genres

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_artists_short_term_reduced['genre'] = current_genres


In [247]:
current_genres = []
for i in top_artists_medium_term_reduced['genres'].values:
    op = i.strip("[").strip("]").replace("'","").split(", ")
    genre = ""
    for q in op:
        if q in top_genres:
            genre = q
            break
    current_genres.append(genre)
top_artists_medium_term_reduced['genre'] = current_genres

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_artists_medium_term_reduced['genre'] = current_genres


In [248]:
current_genres = []
for i in top_artists_long_term_reduced['genres'].values:
    op = i.strip("[").strip("]").replace("'","").split(", ")
    genre = ""
    for q in op:
        if q in top_genres:
            genre = q
            break
    current_genres.append(genre)
top_artists_long_term_reduced['genre'] = current_genres

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_artists_long_term_reduced['genre'] = current_genres


In [251]:
top_artists_short_term_reduced.to_csv('data/artists_short_term_reduced.csv')
top_artists_medium_term_reduced.to_csv('data/artists_medium_term_reduced.csv')
top_artists_long_term_reduced.to_csv('data/artists_long_term_reduced.csv')

### For Musical Features

In [268]:
df = []
for i, q in audio_features_averages_all.items():
    df.append([i,q])

In [270]:
features_all = pd.DataFrame(df, columns=['feature', 'percentage'])

In [273]:
features_all.to_csv('data/avg_all_musical_features_organized.csv',index=False)

In [277]:
keep = ['time_frame', 'valence', 'liveness', 'danceability','energy','speechiness','acousticness', 'instrumentalness']

In [278]:
audio_features_averages_df = audio_features_averages_df[keep]

In [282]:
audio_features_averages_df = audio_features_averages_df[audio_features_averages_df['time_frame'] != "all"]

In [284]:
audio_features_averages_df.to_csv('data/audio_features_over_time.csv',index=False)

### Genre's

In [287]:
artist_genre_data = pd.read_csv('data/artists_long_term_reduced_edit.csv')

In [289]:
artist_genre_data = artist_genre_data[['name', 'genre']]

In [291]:
artist_genre_data.to_csv('data/artist_genre_viz.csv',index=False)

In [294]:
artist_genre_data.genre.unique()

array(['jazz', 'rock', 'classical', 'hip hop', 'pop', 'tibetan',
       'electro'], dtype=object)

In [297]:
artist_genre_data.to_dict('records')

[{'name': 'Brad Mehldau', 'genre': 'jazz'},
 {'name': 'Shai Maestro', 'genre': 'jazz'},
 {'name': 'Walter Smith III', 'genre': 'jazz'},
 {'name': 'Aaron Parks', 'genre': 'jazz'},
 {'name': 'Queen', 'genre': 'rock'},
 {'name': 'Khatia Buniatishvili', 'genre': 'classical'},
 {'name': 'Drake', 'genre': 'hip hop'},
 {'name': 'Kendrick Scott Oracle', 'genre': 'jazz'},
 {'name': 'Kurt Rosenwinkel', 'genre': 'jazz'},
 {'name': 'Daniel Caesar', 'genre': 'pop'},
 {'name': 'Ambrose Akinmusire', 'genre': 'jazz'},
 {'name': 'Billie Eilish', 'genre': 'pop'},
 {'name': 'Ben Wendel', 'genre': 'jazz'},
 {'name': 'Brad Mehldau Trio', 'genre': 'jazz'},
 {'name': 'Chet Baker', 'genre': 'jazz'},
 {'name': 'Kanye West', 'genre': 'hip hop'},
 {'name': 'Amine', 'genre': 'hip hop'},
 {'name': 'Berliner Philharmoniker', 'genre': 'classical'},
 {'name': 'Jacob Mann', 'genre': 'jazz'},
 {'name': 'Omar Apollo', 'genre': 'pop'},
 {'name': 'Tibetan Institute Of Performing Arts', 'genre': 'tibetan'},
 {'name': 'Tigr