In [1]:
import spotipy, json, os
from spotipy.oauth2 import SpotifyOAuth
import pandas as pd
import numpy as np
import time

from tqdm import tqdm

In [4]:
# Spotify API credentials are read from the environment instead of being
# written into the notebook: notebooks get shared and committed, and any secret
# typed into a cell leaks with them. Export SPOTIPY_CLIENT_ID and
# SPOTIPY_CLIENT_SECRET before starting Jupyter.
# NOTE(review): the client id/secret that used to be hard-coded in this cell
# should be treated as compromised and rotated.
missing_credentials = [
    name for name in ('SPOTIPY_CLIENT_ID', 'SPOTIPY_CLIENT_SECRET')
    if not os.environ.get(name)
]
if missing_credentials:
    raise RuntimeError(
        "Missing Spotify credentials: set "
        + ", ".join(missing_credentials)
        + " in the environment before running this notebook."
    )

# The redirect URI is not a secret; keep the previous value as the default but
# let an externally configured one take precedence.
os.environ.setdefault('SPOTIPY_REDIRECT_URI', 'http://localhost:8008/callback')

scope = "user-library-read"

# No credentials are passed explicitly; SpotifyOAuth is expected to pick up the
# SPOTIPY_* variables from the environment.
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(scope=scope))

# Path prefix used when reading/writing the intermediate .h5 files.
base = 'dataframes'

### First loop got 198800 records taking 1225 seconds 
### Second loop got 920500 records taking 6194 seconds

In [30]:
start = time.time()

# Audio-feature fields copied from each Spotify response into the frame.
columns = ["danceability", "energy", "key", "loudness", "mode", "speechiness",
        "acousticness", "instrumentalness", "liveness", "valence", "tempo",
        "type", "id", "uri", "track_href", "analysis_url", "duration_ms",
        "time_signature"]

# NOTE(review): base + 'poopoo.h5' has no path separator, so with
# base = 'dataframes' this reads 'dataframespoopoo.h5' rather than a file inside
# the 'dataframes' folder used for the artist files -- confirm which is intended.
tracks_df = pd.read_hdf(base + 'poopoo.h5', 'tracks')
tracks_df = tracks_df.rename({0:'track_uri'}, axis=1)

# Work on a copy so the raw frame stays untouched; feature columns start out as
# empty strings and are filled batch by batch below.
tracks_copy = tracks_df.copy()
for col in columns:
    tracks_copy[col] = ""

batch_size = 100  # URIs sent per sp.audio_features request
# The cell previously stopped after the first batch through a bare `break`.
# That limit is kept, but made explicit: set max_batches to None to fetch the
# features for every track (one API request per batch).
max_batches = 1
index = 0  # row offset to start (or resume) from

uri_pos = tracks_copy.columns.get_loc('track_uri')
for batch_no, i in enumerate(range(index, len(tracks_copy), batch_size)):
    if max_batches is not None and batch_no >= max_batches:
        break
    batch_uris = tracks_copy.iloc[i:i + batch_size, uri_pos].tolist()
    features = sp.audio_features(batch_uris)
    for col in columns:
        # Entries that come back as None get a " " placeholder, which the
        # later numeric conversion (errors='coerce') turns into NaN.
        values = [" " if feat is None else feat[col] for feat in features]
        # Single positional write through .iloc. The previous chained form
        # `tracks_copy[col][i:i+100] = ...` assigns into a temporary Series:
        # it raises SettingWithCopyWarning and does not update the frame at
        # all when pandas Copy-on-Write is active.
        tracks_copy.iloc[i:i + batch_size, tracks_copy.columns.get_loc(col)] = values
print(f"time taken {time.time()-start:.2f} seconds")

time taken 0.39 seconds


In [31]:
# Summarise column dtypes and non-null counts of the feature-augmented frame.
tracks_copy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 170089 entries, 0 to 170088
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   track_uri         170089 non-null  object
 1   danceability      170089 non-null  object
 2   energy            170089 non-null  object
 3   key               170089 non-null  object
 4   loudness          170089 non-null  object
 5   mode              170089 non-null  object
 6   speechiness       170089 non-null  object
 7   acousticness      170089 non-null  object
 8   instrumentalness  170089 non-null  object
 9   liveness          170089 non-null  object
 10  valence           170089 non-null  object
 11  tempo             170089 non-null  object
 12  type              170089 non-null  object
 13  id                170089 non-null  object
 14  uri               170089 non-null  object
 15  track_href        170089 non-null  object
 16  analysis_url      170089 non-null  obj

### Convert the features to np.float64

In [26]:
# Columns that hold numeric audio features; the remaining columns stay as text.
float_features = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'duration_ms', 'time_signature',
]
# errors='coerce' maps anything that cannot be parsed as a number to NaN.
for feature_name in float_features:
    numeric_values = pd.to_numeric(tracks_copy[feature_name], errors='coerce')
    tracks_copy[feature_name] = numeric_values

In [28]:
# Save the feature-augmented dataframe to HDF5 under the 'tracks' key.
# NOTE(review): base + 'tracks.h5' has no path separator, unlike the
# base + '/...' paths used for the artist files -- confirm the target location.

tracks_copy.to_hdf(base + 'tracks.h5', 'tracks', nan_rep='null')

## Get artist information

In [5]:
def make_artist_df(path):
    """Collect the artist URI of every playlist track and store them as HDF5.

    Reads each ``*.json`` file directly inside ``path``, walks
    ``playlists -> tracks`` and gathers each track's ``artist_uri`` (one row
    per track occurrence; duplicates are kept). The rows are written to
    ``base + '/artist_tracks.h5'`` under the key ``'artist_tracks'``.

    Parameters
    ----------
    path : str
        Directory containing the JSON files to scan.

    Returns
    -------
    None
        The table is persisted to disk; nothing is returned.
    """
    artist_uris = []
    for file in tqdm(os.listdir(path)):
        if not file.endswith(".json"):
            continue
        # Decode explicitly as UTF-8 (the encoding JSON interchange uses per
        # RFC 8259) instead of relying on the platform's default encoding.
        with open(os.path.join(path, file), encoding='utf-8') as f:
            js_slice = json.load(f)
        # The file is fully parsed at this point, so it can be closed before
        # walking the playlists.
        for playlist in js_slice['playlists']:
            artist_uris.extend(track['artist_uri'] for track in playlist['tracks'])
    artist_tracks_df = pd.DataFrame({'artist_uri': artist_uris})
    artist_tracks_df.to_hdf(base + '/artist_tracks.h5', 'artist_tracks')
    print('Stored files as h5')

# Run the extraction over the JSON files in the 'raw_data' directory.
make_artist_df('raw_data');

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:04<00:00,  2.68it/s]


Stored files as h5


In [13]:
# Reload the stored artist table and keep each artist URI once.
artist_tracks_df = pd.read_hdf(base + '/artist_tracks.h5', key='artist_tracks')
artist_uri_column = artist_tracks_df['artist_uri']
unique_artists = artist_uri_column.unique()

In [66]:
start = time.time()
genres = list()

artist_batch_size = 50  # artist URIs sent per sp.artists request

for i in tqdm(range(0, len(unique_artists), artist_batch_size)):
    results = sp.artists(unique_artists[i:i+artist_batch_size])
    for j in results['artists']:
        # NOTE(review): assumes an unresolved artist can come back as None
        # (the audio-features responses are guarded the same way); skip it
        # rather than crash on j['genres'].
        if j is None:
            continue
        # Genres are flattened into one comma-separated string per artist.
        all_gen = ",".join(j['genres'])
        genres.append([j['uri'], j['name'], all_gen])

genres_df = pd.DataFrame(genres, columns=['artist_uri', 'name', 'genres'])
genres_df.to_hdf(base + '/genres.h5', 'genres')

print(f"time taken {time.time()-start:.2f} seconds")

100%|████████████████████████████████████████████████████████████████████████████████| 718/718 [01:10<00:00, 10.22it/s]

time taken 70.34 seconds





In [67]:
# Preview the first rows of the artist/genre table.
genres_df.head()

Unnamed: 0,artist_uri,name,genres
0,spotify:artist:2wIVse2owClT7go1WT98tk,Missy Elliott,"dance pop,hip hop,hip pop,pop,pop rap,r&b,rap,..."
1,spotify:artist:26dSoYclwsYLMAKD3tpOr4,Britney Spears,"dance pop,pop,pop rap,post-teen pop"
2,spotify:artist:6vWDO969PvNqNYHIOW5v0m,Beyoncé,"dance pop,pop,post-teen pop,r&b"
3,spotify:artist:31TPClRtHm23RisEBtV3X7,Justin Timberlake,"dance pop,pop"
4,spotify:artist:5EvFsr3kj42KNv97ZEnqij,Shaggy,"dance pop,pop rap,reggae fusion"
