In [None]:
# TODO: Artist Distribution w/Number of Unique Artists
# TODO: Top 50 artists
# TODO: Volume of songs by artists
# TODO: Genre Distribution w/Top 50 & bottom 50 
# TODO: Songs by Year
# TODO: Popularity distribution
# TODO: Histogram of song length
# TODO: Longest songs
# TODO: Shortest songs
# TODO: songs by musical features w/distribution
# TODO: songs by time signature, key, major vs minor
# TODO: odd time signature analysis
# TODO: single playlist drill through
# TODO: dimensionality reduction of features
# TODO: outlier detection
# TODO: recommendation based on what isn't an outlier
# TODO: songs close to outliers? 

In [None]:
# import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import spotipy
import os
from unidecode import unidecode
from spotipy.oauth2 import SpotifyOAuth
from dotenv import load_dotenv

In [None]:
# declare constants
# Multi-character field delimiter: used to join columns when the CSV files are
# written and passed as `sep` when they are read back with pandas.
SEP = '&&'

In [None]:
# user auth
# Load variables from a local .env file (if any) into the process environment
# so the credentials below can be read from os.environ.
load_dotenv()

# Spotify client authenticated through OAuth.
# os.environ[...] raises KeyError if either credential variable is not set.
sp = spotipy.Spotify(
    auth_manager=SpotifyOAuth(
        client_id = os.environ['SPOTIFY_CLIENT_ID'],
        client_secret = os.environ['SPOTIFY_CLIENT_SECRET'],
        redirect_uri='https://example.com/callback',
        # requested scopes: saved-track library and private playlists (read-only)
        scope='user-library-read, playlist-read-private'
    )
)

# Profile of the authenticated user.
current_user = sp.current_user()

# The user's id is used further down to keep only playlists this user owns.
user_id = current_user['id']

In [None]:
# Quick sanity check of the connection: list the first page of saved tracks
# (the 20 most recent) as "<position> <first artist>  -  <title>".
results = sp.current_user_saved_tracks()
for idx, item in enumerate(results['items']):
    track = item['track']
    first_artist = track['artists'][0]['name']
    title = track['name']
    print(idx, first_artist, " - ", title)

In [None]:
# Column order for the CSV files; the same lists drive both the header row and
# the per-row format template.
playlists_headers = [
    'idx',
    'uri',
    'id',
    'name',
    'owner',
    'public',
]
track_headers = [
    # position of the track within its playlist
    'idx',
    # core track attributes
    'id',
    'uri',
    'name',
    'popularity',
    'preview_url',
    'duration_ms',
    'explicit',
    # album attributes (prefixed with album_)
    'album_name',
    'album_type',
    'album_id',
    'album_uri',
    # artists, flattened to comma-separated strings
    'artists_names',
    'artists_ids',
]

In [None]:
# get playlists for user
# A single request returns at most 50 playlists, so follow the `next` links and
# merge every page into the first response. `playlists` keeps the shape of one
# paged result (same keys, `offset` of the first page) with `items` holding the
# playlists from all pages.
playlists = sp.current_user_playlists(limit=50)
page = playlists
while page and page['next']:
    page = sp.next(page)
    if page:
        playlists['items'].extend(page['items'])
# Everything has been merged, so there is no further page to fetch.
playlists['next'] = None

In [None]:
# serialize playlists to csv
# Only playlists owned by the authenticated user are written; playlists owned
# by other accounts are skipped.
os.makedirs('./data', exist_ok=True)  # a fresh checkout may not have ./data yet

# The row template is identical for every playlist, so build it once.
s = SEP.join('{' + h + '}' for h in playlists_headers)

with open('./data/playlists.csv', 'w', encoding='utf-8') as pf:
    pf.write(SEP.join(playlists_headers) + '\n')
    owned_playlists = [x for x in playlists['items'] if x['owner']['id'] == user_id]
    for i, playlist in enumerate(owned_playlists):
        data = {
            'idx': i + 1 + playlists['offset'],
            'name': unidecode(playlist['name']),
            'owner': playlist['owner']['id'],
        }
        # Remaining columns are copied straight from the API object.
        data.update({h: playlist[h] for h in playlists_headers if h not in data})
        # A value containing the delimiter or a line break would shift or split
        # the row when the file is parsed back, so neutralise both.
        data = {k: ' '.join(str(v).replace(SEP, ' ').splitlines()) for k, v in data.items()}
        pf.write(s.format(**data) + '\n')

In [None]:
# Read the playlists file back into a DataFrame indexed by the `idx` column.
# The python parsing engine is requested because the delimiter is longer than
# one character.
playlists_df = pd.read_csv('./data/playlists.csv', sep=SEP, engine='python', index_col='idx')
playlists_df.head()

In [None]:
# serialize tracks from playlists to csv
# Request only the fields that are written below. The album sub-selection is
# nested inside items.track(...); the earlier version concatenated two filter
# fragments without a separating comma, which produced a malformed expression.
fields = (
    'href,limit,offset,next,previous,total,'
    'items.track(artists,explicit,href,id,name,popularity,preview_url,uri,duration_ms,'
    'album(album_type,total_tracks,href,id,name,release_date,type,uri))'
)

os.makedirs('./data', exist_ok=True)  # a fresh checkout may not have ./data yet

# The row template is identical for every track, so build it once.
s = SEP.join('{' + h + '}' for h in track_headers)

with open('./data/tracks.csv', 'w', encoding='utf-8') as tf:
    tf.write(SEP.join(track_headers) + '\n')
    for pid in playlists_df['id']:
        print(f'calling get items for playlist {pid}')
        p_items = sp.playlist_items(playlist_id=pid, fields=fields)
        # Walk every page of the playlist; stop on an empty or missing page.
        while p_items and p_items['items']:
            for i, item in enumerate(p_items['items']):
                track = item.get('track')
                # Entries can have no track object, or be a non-track item
                # without an album (presumably removed tracks / podcast
                # episodes — TODO confirm); those cannot fill the columns.
                if not track or not track.get('album'):
                    continue
                album = track['album']
                artists = track['artists']
                data = {
                    # position within the playlist, counted over all entries
                    'idx': 1 + i + p_items['offset'],
                    'album_type': album['album_type'],
                    'album_name': unidecode(album['name'] or ''),
                    'artists_names': ', '.join(unidecode(a['name'] or '') for a in artists),
                    # artist ids may be null (presumably local files — TODO confirm)
                    'artists_ids': ','.join(a['id'] or '' for a in artists),
                    'name': unidecode(track['name'] or ''),
                }
                # Remaining columns: album_* come from the album object (prefix
                # stripped), everything else from the track object.
                for h in track_headers:
                    if h in data:
                        continue
                    if h.startswith('album_'):
                        data[h] = album[h.removeprefix('album_')]
                    else:
                        data[h] = track[h]
                # A value containing the delimiter or a line break would shift
                # or split the row when the file is parsed back.
                data = {k: ' '.join(str(v).replace(SEP, ' ').splitlines()) for k, v in data.items()}
                tf.write(s.format(**data) + '\n')
            p_items = sp.next(p_items) if p_items['next'] else None


In [None]:
# Read the tracks file and keep one row per track id: the same track can appear
# in several playlists, and only its first occurrence is retained.
tracks_df = pd.read_csv('./data/tracks.csv', sep=SEP, engine='python').drop_duplicates(subset=['id'])
tracks_df.head()