# Load Spotify Data

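Create a Spotify Developer account and generate a Client ID and Client Secret. The notebook loads them from a local `.env` file via `load_dotenv()`; spotipy's `SpotifyClientCredentials()` then reads them from the environment (the `SPOTIPY_CLIENT_ID` and `SPOTIPY_CLIENT_SECRET` variables).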

For the first step, we have created a public playlist with random songs.

In [6]:
import pandas as pd
from dotenv import load_dotenv
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Load variables from a local .env file into the process environment so the
# Spotify credentials never have to be hardcoded in the notebook.
load_dotenv()

# No arguments are passed here, so the credentials are expected to come from the
# environment (spotipy looks for SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET).
credentials = SpotifyClientCredentials()
sp = spotipy.Spotify(auth_manager=credentials)

In [7]:
PLAYLIST_ID = "5Rh7ikX5dteMXfc8tmeBJy"
# Spotify's audio-features endpoint accepts at most 100 track IDs per request.
AUDIO_FEATURES_BATCH_SIZE = 100

# Step 1: collect every track of the playlist.
# `playlist_items` returns one page at a time, so keep following the "next"
# link; reading only the first page would silently truncate long playlists.
tracks = []
results = sp.playlist_items(PLAYLIST_ID)
while results:
    for item in results['items']:
        track = item.get('track')
        # Entries without a track object or without a Spotify ID (e.g. removed
        # tracks or local files) cannot be looked up, so they are skipped.
        if track and track.get('id'):
            tracks.append(track)
    results = sp.next(results)  # None once the last page has been consumed

# Step 2: fetch the audio features in batches (one request per batch instead of
# one request per track) and build one row per track.
data = []
for start in range(0, len(tracks), AUDIO_FEATURES_BATCH_SIZE):
    batch = tracks[start:start + AUDIO_FEATURES_BATCH_SIZE]
    batch_features = sp.audio_features([track["id"] for track in batch])

    # The features come back in the same order as the requested IDs.
    for track, track_features in zip(batch, batch_features):
        # Spotify returns None for items it has no audio analysis for.
        if track_features is None:
            continue

        data.append([
            track["id"],                        # string
            track["name"],                      # string
            track['artists'][0]['name'],        # string
            track['album']['name'],             # string
            track["popularity"],                # integer  0-100  % (current trendiness)
            track["duration_ms"],               # integer
            track_features["acousticness"],     # float    0-1    probability (whether the track uses instrumental rather than electric sounds)
            track_features["danceability"],     # float    0-1    probability (suitability for dancing)
            track_features["energy"],           # float    0-1    % (intensity / activity)
            track_features["instrumentalness"], # float    0-1    probability (whether the track contains no vocals)
            track_features["key"],              # integer  -1-11  (-1: none identified, 0: C, ...)
            track_features["liveness"],         # float    0-1    probability (whether the song is a live performance)
            track_features["loudness"],         # float           dB
            track_features["mode"],             # integer  0,1    (0: minor or 1: major)
            track_features["speechiness"],      # float    0-1    % (how many words are spoken rather than sung)
            track_features["tempo"],            # float           BPM
            track_features["time_signature"],   # integer  3-7    time signature (x/4, example: 3/4)
            track_features["valence"],          # float    0-1    spectrum (0: sad / angry, 1: happy)
            # TODO add explicit
            # TODO add release year
        ])

In [8]:
# Column names, listed in the same order as the values of each row in `data`.
column_names = [
    'id', 'name', 'artist', 'album', 'popularity', 'duration',
    'acousticness', 'danceability', 'energy', 'instrumentalness',
    'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo',
    'time_signature', 'valence',
]
df = pd.DataFrame(data, columns=column_names)

# Columns left out of the feature set used to compare songs.
dropped_columns = [
    'artist',      # irrelevant
    'album',       # irrelevant
    'popularity',  # popularity isn't suitable to measure similarity of songs
    'duration',    # not relevant to measure song similarity
    'liveness',    # not relevant to measure song similarity
    # TODO loudness?
    # TODO mode?
    # TODO do we need tempo AND time_signature?
]
# `drop` returns a new frame; `df` itself keeps all columns.
updated = df.drop(columns=dropped_columns)

print(updated.columns)

Index(['id', 'name', 'acousticness', 'danceability', 'energy',
       'instrumentalness', 'key', 'loudness', 'mode', 'speechiness', 'tempo',
       'time_signature', 'valence'],
      dtype='object')


In [10]:
# Preview the first five rows of the full (un-dropped) frame.
df.head(n=5)

Unnamed: 0,id,name,artist,album,popularity,duration,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,1uvyZBs4IZYRebHIB1747m,Purple Rain,Prince,Purple Rain,72,521866,0.0354,0.373,0.452,0.00227,10,0.689,-10.417,1,0.0321,113.212,4,0.179
1,3S2R0EVwBSAVMd5UMgKTL0,Thriller,Michael Jackson,Thriller 25 Super Deluxe Edition,73,357266,0.0855,0.773,0.859,0.000187,11,0.914,-4.913,1,0.0747,118.459,4,0.813
2,2374M0fQpWi3dLnB54qaLX,Africa,TOTO,Toto IV,84,295893,0.257,0.671,0.373,8e-05,9,0.0481,-18.064,1,0.0323,92.718,4,0.732
3,4RvWPyQ5RL0ao9LPZeSouE,Everybody Wants To Rule The World,Tears For Fears,Songs From The Big Chair (Super Deluxe Edition),85,251488,0.347,0.645,0.795,0.00389,7,0.104,-12.095,1,0.0527,112.067,4,0.535
4,2MuWTIM3b0YEAskbeeFE1i,Master Of Puppets,Metallica,Master Of Puppets (Remastered),75,515386,0.000647,0.543,0.836,0.431,4,0.153,-9.11,0,0.0353,105.173,4,0.56


As we can see, Spotify already delivers good-quality data, so we don't need to perform a separate data assessment.