In [2]:
#Install kaggle and spotipy apis to python 
!pip install spotipy
!pip install kaggle

Collecting spotipy
  Downloading spotipy-2.18.0-py3-none-any.whl (26 kB)
Collecting requests>=2.25.0
  Downloading requests-2.25.1-py2.py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 5.7 MB/s eta 0:00:01
[?25hCollecting urllib3>=1.26.0
  Downloading urllib3-1.26.4-py2.py3-none-any.whl (153 kB)
[K     |████████████████████████████████| 153 kB 6.0 MB/s eta 0:00:01
Installing collected packages: urllib3, requests, spotipy
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.25.11
    Uninstalling urllib3-1.25.11:
      Successfully uninstalled urllib3-1.25.11
  Attempting uninstall: requests
    Found existing installation: requests 2.24.0
    Uninstalling requests-2.24.0:
      Successfully uninstalled requests-2.24.0
Successfully installed requests-2.25.1 spotipy-2.18.0 urllib3-1.26.4
Collecting kaggle
  Downloading kaggle-1.5.12.tar.gz (58 kB)
[K     |████████████████████████████████| 58 kB 3.4 MB/s eta 0:00:011
Collecting python-slugi

Before importing kaggle, the kaggle.json api token file is needed. 
To obtain this you need to:
1. Sign into kaggle
2. Navidate to your account page
3. Scroll to the api section of the page
4. Select 'Create New API Token' and save the file

Once you have the kaggle.json file, it will need to be stored in the file location similar to 
/User/username/.kaggle/

To identify the exact location at which the json file is needed you can run the import kaggle command and the location will be displayed in the error message.

In [6]:
import kaggle
import zipfile
import pandas as pd
import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
import math
import numpy as np

In [7]:
#Initialize and authenticate the kaggle api  
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()

In [8]:
#downloading a standalone kaggle dataset
api.dataset_download_file('dhruvildave/spotify-charts',
                          file_name='charts.csv',
                          path='./')

True

In [9]:
#Unzip the downloaded dataset 
with zipfile.ZipFile('charts.csv.zip', 'r') as zipref:
    zipref.extractall('./')

In [55]:
#read top 200 charts data into python
file_name='charts.csv'
top_tracks=pd.read_csv(file_name)

In [56]:
top_tracks.head()

Unnamed: 0,Position,Track Name,Artist,Streams,URL,Date,code,name
0,1,Reggaetón Lento (Bailemos),CNCO,6784,https://open.spotify.com/track/3AEZUABDXNtecAO...,2017-01-01,bo,Bolivia
1,2,Otra vez (feat. J Balvin),Zion & Lennox,5748,https://open.spotify.com/track/3QwBODjSEzelZyV...,2017-01-01,bo,Bolivia
2,3,Chantaje (feat. Maluma),Shakira,5506,https://open.spotify.com/track/6mICuAdrwEjh6Y6...,2017-01-01,bo,Bolivia
3,4,Vente Pa' Ca (feat. Maluma),Ricky Martin,4804,https://open.spotify.com/track/7DM4BPaS7uofFul...,2017-01-01,bo,Bolivia
4,5,Traicionera,Sebastian Yatra,4780,https://open.spotify.com/track/5J1c3M4EldCfNxX...,2017-01-01,bo,Bolivia


In [57]:
top_tracks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9928033 entries, 0 to 9928032
Data columns (total 8 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   Position    int64 
 1   Track Name  object
 2   Artist      object
 3   Streams     int64 
 4   URL         object
 5   Date        object
 6   code        object
 7   name        object
dtypes: int64(2), object(6)
memory usage: 606.0+ MB


In [341]:
# check how many rows are missing song url (URL includes song id which is used later to find song features 
# from spotify api)
top_tracks['URL'].isnull().sum()

17

In [346]:
# check all columns for null values 
top_tracks_no_na.isnull().sum()

Position         0
Track Name    1548
Artist        1532
Streams          0
URL              0
Date             0
code             0
name             0
id               0
dtype: int64

In [345]:
#Remove all rows without URLs 
top_tracks_no_na = top_tracks.dropna(subset=['URL']) 

In [347]:
#Add new column with song ids
song_ids = []
for i in range(0,len(top_tracks["URL"])):
    if pd.isnull(top_tracks["URL"][i]):
        song_ids.append('nan')
    else:
        id = top_tracks["URL"][i].split("https://open.spotify.com/track/")[1]
        song_ids.append(id)
top_tracks['id'] = song_ids

In [348]:
top_tracks.head()

Unnamed: 0,Position,Track Name,Artist,Streams,URL,Date,code,name,id
0,1,Reggaetón Lento (Bailemos),CNCO,6784,https://open.spotify.com/track/3AEZUABDXNtecAO...,2017-01-01,bo,Bolivia,3AEZUABDXNtecAOSC1qTfo
1,2,Otra vez (feat. J Balvin),Zion & Lennox,5748,https://open.spotify.com/track/3QwBODjSEzelZyV...,2017-01-01,bo,Bolivia,3QwBODjSEzelZyVjxPOHdq
2,3,Chantaje (feat. Maluma),Shakira,5506,https://open.spotify.com/track/6mICuAdrwEjh6Y6...,2017-01-01,bo,Bolivia,6mICuAdrwEjh6Y6lroV2Kg
3,4,Vente Pa' Ca (feat. Maluma),Ricky Martin,4804,https://open.spotify.com/track/7DM4BPaS7uofFul...,2017-01-01,bo,Bolivia,7DM4BPaS7uofFul3ywMe46
4,5,Traicionera,Sebastian Yatra,4780,https://open.spotify.com/track/5J1c3M4EldCfNxX...,2017-01-01,bo,Bolivia,5J1c3M4EldCfNxXwrwt8mT


In [351]:
top_tracks["URL"].nunique() 

88135

In [158]:
#place unique song urls into an array 
unique_urls = top_tracks["id"].unique()


In [352]:
len(unique_urls)

88136

In [21]:
#Define spotify client id and secret for application 
#cid = [insert client id from spotify]
#secret = [insert client secret from spotify]

#Create spotify client credential manager and authenticate 
client_credentials_manager = SpotifyClientCredentials(client_id = cid, client_secret = secret)
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

In [160]:
#create a new dataframe for music features from spotify
music_features = pd.DataFrame(columns=('danceability', 'energy','key','loudness','mode','speechiness',
                                       'acousticness','instrumentalness','liveness','valence','tempo','type',
                                      'id','uri','track_href','analysis_url','duration_ms','time_signature'))

In [180]:
#Loop through the unique song urls, get the song features from the spotify api and append to music features dataframe
for i in range(0,len(unique_urls)):
    if unique_urls[i] not in music_features['id']:
        try:
            song_features = pd.DataFrame(sp.audio_features(unique_urls[i]))
            music_features = music_features.append(song_features)
        except:
            print("Error occured")

Error occured
Error occured
Error occured
Error occured
Error occured
Error occured
Error occured
Error occured
Error occured
Error occured
Error occured
Error occured


Unnamed: 0,0,acousticness,analysis_url,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,track_href,type,uri,valence
0,,0.40000,https://api.spotify.com/v1/audio-analysis/3AEZ...,0.761,222560,0.838,3AEZUABDXNtecAOSC1qTfo,0,4,0.176,-3.073,0,0.0502,93.974,4,https://api.spotify.com/v1/tracks/3AEZUABDXNte...,audio_features,spotify:track:3AEZUABDXNtecAOSC1qTfo,0.710
0,,0.05590,https://api.spotify.com/v1/audio-analysis/3QwB...,0.832,209453,0.772,3QwBODjSEzelZyVjxPOHdq,0.000486,10,0.440,-5.429,1,0.1000,96.016,4,https://api.spotify.com/v1/tracks/3QwBODjSEzel...,audio_features,spotify:track:3QwBODjSEzelZyVjxPOHdq,0.704
0,,0.18700,https://api.spotify.com/v1/audio-analysis/6mIC...,0.852,195840,0.773,6mICuAdrwEjh6Y6lroV2Kg,3.05e-05,8,0.159,-2.921,0,0.0776,102.034,4,https://api.spotify.com/v1/tracks/6mICuAdrwEjh...,audio_features,spotify:track:6mICuAdrwEjh6Y6lroV2Kg,0.907
0,,0.00431,https://api.spotify.com/v1/audio-analysis/7DM4...,0.663,259196,0.920,7DM4BPaS7uofFul3ywMe46,1.69e-05,11,0.101,-4.070,0,0.2260,99.935,4,https://api.spotify.com/v1/tracks/7DM4BPaS7uof...,audio_features,spotify:track:7DM4BPaS7uofFul3ywMe46,0.533
0,,0.14200,https://api.spotify.com/v1/audio-analysis/5J1c...,0.776,228467,0.669,5J1c3M4EldCfNxXwrwt8mT,0,11,0.219,-4.933,1,0.0638,91.012,4,https://api.spotify.com/v1/tracks/5J1c3M4EldCf...,audio_features,spotify:track:5J1c3M4EldCfNxXwrwt8mT,0.661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,,0.00212,https://api.spotify.com/v1/audio-analysis/4VYK...,0.822,123878,0.631,4VYKA63F2fcQnsvfWUpzn9,0,1,0.204,-6.271,1,0.2540,175.099,4,https://api.spotify.com/v1/tracks/4VYKA63F2fcQ...,audio_features,spotify:track:4VYKA63F2fcQnsvfWUpzn9,0.561
0,,0.00596,https://api.spotify.com/v1/audio-analysis/6rUm...,0.702,226000,0.580,6rUmUJ2drFemHTH73bFAKQ,0,1,0.125,-6.819,1,0.0563,145.926,4,https://api.spotify.com/v1/tracks/6rUmUJ2drFem...,audio_features,spotify:track:6rUmUJ2drFemHTH73bFAKQ,0.217
0,,0.50100,https://api.spotify.com/v1/audio-analysis/60Fa...,0.632,221946,0.606,60FasBAOZzjNCSoljbbASM,0.0129,9,0.107,-6.482,1,0.0281,108.023,4,https://api.spotify.com/v1/tracks/60FasBAOZzjN...,audio_features,spotify:track:60FasBAOZzjNCSoljbbASM,0.775
0,,0.96900,https://api.spotify.com/v1/audio-analysis/2NEz...,0.340,305786,0.446,2NEzxeMKW0mk5pr7tcrvs2,0.000454,0,0.128,-9.383,0,0.0479,177.284,4,https://api.spotify.com/v1/tracks/2NEzxeMKW0mk...,audio_features,spotify:track:2NEzxeMKW0mk5pr7tcrvs2,0.149


In [322]:
music_features.nunique() 

Unnamed: 0.1            1
acousticness         3599
danceability          889
duration_ms         49403
energy               1220
id                  88126
instrumentalness     5142
key                    12
liveness             1669
loudness            13366
mode                    2
speechiness          1473
tempo               37438
time_signature          5
type                    1
valence              1470
dtype: int64

In [315]:
#Drop unnecessary columns from music features dataframe 
music_features = music_features.drop(columns=['uri', 'track_href','analysis_url'])

In [300]:
# sorting by first name
music_features.sort_values("id", inplace = True)
  
# dropping ALL duplicte values
music_features.drop_duplicates(keep = 'first', inplace = True)

In [355]:
#write music features to file
music_features.to_csv("music_features.csv")

In [331]:
music_features.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,type,valence
160541,0.131,0.748,188491.0,0.627,000xQL6tZNLJzIrtIgxqSl,0.0,7.0,0.0852,-6.029,1.0,0.0639,120.963,4.0,audio_features,0.524
45268,0.684,0.509,187119.0,0.803,000xYdQfIZ4pDmBGzQalKU,0.000539,0.0,0.463,-6.743,1.0,0.04,166.018,4.0,audio_features,0.651
236479,0.105,0.772,167866.0,0.635,001b8t3bYPfnabpjpfG1Y4,0.0,9.0,0.117,-4.846,1.0,0.372,95.951,4.0,audio_features,0.541
218937,0.145,0.807,188395.0,0.552,001cKCt7jEOzXjqgkn9DQf,0.0,8.0,0.0999,-8.966,1.0,0.305,119.988,4.0,audio_features,0.287
193090,0.159,0.683,180933.0,0.676,003F0rm5lqxcmhvJPKgfaJ,0.0,1.0,0.0726,-6.688,0.0,0.147,98.992,4.0,audio_features,0.434


In [321]:
music_features.isnull().sum()


In [None]:
music_features = music_features.dropna() 

In [354]:
music_features.shape

(88126, 15)