# Following is the code for the data retrieval. 

In [29]:
#Initialization for the Spotify Credentials. It is required that you already have Spotipy and Pandas libraries installed.

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

#I have changed the original OAuth2 file with my Client ID and Client Secret. But instead, you can initialize them here as well.
#cid = "Client ID"
#csecret = "CLient Secret"

client_credentials_manager = SpotifyClientCredentials()
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

<h3> This next step would take a CSV file with Track IDs as an input. The process to get the Track IDs is explained in the report.

In [30]:
import pandas as pd

#Loading the data file in a dataframe.

data = pd.read_csv('Input_File_Name.csv', delimiter=',' ) #This takes CSV file with Track IDs as an input

<h3> Extracting the useful data columns from the original file.

In [31]:
track_id = data['URI']
track_name = data['Track Name']
artist_name = data ['Artist Name']

In [32]:
df=pd.DataFrame({'artist_name':artist_name,'track_name':track_name,'track_id':track_id})

print(df.shape) #To check the dataframe matrix
df.head()       #df.head is used to show only the initial 5 rows instead of outputting all the rows.

(894, 3)


Unnamed: 0,artist_name,track_id,track_name
0,"Levianth, Harley Bird",6WEKJTNRDOCLI3kdn06flO,Trapped
1,"Emdi, Veronica Bravo",1uu8eSAIRT7vaa66q46Ijr,Obsessed
2,"WHIPPED CREAM, Lil Xan",3INJvYa4XyBAm62TPNwSa8,Told Ya
3,Elko,6oS0nPho2XvKJfIMwV0ih0,Drunk and Out Of Town
4,Elko,69qTejtAq5P3Jd17571YEv,YOU’RE FEELING THE SAME THINGS


In [33]:
df.info()

#This step is done to check if there are null items or not.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 894 entries, 0 to 893
Data columns (total 3 columns):
artist_name    894 non-null object
track_id       894 non-null object
track_name     894 non-null object
dtypes: object(3)
memory usage: 21.0+ KB


<h3> Since the Spotify's Web API lets you put only 100 Track IDs as an input at once (discussed in detail in the report), the following code will iterate in  batchsize of 100.

In [34]:
rows = []
batchsize = 100
NoneC = 0

for i in range(0,len(df['track_id']),batchsize):
    batch = df['track_id'][i:i+batchsize]
    feature_results = sp.audio_features(batch)
    for i, t in enumerate(feature_results):
        if t == None:
            NoneC = NoneC + 1
        else:
            rows.append(t)
            
print('Number of tracks where no audio features were available:',NoneC)


('Number of tracks where no audio features were available:', 0)


In [35]:
print('number of elements in the Track IDs list:', len(rows))

('number of elements in the Track IDs list:', 894)


In [36]:
df_audio_features = pd.DataFrame.from_dict(rows,orient='columns')
print("Shape of the dataset:", df_audio_features.shape)
df_audio_features.head()

('Shape of the dataset:', (894, 18))


Unnamed: 0,acousticness,analysis_url,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,track_href,type,uri,valence
0,0.22,https://api.spotify.com/v1/audio-analysis/6WEK...,0.688,171904,0.657,6WEKJTNRDOCLI3kdn06flO,4e-05,0,0.326,-4.5,1,0.0323,115.966,4,https://api.spotify.com/v1/tracks/6WEKJTNRDOCL...,audio_features,spotify:track:6WEKJTNRDOCLI3kdn06flO,0.475
1,0.117,https://api.spotify.com/v1/audio-analysis/1uu8...,0.529,140571,0.831,1uu8eSAIRT7vaa66q46Ijr,0.0,6,0.185,-3.067,0,0.153,139.839,4,https://api.spotify.com/v1/tracks/1uu8eSAIRT7v...,audio_features,spotify:track:1uu8eSAIRT7vaa66q46Ijr,0.366
2,0.075,https://api.spotify.com/v1/audio-analysis/3INJ...,0.641,103252,0.694,3INJvYa4XyBAm62TPNwSa8,0.0,11,0.296,-4.349,1,0.239,170.156,4,https://api.spotify.com/v1/tracks/3INJvYa4XyBA...,audio_features,spotify:track:3INJvYa4XyBAm62TPNwSa8,0.37
3,0.0717,https://api.spotify.com/v1/audio-analysis/6oS0...,0.579,139373,0.848,6oS0nPho2XvKJfIMwV0ih0,2.8e-05,5,0.302,-3.838,0,0.0609,155.971,4,https://api.spotify.com/v1/tracks/6oS0nPho2XvK...,audio_features,spotify:track:6oS0nPho2XvKJfIMwV0ih0,0.489
4,0.00215,https://api.spotify.com/v1/audio-analysis/69qT...,0.565,145653,0.883,69qTejtAq5P3Jd17571YEv,0.000399,1,0.286,-3.43,1,0.0364,91.973,4,https://api.spotify.com/v1/tracks/69qTejtAq5P3...,audio_features,spotify:track:69qTejtAq5P3Jd17571YEv,0.443


In [37]:
df_audio_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 894 entries, 0 to 893
Data columns (total 18 columns):
acousticness        894 non-null float64
analysis_url        894 non-null object
danceability        894 non-null float64
duration_ms         894 non-null int64
energy              894 non-null float64
id                  894 non-null object
instrumentalness    894 non-null float64
key                 894 non-null int64
liveness            894 non-null float64
loudness            894 non-null float64
mode                894 non-null int64
speechiness         894 non-null float64
tempo               894 non-null float64
time_signature      894 non-null int64
track_href          894 non-null object
type                894 non-null object
uri                 894 non-null object
valence             894 non-null float64
dtypes: float64(9), int64(4), object(5)
memory usage: 125.8+ KB


<h3> Removing the unnecessary columns from the Audio Features list.

In [38]:
columns_to_drop = ['analysis_url','track_href','type','uri']
df_audio_features.drop(columns_to_drop, axis=1,inplace=True)

df_audio_features.rename(columns={'id': 'track_id'}, inplace=True)

df_audio_features.shape

(894, 14)

In [39]:
df_audio_features.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,track_id,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,0.22,0.688,171904,0.657,6WEKJTNRDOCLI3kdn06flO,4e-05,0,0.326,-4.5,1,0.0323,115.966,4,0.475
1,0.117,0.529,140571,0.831,1uu8eSAIRT7vaa66q46Ijr,0.0,6,0.185,-3.067,0,0.153,139.839,4,0.366
2,0.075,0.641,103252,0.694,3INJvYa4XyBAm62TPNwSa8,0.0,11,0.296,-4.349,1,0.239,170.156,4,0.37
3,0.0717,0.579,139373,0.848,6oS0nPho2XvKJfIMwV0ih0,2.8e-05,5,0.302,-3.838,0,0.0609,155.971,4,0.489
4,0.00215,0.565,145653,0.883,69qTejtAq5P3Jd17571YEv,0.000399,1,0.286,-3.43,1,0.0364,91.973,4,0.443


<h3> Merging the Track IDs list with Audio Features list in a single dataframe.

In [40]:
Data_Frame = pd.merge(df,df_audio_features,on='track_id',how='inner')
print("Shape of the dataset:", df_audio_features.shape)
Data_Frame.head()

('Shape of the dataset:', (894, 14))


Unnamed: 0,artist_name,track_id,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,"Levianth, Harley Bird",6WEKJTNRDOCLI3kdn06flO,Trapped,0.22,0.688,171904,0.657,4e-05,0,0.326,-4.5,1,0.0323,115.966,4,0.475
1,"Emdi, Veronica Bravo",1uu8eSAIRT7vaa66q46Ijr,Obsessed,0.117,0.529,140571,0.831,0.0,6,0.185,-3.067,0,0.153,139.839,4,0.366
2,"WHIPPED CREAM, Lil Xan",3INJvYa4XyBAm62TPNwSa8,Told Ya,0.075,0.641,103252,0.694,0.0,11,0.296,-4.349,1,0.239,170.156,4,0.37
3,Elko,6oS0nPho2XvKJfIMwV0ih0,Drunk and Out Of Town,0.0717,0.579,139373,0.848,2.8e-05,5,0.302,-3.838,0,0.0609,155.971,4,0.489
4,Elko,69qTejtAq5P3Jd17571YEv,YOU’RE FEELING THE SAME THINGS,0.00215,0.565,145653,0.883,0.000399,1,0.286,-3.43,1,0.0364,91.973,4,0.443


In [41]:
Data_Frame.info() # Just to confirm that there are no null values. 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 896 entries, 0 to 895
Data columns (total 16 columns):
artist_name         896 non-null object
track_id            896 non-null object
track_name          896 non-null object
acousticness        896 non-null float64
danceability        896 non-null float64
duration_ms         896 non-null int64
energy              896 non-null float64
instrumentalness    896 non-null float64
key                 896 non-null int64
liveness            896 non-null float64
loudness            896 non-null float64
mode                896 non-null int64
speechiness         896 non-null float64
tempo               896 non-null float64
time_signature      896 non-null int64
valence             896 non-null float64
dtypes: float64(9), int64(4), object(3)
memory usage: 119.0+ KB


In [42]:
Data_Frame.to_csv('Output_File_Name.csv')

# The output file will be a CSV file with the Audio Features merged with Track IDs list.