# 4.3 Capstone Project - Data Wrangling: Spotify Data of 130k Songs
***
This notebook will work through the initial exploration of the Spotify data set.

### Imports and Data Read

In [2]:
# Imports

# imports for generic processing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# imports for Spotify connection
import os
import sys
import json
import spotipy
import webbrowser
import spotipy.util as util


In [3]:
# Read the input Spotify data from the tab-separated songDB file.
#
# The location defaults to the path used when this notebook was first written,
# but it can be overridden through the SPOTIFY_SONGDB_PATH environment variable
# so the cell also runs on machines where that absolute path does not exist.
DATA_PATH = os.environ.get(
    'SPOTIFY_SONGDB_PATH',
    'C:\\Users\\Harsha Neel\\Documents\\GitHub\\Springboard\\Capstone Data\\songDB.tsv',
)

df = pd.read_csv(DATA_PATH, sep = '\t', engine='python')

# Quick look at the first rows, the overall size and the available columns.
print("\n", df.head())
print("\n", df.shape)
print("\n", df.columns)


            Name  Danceability  Energy   Key  Loudness  Mode  Speechness  \
0    YuveYuveYu         0.624   0.857  10.0    -6.250   0.0      0.0542   
1   Gloryhammer         0.517   0.916   0.0    -4.933   1.0      0.0559   
2        Nostos         0.251   0.894   8.0    -4.103   0.0      0.0570   
3     Yggdrasil         0.469   0.743   1.0    -5.570   0.0      0.0272   
4  Incense&Iron         0.487   0.952   1.0    -4.429   0.0      0.0613   

   Acousticness  Instrumentalness  Liveness  Valence    Tempo            Type  \
0      0.020800          0.206000     0.110    0.324  131.926  audio_features   
1      0.000182          0.001910     0.306    0.444  135.996  audio_features   
2      0.014400          0.000000     0.123    0.297  114.223  audio_features   
3      0.002220          0.000111     0.276    0.481   86.953  audio_features   
4      0.000228          0.000000     0.161    0.329  125.993  audio_features   

                       ID                                   

In [67]:
# Tidy the raw frame: use the track ID as the index and discard the columns
# that are not needed for the rest of the notebook.
df_cleaned = (
    df
    .set_index('ID')
    .drop(columns=['Type', 'Uri', 'Ref_Track', 'URL_features'])
)

# Same three summaries as after the initial read: head, shape, column names.
for summary in (df_cleaned.head(), df_cleaned.shape, df_cleaned.columns):
    print("\n", summary)

# Report how many nulls each affected column contains.
nulls = df_cleaned.columns[df_cleaned.isnull().any()]
print("\nColumns with Nulls are: \n", df_cleaned[nulls].isnull().sum())


                                 Name  Danceability  Energy   Key  Loudness  \
ID                                                                           
6J2VvzKwWc2f0JP5RQVZjq    YuveYuveYu         0.624   0.857  10.0    -6.250   
4HA34COgxgVJ6zK88UN4Ik   Gloryhammer         0.517   0.916   0.0    -4.933   
3W6Xik6Xxf06JuUoZSATlD        Nostos         0.251   0.894   8.0    -4.103   
2gGveBaLJQMtJ43X4UL5kH     Yggdrasil         0.469   0.743   1.0    -5.570   
1lRF81A1C9QoCgBcEop2zg  Incense&Iron         0.487   0.952   1.0    -4.429   

                        Mode  Speechness  Acousticness  Instrumentalness  \
ID                                                                         
6J2VvzKwWc2f0JP5RQVZjq   0.0      0.0542      0.020800          0.206000   
4HA34COgxgVJ6zK88UN4Ik   1.0      0.0559      0.000182          0.001910   
3W6Xik6Xxf06JuUoZSATlD   0.0      0.0570      0.014400          0.000000   
2gGveBaLJQMtJ43X4UL5kH   0.0      0.0272      0.002220          0.00011

In [68]:
### Some columns contain null entries, so we look at the affected rows directly.
### Two rows have no song name at all (NaN in "Name"); rows with a null "Genre" are listed as well.
### Both groups will need to be cleaned up.

for column in ('Name', 'Genre'):
    print(df_cleaned[df_cleaned[column].isnull()])

                       Name  Danceability  Energy  Key  Loudness  Mode  \
ID                                                                       
1PAhQVE5h3LKkEpdaTdpjq  NaN         0.714   0.839  1.0    -7.668   0.0   
1PAhQVE5h3LKkEpdaTdpjq  NaN         0.714   0.839  1.0    -7.668   0.0   

                        Speechness  Acousticness  Instrumentalness  Liveness  \
ID                                                                             
1PAhQVE5h3LKkEpdaTdpjq      0.0401       0.00237          0.000007    0.0795   
1PAhQVE5h3LKkEpdaTdpjq      0.0401       0.00237          0.000007    0.0795   

                        Valence   Tempo  Duration_ms time_signature  \
ID                                                                    
1PAhQVE5h3LKkEpdaTdpjq    0.614  98.003     237551.0            4.0   
1PAhQVE5h3LKkEpdaTdpjq    0.614  98.003     237551.0            4.0   

                             Genre  
ID                                  
1PAhQVE5h3LKkEpdaTdpjq 

In [69]:
### Rows with a null "Name" have no song title at all, so those 2 records can be dropped safely.

### Rows with a null "Genre" are probably null because the track IDs were not formatted
### correctly when the data set was built. Genre information is not re-downloaded from
### Spotify in this notebook, so those rows cannot be repaired and are dropped as well.

# dropna() with no arguments removes every row that has a null in any column.
df_cleaned = df_cleaned.dropna()

print(df_cleaned.shape)
print(df_cleaned.isnull().any())

# Preview a few of the rows that repeat an earlier row (compared on column values).
print(df_cleaned.loc[df_cleaned.duplicated()].head())

# Keep only the first occurrence of each repeated row.
df_cleaned = df_cleaned.drop_duplicates()

print(df_cleaned.shape)

(131552, 15)
Name                False
Danceability        False
Energy              False
Key                 False
Loudness            False
Mode                False
Speechness          False
Acousticness        False
Instrumentalness    False
Liveness            False
Valence             False
Tempo               False
Duration_ms         False
time_signature      False
Genre               False
dtype: bool
                                            Name  Danceability   Energy   Key  \
ID                                                                              
4qgDlkfqsbZ0q5tOSp0nqE             BloodofHeroes        0.5480  0.91800   4.0   
3V417nSM4Ilh0Tt5CqustV                 Yggdrasil        0.4690  0.74300   1.0   
1KuNk1ZZPTZKwjjo1Vdw9G         FireBloodandSteel        0.5410  0.91200   0.0   
1Qf8YGAWyajekT32SDCpbB               IevanPolkka        0.5340  0.97200   0.0   
02h7YFPYqyKtSx2eAwHmHZ               Waldschrein        0.3350  0.99200   7.0   
5u3l2TONYacJgmRPQV

### Spotify Connection and Data Supplementation
***
<b>Now that the data has been read, we want to connect to Spotify and download additional information for each song (the artist and the popularity) using the track IDs from the data.
***
To connect with Spotify we need to use the Spotipy library. 
Detailed instructions on connecting with Spotify can be found in the library's [website](https://spotipy.readthedocs.io/en/latest/?highlight=playlist#spotipy.client.Spotify.category_playlists)
Documentation for the Spotify Web API can be found [here](https://developer.spotify.com/documentation/web-api/quick-start/)</b>

<h4>Summary on connecting with Spotify:</h4>
<ol>
<li> Create an account and register your test application. 
<li> Set up your Client ID and Secret, and store them as environment variables on the local machine from which you will run your application. There are other ways to dynamically supply your Client ID and Secret without setting them as environment variables. Read the documentation of the Spotify Web API and the spotipy client to explore other options.
<li> Import and call the desired APIs to download the specific data that you need, as I have done in the examples (and final run) below.
</ol>

In [70]:
### 1st method of connectivity. With this method the credentials are passed and the token is
### received implicitly, through spotipy's util helper.
### With this method, refreshing the token ourselves was not possible.

# Get the username from terminal
username = os.getlogin()
scope = 'user-library-read user-read-private'

# Token cache file that is cleared before retrying a failed prompt
cache_path = f".cache-{username}"

# Prompt for user permission; if that fails, erase the cache and prompt once more
try:
    token = util.prompt_for_user_token(username, scope)
except Exception:
    # Catch Exception rather than using a bare except so that KeyboardInterrupt and
    # SystemExit still stop the cell. The cache file may not exist yet, so only
    # remove it when present; otherwise the FileNotFoundError would hide the real error.
    if os.path.exists(cache_path):
        os.remove(cache_path)
    token = util.prompt_for_user_token(username, scope)

# Create Spotify object with permissions
spotifyObj = spotipy.Spotify(auth=token)

# Fetch the profile of the authenticated user as a simple connectivity check
user = spotifyObj.current_user()

displayName = user['display_name']
followers = user['followers']['total']

print(displayName)

Harsha Neel


In [71]:
### 1st proof of concept before running through the entire data set - small number - first 4 items

# Track IDs (the index) of the first 4 rows
trackIDs = df_cleaned.index[:4]

# Matching 4-row slice of the cleaned frame
df_copy = df_cleaned.iloc[:4]

for detail in (trackIDs, type(trackIDs), trackIDs.shape):
    print("\n", detail)

# Values collected per track; they become new columns below
artists = []
popularities = []

for trackID in trackIDs:

    # One Spotify lookup per track ID
    track = spotifyObj.track(trackID)

    # The artists come back as a list, even for a single song - possibly due to collabs, "feat." etc.
    # In the examples inspected only 1 artist was present, hence the assumption that the
    # main artist is always the first entry of the list.
    # NOTE(review): the name is read from the album's artist list (track['album']['artists']) -
    # confirm that this is the intended source for the song's artist.
    artist = track['album']['artists'][0]['name']
    popularity = track['popularity']

    artists.append(artist)
    popularities.append(popularity)

    print(artist, ": ", popularity)

# Attach the collected values as new columns to form the final frame.
df_final = df_copy.assign(Artists = artists, Popularity = popularities)


print("\n", df_final)


 Index(['6J2VvzKwWc2f0JP5RQVZjq', '4HA34COgxgVJ6zK88UN4Ik',
       '3W6Xik6Xxf06JuUoZSATlD', '2gGveBaLJQMtJ43X4UL5kH'],
      dtype='object', name='ID')

 <class 'pandas.core.indexes.base.Index'>

 (4,)
The HU :  55
Gloryhammer :  21
Orion's Reign :  2
Brothers of Metal :  50

                                Name  Danceability  Energy   Key  Loudness  \
ID                                                                          
6J2VvzKwWc2f0JP5RQVZjq   YuveYuveYu         0.624   0.857  10.0    -6.250   
4HA34COgxgVJ6zK88UN4Ik  Gloryhammer         0.517   0.916   0.0    -4.933   
3W6Xik6Xxf06JuUoZSATlD       Nostos         0.251   0.894   8.0    -4.103   
2gGveBaLJQMtJ43X4UL5kH    Yggdrasil         0.469   0.743   1.0    -5.570   

                        Mode  Speechness  Acousticness  Instrumentalness  \
ID                                                                         
6J2VvzKwWc2f0JP5RQVZjq   0.0      0.0542      0.020800          0.206000   
4HA34COgxgVJ6zK88UN4Ik   1.0 

<b> NOTE: This is the 2nd method of connectivity - where I explicitly supply the client credentials to the call and receive the token which I can actually refresh.
***
The reason I'm using this method as the final one to download data is the possibility of refreshing the token. The token expires within a very short duration, which makes getting the data a nuisance. So, to automate the token refresh process and continue downloading the data I need, I use this approach.</b>

In [72]:
### 2nd proof of concept before running through the entire data set - bigger number - first 2000 items


import time
import os
from spotipy.oauth2 import SpotifyOAuth
from spotipy.oauth2 import SpotifyClientCredentials

# Names of the environment variables that hold the Spotify application credentials
client_id_str = 'SPOTIPY_CLIENT_ID'
client_secret_str = 'SPOTIPY_CLIENT_SECRET'
client_uri_str = 'SPOTIPY_REDIRECT_URI'

# Values of those environment variables (None when a variable is not set)
client_id = os.getenv(client_id_str)
client_sec = os.getenv(client_secret_str)
client_uri = os.getenv(client_uri_str)

# Get the username from terminal
username = os.getlogin()
scopes = 'user-library-read user-read-private'

# Create the OAuth helper with explicit client credentials and reuse a cached token if there is one
sp_oauth = SpotifyOAuth(client_id=client_id,client_secret=client_sec,redirect_uri=client_uri,scope=scopes)
token_info = sp_oauth.get_cached_token() 
if not token_info:
    # No cached token: run the interactive authorization flow
    auth_url = sp_oauth.get_authorize_url()
    print(auth_url)
    response = input('Paste the above link into your browser, then paste the redirect url here: ')

    code = sp_oauth.parse_response_code(response)
    token_info = sp_oauth.get_access_token(code)

    # NOTE(review): this writes a live access token into the cell output; clear the
    # output before sharing the notebook.
    print("\nInitial token", token_info['access_token'])

# Read the access token on BOTH paths. Previously `token` was only assigned inside the
# branch above, so with a cached token the client below was built from an undefined
# (NameError) or stale `token` left over from an earlier cell.
token = token_info['access_token']

sp = spotipy.Spotify(auth=token)

def refresh():
    """Obtain a new access token and rebuild the Spotify client.

    Uses the refresh token stored in the module-level ``token_info`` to request
    new token information from ``sp_oauth``, then replaces the module-level
    ``token_info`` and ``sp`` with the refreshed values. Prints the new access
    token. Takes no arguments and returns nothing.
    """
    global token_info, sp

    print("\nRefreshing token..")
    token_info = sp_oauth.refresh_access_token(token_info['refresh_token'])
    print("\nRefreshed token", token_info['access_token'])
    sp = spotipy.Spotify(auth=token_info['access_token'])


# Track IDs (the index) of the first 2000 rows
trackIDs = df_cleaned.index[:2000]

# Matching 2000-row slice of the cleaned frame
df_copy = df_cleaned.iloc[:2000]

# `counter` counts downloads since the last token refresh;
# a refresh is triggered every `limiter` downloads.
counter = 0
limiter = 500

start = time.time()

print("\n", "no of track IDs: ", len(trackIDs))

# Values collected per track; they become new columns below
artists = []
popularities = []

for trackID in trackIDs:

    # `sp` is looked up on every iteration, so the client rebuilt by refresh() is picked up.
    track = sp.track(trackID)

    # The artists come back as a list, even for a single song - possibly due to collabs, "feat." etc.
    # In the examples inspected only 1 artist was present, hence the assumption that the
    # main artist is always the first entry of the list.
    artists.append(track['album']['artists'][0]['name'])
    popularities.append(track['popularity'])

    counter += 1
    if counter != limiter:
        continue

    # `limiter` tracks downloaded since the last refresh: start counting again and refresh.
    counter = 0
    refresh()

# Attach the collected values as new columns to form the final frame.
df_final = df_copy.assign(Artists = artists, Popularity = popularities)

end = time.time()

print("\n", df_final.head())

time_taken = end - start

print("time:", time_taken)

# Persist the 2000-row result
df_final.to_csv("2000ItemsFile.csv")

https://accounts.spotify.com/authorize?client_id=da6d31d6cef1486ebc7707d2745e0535&response_type=code&redirect_uri=http%3A%2F%2Fgoogle.com%2F&scope=user-library-read+user-read-private
Paste the above link into your browser, then paste the redirect url here: https://www.google.com/?code=AQAba3_G99PJzrufVDGxLtkTXpHkretQ5chGXi2caQETCxzKi9bzQHXmPMuuIa16cSTztQYM_r9kvi0gl1RmJn_GsIZPM9Q2PRQPLeSCjr_dvovOVeJnBXKLx2LO9cSNG-KKpjqXVin7aIDF4KtR498tVC-o7bZ2bGF1cPrkLY5NfrzMTGUNhKhxkg9375G0gZMTyGrTSgapoytn2hs42VeFEPST9i0ozkUyEzG8

Initial token BQDYgLefgY8HNTgBbwgNweloMk-408x_7T99aKyq2CZWrtGwMSnf5i_1wSjhxS8fSpJq6palAO071aLyS0BRxpRbbPf349vQUZhEr-JQeaUDH60vJcfHw7_V4eOMkpVojMl3AtcEeKLz7cdgPN_0aN9fDd10KFrvEUs

 no of track IDs:  2000

Refreshing token..

Refreshed token BQBLzvnv2pi4t52424ibxTg7ihtOo8OmsgSwbMpGo0PzmIOEqXNbRQQlUQLk6a_5Dp0Do7UQIh1fR6lZUcb5Bm9hrAs7OZj5CkXVzrIzZlt_6fUGTsUisCiC8pDljLm5wKSDjLoZxdcw0hfGAYEy7de1EPVNCd1UEnMZofye1w

Refreshing token..

Refreshed token BQC8e2gPH93mGzd4oME3MuGCHeb1kZwJ

<b>This is the final and proper run - for all the data. Not a test. Using the 2nd connect method. I will let it run to completion. And with the full run, it will merge the data at the end. 
There is an intermediate csv write process, further explained below. 
***
Note: There are several better ways to handle this than performing an expensive I/O operation to write all the intermediate data to disk. I had kept this step during the test runs and did not end up removing it for the final run.</b>

In [73]:
### Final run - the entire data set - 121815 items

import time
from spotipy.oauth2 import SpotifyOAuth
from spotipy.oauth2 import SpotifyClientCredentials


# env variables - previously set - which are required to establish an OAuth connection

# env variable keys
client_id_str = 'SPOTIPY_CLIENT_ID'
client_secret_str = 'SPOTIPY_CLIENT_SECRET'
client_uri_str = 'SPOTIPY_REDIRECT_URI'

# env variable values (None when a variable is not set)
client_id = os.getenv(client_id_str)
client_sec = os.getenv(client_secret_str)
client_uri = os.getenv(client_uri_str)

# Get the username from terminal
username = os.getlogin()
scopes = 'user-library-read user-read-private'

# Spotify connection logic. This was required to be brought into this cell 
# for repeated execution to retrieve the entire dataset without the authentication token expiring.

# creates an OAuth connection using the client credentials and reuses a cached token if there is one
sp_oauth = SpotifyOAuth(client_id=client_id,client_secret=client_sec,redirect_uri=client_uri,scope=scopes)
token_info = sp_oauth.get_cached_token() 
if not token_info:
    # No cached token: run the interactive authorization flow
    auth_url = sp_oauth.get_authorize_url()
    print(auth_url)
    response = input('Paste the above link into your browser, then paste the redirect url here: ')

    code = sp_oauth.parse_response_code(response)
    token_info = sp_oauth.get_access_token(code)

# Read the access token on BOTH paths. Previously `token` was only assigned inside the
# branch above, so with a cached token the client below was built from an undefined
# (NameError) or stale `token` left over from an earlier cell.
token = token_info['access_token']

sp = spotipy.Spotify(auth=token)

def refresh():
    """Obtain a new access token and rebuild the Spotify client.

    Uses the refresh token stored in the module-level ``token_info`` to request
    new token information from ``sp_oauth``, then replaces the module-level
    ``token_info`` and ``sp`` with the refreshed values. Takes no arguments and
    returns nothing.
    """
    global token_info, sp

    print("\nRefreshing token..")
    token_info = sp_oauth.refresh_access_token(token_info['refresh_token'])
    sp = spotipy.Spotify(auth=token_info['access_token'])


# Track IDs (the index) of the entire cleaned data set
trackIDs = df_cleaned.index

# Number of downloads since the last token refresh
counter = 0

# A token refresh is made every time `counter` reaches `limiter`,
# i.e. 800 records are downloaded between refreshes.
limiter = 800

# Keeps track of how many times a token has been obtained. It starts at 1 so that
# the initial token is included (even though that is not technically a "refresh").
refreshCounter = 1

start = time.time()

print("\n", "no of track IDs: ", len(trackIDs))

# Values collected per track; artists and popularities become new columns below
artists = []
popularities = []
ids = []

for trackID in trackIDs:

    # `sp` is looked up on every iteration, so the client rebuilt by refresh() is picked up.
    track = sp.track(trackID)

    # The artists come back as a list, even for a single song - possibly due to collabs, "feat." etc.
    # In the examples inspected only 1 artist was present, hence the assumption that the
    # main artist is always the first entry of the list.
    ids.append(trackID)
    artists.append(track['album']['artists'][0]['name'])
    popularities.append(track['popularity'])

    counter += 1
    if counter != limiter:
        continue

    # Checkpoint: write everything collected so far to a numbered CSV, so that after
    # any other failure it is possible to see at which record the download stopped.
    df_temp = pd.DataFrame({'ID':ids,'Artist':artists,'Popularity':popularities})
    df_temp.to_csv(f"temp{refreshCounter}.csv")

    # The refresh is done periodically instead of checking the token's expiry, because the
    # expiry check shown in Spotipy's documentation could not be found in the library when
    # this was written. A smaller interval is the safer choice in such a case.
    counter = 0
    refreshCounter += 1
    refresh()

# Attach the collected values as new columns to form the final frame.
df_final = df_cleaned.assign(Artists = artists, Popularity = popularities)

print("\n", "Token refreshed a total of ", refreshCounter, " times")

end = time.time()

time_taken = end - start

print("time (seconds):", time_taken)

# Persist the complete result
df_final.to_csv("FinalData.csv")

https://accounts.spotify.com/authorize?client_id=da6d31d6cef1486ebc7707d2745e0535&response_type=code&redirect_uri=http%3A%2F%2Fgoogle.com%2F&scope=user-library-read+user-read-private
Paste the above link into your browser, then paste the redirect url here: https://www.google.com/?code=AQA1Aw1dFyjWVBdkjb4cfc4Tupir_6ptHywRlc8pZxTpa0evNmOSNj8uG3nvhwgwCuTAI7tDgfiB3xukZGB2x5GDsfB-6kAXUFVV1zqhu3FsM3I90LF9dgtC5mLDZAB9lVBv8yHyTQiiumCxTe6s_RTfE5IFe0KqxyntaeQuOoPEZVLTZAvvjOkGujL453_L66bOHRJIPxhGmdYBJANvil5r1KdV6jgtF5xNk_49

 no of track IDs:  121815

Refreshing token..

Refreshing token..

Refreshing token..

Refreshing token..

Refreshing token..

Refreshing token..

Refreshing token..

Refreshing token..

Refreshing token..

Refreshing token..

Refreshing token..

Refreshing token..

Refreshing token..

Refreshing token..

Refreshing token..

Refreshing token..

Refreshing token..

Refreshing token..

Refreshing token..

Refreshing token..

Refreshing token..

Refreshing token..

Refreshing to

In [78]:
# Post-download sanity check: column types / non-null counts and summary statistics
# of the in-memory result.
df_final.info()

print(df_final.describe())

# Read the saved file back to confirm that it loads.
df_main = pd.read_csv("FinalData.csv")

<class 'pandas.core.frame.DataFrame'>
Index: 121815 entries, 6J2VvzKwWc2f0JP5RQVZjq to 6XoNBGHoWlqMtSTob5heto
Data columns (total 17 columns):
Name                121815 non-null object
Danceability        121815 non-null float64
Energy              121815 non-null float64
Key                 121815 non-null float64
Loudness            121815 non-null float64
Mode                121815 non-null float64
Speechness          121815 non-null float64
Acousticness        121815 non-null float64
Instrumentalness    121815 non-null float64
Liveness            121815 non-null float64
Valence             121815 non-null float64
Tempo               121815 non-null object
Duration_ms         121815 non-null float64
time_signature      121815 non-null object
Genre               121815 non-null object
Artists             121815 non-null object
Popularity          121815 non-null int64
dtypes: float64(11), int64(1), object(5)
memory usage: 16.7+ MB
        Danceability         Energy            Key  

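<b>As we see, the data looks good: every column has 121815 non-null values. There are no warnings while trying to read the CSV file either. One thing to keep in mind for later: Tempo and time_signature are stored as object rather than numeric columns, so they may need a type conversion before analysis.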
***
With this, the wrangling process has been completed.</b>