# Capstone Project - Data Wrangling: Spotify Data of 130k Songs
***
This notebook will work through the initial exploration of the Spotify data set.

### Imports and Data Read

In [176]:
# Imports

# imports for generic processing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# imports for Spotify connection
import os
import sys
import json
import spotipy
import webbrowser
import spotipy.util as util


In [177]:
# reading the input Spotify data from tsv

df = pd.read_csv('C:\\Users\\Harsha Neel\\Documents\\GitHub\\Springboard\\Capstone Data\\songDB.tsv', sep = '\t', engine='python')
print("\n", df.head())
print("\n", df.shape)
print("\n", df.columns)


            Name  Danceability  Energy   Key  Loudness  Mode  Speechness  \
0    YuveYuveYu         0.624   0.857  10.0    -6.250   0.0      0.0542   
1   Gloryhammer         0.517   0.916   0.0    -4.933   1.0      0.0559   
2        Nostos         0.251   0.894   8.0    -4.103   0.0      0.0570   
3     Yggdrasil         0.469   0.743   1.0    -5.570   0.0      0.0272   
4  Incense&Iron         0.487   0.952   1.0    -4.429   0.0      0.0613   

   Acousticness  Instrumentalness  Liveness  Valence    Tempo            Type  \
0      0.020800          0.206000     0.110    0.324  131.926  audio_features   
1      0.000182          0.001910     0.306    0.444  135.996  audio_features   
2      0.014400          0.000000     0.123    0.297  114.223  audio_features   
3      0.002220          0.000111     0.276    0.481   86.953  audio_features   
4      0.000228          0.000000     0.161    0.329  125.993  audio_features   

                       ID                                   

In [178]:
# Cleaning up data - removing unnecessary columns, index by track ID etc.

# index by track ID
df_cleaned = df.set_index('ID')

df_cleaned = df_cleaned.drop(['Type', 'Uri', 'Ref_Track', 'URL_features'], axis=1)
print("\n", df_cleaned.head())
print("\n", df_cleaned.shape)
print("\n", df_cleaned.columns)



                                 Name  Danceability  Energy   Key  Loudness  \
ID                                                                           
6J2VvzKwWc2f0JP5RQVZjq    YuveYuveYu         0.624   0.857  10.0    -6.250   
4HA34COgxgVJ6zK88UN4Ik   Gloryhammer         0.517   0.916   0.0    -4.933   
3W6Xik6Xxf06JuUoZSATlD        Nostos         0.251   0.894   8.0    -4.103   
2gGveBaLJQMtJ43X4UL5kH     Yggdrasil         0.469   0.743   1.0    -5.570   
1lRF81A1C9QoCgBcEop2zg  Incense&Iron         0.487   0.952   1.0    -4.429   

                        Mode  Speechness  Acousticness  Instrumentalness  \
ID                                                                         
6J2VvzKwWc2f0JP5RQVZjq   0.0      0.0542      0.020800          0.206000   
4HA34COgxgVJ6zK88UN4Ik   1.0      0.0559      0.000182          0.001910   
3W6Xik6Xxf06JuUoZSATlD   0.0      0.0570      0.014400          0.000000   
2gGveBaLJQMtJ43X4UL5kH   0.0      0.0272      0.002220          0.00011

### Spotify Connection and Data Supplementation
***
<b>Now that the data is been read, we want to connect to Spotify and use the code to download the "genre" information using the track IDs from the data.
***
To connect with Spotify we need to use the Spotipy library. 
Detailed instructions on connecting with Spotify can be found in the library's [website](https://spotipy.readthedocs.io/en/latest/?highlight=playlist#spotipy.client.Spotify.category_playlists)</b>

<h4>Summary on connecting with Spotify:</h4>
<ol>
<li> Placeholder1
<li> Placeholder2
</ol>

In [165]:
# Get the username from terminal
username = os.getlogin()
scope = 'user-library-read user-read-private'

# Erase cache and prompt for user permission
try : 
    token = util.prompt_for_user_token(username, scope)
except: 
    os.remove(f".cache-{username}")
    token = util.prompt_for_user_token(username, scope)
    
# Create Spotify object with permissions
spotifyObj = spotipy.Spotify(auth=token)

user = spotifyObj.current_user()

displayName = user['display_name']
followers = user['followers']['total']

print(displayName)

Harsha Neel


In [179]:
### 1st proof of concept before running through the entire data set - small number - first 4 items

# Getting the track IDs of a small set
trackIDs = df_cleaned.index[0:4]

# copy of cleaned df, with a small subset of data
df_copy = df_cleaned.iloc[0:4, :]

print("\n", trackIDs)
print("\n", type(trackIDs))
print("\n", trackIDs.shape)

# initializing lists to add as new columns later
artists = []
popularities = []

for trackID in trackIDs:
    
    track = spotifyObj.track(trackID)
    popularity = track['popularity']
    
    # The artist object comes back as a list of "artists", even for a single song - possibly due to collabs and feat. etc. between artists.
    # Looking through many examples, I found that only 1 artist would exist in the list, thereby leading to the assumption below.
    # Assumption - the main artist of the song will always be the first in the list. I think that's fair.
    artist = track['album']['artists'][0]['name']
    
    artists.append(artist)
    popularities.append(popularity)

    print(artist, ": ", popularity)

# Assign the new columns to the final form of the df.
df_final = df_copy.assign(Artists = artists, Popularity = popularities)


print("\n", df_final)


 Index(['6J2VvzKwWc2f0JP5RQVZjq', '4HA34COgxgVJ6zK88UN4Ik',
       '3W6Xik6Xxf06JuUoZSATlD', '2gGveBaLJQMtJ43X4UL5kH'],
      dtype='object', name='ID')

 <class 'pandas.core.indexes.base.Index'>

 (4,)


SpotifyException: http status: 401, code:-1 - https://api.spotify.com/v1/tracks/6J2VvzKwWc2f0JP5RQVZjq:
 The access token expired

In [183]:
### 2nd proof of concept before running through the entire data set - bigger number - first 1000 items

import time

def spotifyConnect():
    # Get the username from terminal
    username = os.getlogin()
    scope = 'user-library-read user-read-private'

    # Erase cache and prompt for user permission
    try : 
        token = util.prompt_for_user_token(username, scope)
    except: 
        os.remove(f".cache-{username}")
        token = util.prompt_for_user_token(username, scope)
    
    # Create Spotify object with permissions
    spotifyObj = spotipy.Spotify(auth=token)

    user = spotifyObj.current_user()

    displayName = user['display_name']
    followers = user['followers']['total']

    print("\n", displayName, " connected to Spotify")
    return spotifyObj


# Getting the track IDs of a small set
trackIDs = df_cleaned.index[0:1000]

# copy of cleaned df, with a small subset of data
df_copy = df_cleaned.iloc[0:1000, :]

counter = 0
limiter = 200

start = time.time()

print("\n", "no of track IDs: ", trackIDs.shape)

# initializing lists to add as new columns later
artists = []
popularities = []

spotifyObj = spotifyConnect()

for trackID in trackIDs:
    
    track = spotifyObj.track(trackID)
    popularity = track['popularity']
    
    # The artist object comes back as a list of "artists", even for a single song - possibly due to collabs and feat. etc. between artists.
    # Looking through many examples, I found that only 1 artist would exist in the list, thereby leading to the assumption below.
    # Assumption - the main artist of the song will always be the first in the list. I think that's fair.
    artist = track['album']['artists'][0]['name']
    
    artists.append(artist)
    popularities.append(popularity)
    counter += 1
    
    if counter == limiter:
        counter = 0
        spotifyConnect()
        time.sleep(3)
      
     


    #print(artist, ": ", popularity)

# Assign the new columns to the final form of the df.
df_final = df_copy.assign(Artists = artists, Popularity = popularities)

end = time.time()

print("\n", df_final.head())

time_taken = end - start

print("time:", time_taken)

df_final.to_csv("1000ItemsFile.csv", sep=",")


 no of track IDs:  (1000,)
Harsha Neel

Connected to Spotify
Harsha Neel

Connected to Spotify
Harsha Neel

Connected to Spotify
Harsha Neel

Connected to Spotify
Harsha Neel

Connected to Spotify
Harsha Neel

Connected to Spotify

                                 Name  Danceability  Energy   Key  Loudness  \
ID                                                                           
6J2VvzKwWc2f0JP5RQVZjq    YuveYuveYu         0.624   0.857  10.0    -6.250   
4HA34COgxgVJ6zK88UN4Ik   Gloryhammer         0.517   0.916   0.0    -4.933   
3W6Xik6Xxf06JuUoZSATlD        Nostos         0.251   0.894   8.0    -4.103   
2gGveBaLJQMtJ43X4UL5kH     Yggdrasil         0.469   0.743   1.0    -5.570   
1lRF81A1C9QoCgBcEop2zg  Incense&Iron         0.487   0.952   1.0    -4.429   

                        Mode  Speechness  Acousticness  Instrumentalness  \
ID                                                                         
6J2VvzKwWc2f0JP5RQVZjq   0.0      0.0542      0.020800          0.2

In [169]:
### Final run - the entire data set - 131580 items

import time

# Spotify connection function. This was required to be brought into this cell 
# for repeated execution to retrieve the entire dataset without the authentication token expiring.
def spotifyConnect():
    # Get the username from terminal
    username = os.getlogin()
    scope = 'user-library-read user-read-private'

    # Erase cache and prompt for user permission
    try : 
        token = util.prompt_for_user_token(username, scope)
    except: 
        os.remove(f".cache-{username}")
        token = util.prompt_for_user_token(username, scope)
    
    # Create Spotify object with permissions
    spotifyObj = spotipy.Spotify(auth=token)

    user = spotifyObj.current_user()

    displayName = user['display_name']
    followers = user['followers']['total']

    print("\n", displayName, " connected to Spotify")
    return spotifyObj


# Getting the track IDs of a small set
trackIDs = df_cleaned.index

# counters for the spotify timing reconnect
counter = 0
limiter = 1000
connectcount = 1

start = time.time()

print("\n", "no of track IDs: ", trackIDs.shape)

# initializing lists to add as new columns later
artists = []
popularities = []

spotifyObj = spotifyConnect()

for trackID in trackIDs:
    
    track = spotifyObj.track(trackID)
    popularity = track['popularity']
    
    # The artist object comes back as a list of "artists", even for a single song - possibly due to collabs and feat. etc. between artists.
    # Looking through many examples, I found that only 1 artist would exist in the list, thereby leading to the assumption below.
    # Assumption - the main artist of the song will always be the first in the list. I think that's fair.
    artist = track['album']['artists'][0]['name']
    
    artists.append(artist)
    popularities.append(popularity)
    counter += 1
    
    # when counter reaches the limiter, the spotify connection will be re-established
    if counter == limiter:
        counter = 0
        connectcount += 1
        spotifyConnect()
        time.sleep(3)
    

# Assign the new columns to the final form of the df.
df_final = df_cleaned.assign(Artists = artists, Popularity = popularities)

print("\n", "Spotify connected total ", connectcount, " times")

end = time.time()

print("\n", df_final.head())

time_taken = end - start

print("time:", time_taken)

df_final.to_csv("FinalData.csv", sep=",")


 no of track IDs:  (131580,)


SpotifyException: http status: 401, code:-1 - https://api.spotify.com/v1/tracks/2Ca83UoPBkctVkpDo8YLqu:
 The access token expired