# Connecting to Spotify API and Scraping from playlists.

---

### In this notebook, we will explore the Spotify API and extract songs from playlists; the extracted songs will be used to build a music database.

---

## Import Libraries

In [1]:
import pandas as pd
import re
import random
from random import randint
import numpy as np
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from time import sleep
from tqdm import tqdm


## Import Spotify API credentials. 

In [2]:
# Open the credentials file that lives one directory above the notebook.
secrets_file = open("../secrets.txt", mode="r")

In [3]:
# Read the whole credentials file, then release the file handle: nothing
# else reads from it, and leaving it open leaks the descriptor for the
# lifetime of the kernel.
string = secrets_file.read()
secrets_file.close()

In [4]:
# Parse the "name:value" lines of the credentials file into a dictionary.
# - partition(':') splits on the first colon only, so a value that itself
#   contains ':' is kept intact.
# - Lines without a colon (blank lines, stray text) are skipped instead of
#   raising IndexError.
# - Surrounding whitespace (including a trailing '\r') is stripped so it
#   cannot end up inside a credential.
secrets_dict = {}
for line in string.splitlines():
    name, separator, value = line.partition(':')
    if not separator:
        continue
    secrets_dict[name.strip()] = value.strip()

In [5]:
# Build the client-credentials auth manager from the parsed secrets, then
# create the Spotipy client used by every request in this notebook.
auth_manager = SpotifyClientCredentials(
    client_id=secrets_dict['cid'],
    client_secret=secrets_dict['csecret'],
)
sp = spotipy.Spotify(auth_manager=auth_manager)

## Extract song information with Spotipy, a Python wrapper for the Spotify API

In [6]:
def get_playlist_tracks(playlist_id, client=None, pause_range=(1, 3)):
    """Return every item of a playlist, following the API's pagination.

    Parameters
    ----------
    playlist_id : str
        Identifier of the playlist to download.
    client : optional
        Object exposing ``user_playlist_tracks(user, playlist_id)`` and
        ``next(page)``. Defaults to the notebook-level ``sp`` client.
    pause_range : tuple of (int, int), optional
        Inclusive bounds, in seconds, of the random pause taken before each
        follow-up page request. Defaults to ``(1, 3)``.

    Returns
    -------
    list
        The ``'items'`` entries of all pages, in page order.
    """
    api = sp if client is None else client
    page = api.user_playlist_tracks("spotify", playlist_id)
    # Copy the first page's items so the response object is never mutated.
    tracks = list(page['items'])
    while page['next'] is not None:
        # Throttle *between* requests; there is nothing to wait for once the
        # last page has been received.
        sleep(randint(*pause_range))
        page = api.next(page)
        # extend() appends in place instead of rebuilding the list per page.
        tracks.extend(page['items'])
    return tracks

In [7]:
# Download all items of the two source playlists, identified by their
# playlist IDs. The calls are kept as separate statements so that playlist1
# stays available even if the second download fails.
playlist1 = get_playlist_tracks("6tIxyT1Gq6O7DK7rIEUEZo")
playlist2 = get_playlist_tracks("4pbDDX7np7Q1H0ghL7U9o7")

In [8]:
# Split both playlists into chunks of at most 1000 items so that a failed
# request only affects a small part of the data.

# playlist1 -> spotlist0 .. spotlist9 (items 0 to 9999)
(spotlist0, spotlist1, spotlist2, spotlist3, spotlist4,
 spotlist5, spotlist6, spotlist7, spotlist8, spotlist9) = (
    playlist1[start:start + 1000] for start in range(0, 10000, 1000)
)

# playlist2 -> spotlist10 .. spotlist13 (the last chunk stops at index 3879)
spotlist10, spotlist11, spotlist12 = (
    playlist2[start:start + 1000] for start in range(0, 3000, 1000)
)
spotlist13 = playlist2[3000:3879]

## Extract the audio features and collect everything into a dataframe

In [9]:
def artist_song(x, batch_size=200, pause_range=(15, 25)):
    """Build a dataframe of artist, song name and audio features.

    Parameters
    ----------
    x : list
        Playlist items; each item must provide ``item['track']`` with the
        keys ``'name'``, ``'artists'`` and ``'uri'``.
    batch_size : int, optional
        Number of tracks processed between two pauses. Must be positive.
        Defaults to 200.
    pause_range : tuple of (int, int), optional
        Inclusive bounds, in seconds, of the random pause taken after each
        full batch. Defaults to ``(15, 25)``.

    Returns
    -------
    pandas.DataFrame
        One row per item of ``x`` with the columns ``artist`` (name of the
        first listed artist), ``song`` and ``features`` (the value returned
        by ``sp.audio_features`` for that track).
    """
    artist = []
    song = []
    features = []

    # Iterate over the items themselves. Previously ``x`` was indexed with
    # the same counter that was reset to 0 for throttling, so any input
    # longer than 200 items re-read its first 200 tracks again and again.
    for position, item in enumerate(tqdm(x), start=1):
        track = item['track']
        song.append(track['name'])
        artist.append(track['artists'][0]['name'])
        features.append(sp.audio_features(track["uri"]))
        # Pause after every full batch to stay clear of the API rate limit.
        if position % batch_size == 0:
            sleep(randint(*pause_range))
    return pd.DataFrame({'artist': artist, 'song': song, 'features': features})

**We will run the playlist chunks two at a time, pausing between runs, to ensure the quality of the transformation.**

In [11]:
# Wait nine hours before the first extraction run.
# Disabled alternative: a full-day wait, sleep(86400)  (fri 19:09)
sleep(9 * 60 * 60)

In [12]:
%%time
# Extract artist, song and audio features for the first two chunks of
# playlist1. Each result is assigned as soon as its call returns, so the
# first dataframe survives a failure of the second call.
spotlist0df= artist_song(spotlist0)
spotlist1df= artist_song(spotlist1)

100%|███████████████████████████████████████| 1000/1000 [02:50<00:00,  5.85it/s]
100%|███████████████████████████████████████| 1000/1000 [02:59<00:00,  5.58it/s]

CPU times: user 19.6 s, sys: 4.65 s, total: 24.3 s
Wall time: 5min 51s





In [20]:
# Wait 3 h 44 min 40 s before the next extraction run (sat 23:17).
sleep(3 * 3600 + 44 * 60 + 40)

In [21]:
%%time
# Extract artist, song and audio features for chunks 2 and 3 of playlist1.
spotlist2df= artist_song(spotlist2)
spotlist3df= artist_song(spotlist3)


100%|███████████████████████████████████████| 1000/1000 [02:51<00:00,  5.84it/s]
100%|███████████████████████████████████████| 1000/1000 [03:00<00:00,  5.54it/s]

CPU times: user 20.6 s, sys: 3.99 s, total: 24.5 s
Wall time: 5min 51s





In [23]:
# Wait 10 h 45 min before the next extraction run (sun 23:25).
sleep(10 * 3600 + 45 * 60)

In [24]:
%%time
# Extract artist, song and audio features for chunks 4 and 5 of playlist1.
spotlist4df= artist_song(spotlist4)
spotlist5df= artist_song(spotlist5)


100%|███████████████████████████████████████| 1000/1000 [03:15<00:00,  5.12it/s]
100%|███████████████████████████████████████| 1000/1000 [03:04<00:00,  5.41it/s]

CPU times: user 20.5 s, sys: 4.12 s, total: 24.6 s
Wall time: 6min 20s





In [25]:
# Wait a full day before the next extraction run (mon 23:35).
sleep(24 * 60 * 60)

In [26]:
%%time
# Extract artist, song and audio features for chunks 6 and 7 of playlist1.
spotlist6df= artist_song(spotlist6)
spotlist7df= artist_song(spotlist7)


100%|███████████████████████████████████████| 1000/1000 [02:56<00:00,  5.66it/s]
100%|███████████████████████████████████████| 1000/1000 [02:55<00:00,  5.71it/s]

CPU times: user 21.2 s, sys: 4.16 s, total: 25.4 s
Wall time: 5min 51s





In [29]:
# Wait 4 h 52 min 30 s before the next extraction run (tue 23:44).
sleep(4 * 3600 + 52 * 60 + 30)

In [30]:
%%time
# Extract artist, song and audio features for the last two chunks of
# playlist1 (chunks 8 and 9).
spotlist8df= artist_song(spotlist8)
spotlist9df= artist_song(spotlist9)

100%|███████████████████████████████████████| 1000/1000 [02:57<00:00,  5.63it/s]
100%|█████████████████████████████████████████| 999/999 [02:44<00:00,  6.06it/s]


CPU times: user 20.1 s, sys: 4.02 s, total: 24.1 s
Wall time: 5min 42s


In [31]:
# Wait a full day before the next extraction run (wed 23:51).
sleep(24 * 60 * 60)

In [32]:
%%time
# Extract artist, song and audio features for the first two chunks of
# playlist2 (spotlist10 and spotlist11).
spotlist10df= artist_song(spotlist10)
spotlist11df= artist_song(spotlist11)

100%|███████████████████████████████████████| 1000/1000 [03:11<00:00,  5.22it/s]
100%|███████████████████████████████████████| 1000/1000 [03:16<00:00,  5.08it/s]

CPU times: user 20 s, sys: 4.16 s, total: 24.1 s
Wall time: 6min 28s





In [33]:
# Wait a full day before the final extraction run (thu 23:57).
sleep(24 * 60 * 60)

In [34]:
%%time
# Extract artist, song and audio features for the last two chunks of
# playlist2 (spotlist12 and spotlist13).
spotlist12df= artist_song(spotlist12)
spotlist13df= artist_song(spotlist13)

100%|███████████████████████████████████████| 1000/1000 [02:48<00:00,  5.93it/s]
100%|█████████████████████████████████████████| 878/878 [02:29<00:00,  5.87it/s]

CPU times: user 19.4 s, sys: 4.01 s, total: 23.4 s
Wall time: 5min 18s





**Check three of the lists to ensure that the API has not caused any errors.**

In [36]:
# Show the first two rows of three of the extracted dataframes.
display(
    spotlist0df.head(2),
    spotlist6df.head(2),
    spotlist12df.head(2),
)

Unnamed: 0,artist,song,features
0,Queen,A Kind Of Magic - Remastered 2011,"[{'danceability': 0.67, 'energy': 0.776, 'key'..."
1,Queen,Bohemian Rhapsody - Remastered 2011,"[{'danceability': 0.414, 'energy': 0.404, 'key..."


Unnamed: 0,artist,song,features
0,Blind Guardian,"Sacred Worlds - Extended ""Sacred""","[{'danceability': 0.154, 'energy': 0.908, 'key..."
1,Blind Guardian,Majesty,"[{'danceability': 0.252, 'energy': 0.861, 'key..."


Unnamed: 0,artist,song,features
0,Europe,Ninja,"[{'danceability': 0.471, 'energy': 0.882, 'key..."
1,Europe,Cherokee,"[{'danceability': 0.585, 'energy': 0.804, 'key..."


In [38]:
# Collect the per-chunk dataframes in extraction order.
spotlistdf = [
    spotlist0df, spotlist1df, spotlist2df, spotlist3df, spotlist4df,
    spotlist5df, spotlist6df, spotlist7df, spotlist8df, spotlist9df,
    spotlist10df, spotlist11df, spotlist12df, spotlist13df,
]

# Stack them row-wise and renumber the rows from 0.
playlist = pd.concat(spotlistdf, ignore_index=True)
playlist.head()

Unnamed: 0,artist,song,features
0,Queen,A Kind Of Magic - Remastered 2011,"[{'danceability': 0.67, 'energy': 0.776, 'key'..."
1,Queen,Bohemian Rhapsody - Remastered 2011,"[{'danceability': 0.414, 'energy': 0.404, 'key..."
2,Queen,Another One Bites The Dust - Remastered 2011,"[{'danceability': 0.933, 'energy': 0.528, 'key..."
3,Queen,Don't Stop Me Now - Remastered 2011,"[{'danceability': 0.559, 'energy': 0.868, 'key..."
4,Queen,Under Pressure - Remastered 2011,"[{'danceability': 0.671, 'energy': 0.712, 'key..."


 **We can see that the features are packed into a list. We will need to unpack these and disperse them into columns**

In [40]:
# Unwrap the features: each entry is the list returned by sp.audio_features
# for a single track, so keep its only element (None for an empty list).
# Entries that are not lists are left as they are, which makes the cell safe
# to re-run; the previous json_normalize call raised on a second run.
playlist['features'] = playlist['features'].map(
    lambda entry: (entry[0] if entry else None) if isinstance(entry, list) else entry
)


In [41]:
# Inspect the dataframe after unpacking the 'features' column.
playlist

Unnamed: 0,artist,song,features
0,Queen,A Kind Of Magic - Remastered 2011,"{'danceability': 0.67, 'energy': 0.776, 'key':..."
1,Queen,Bohemian Rhapsody - Remastered 2011,"{'danceability': 0.414, 'energy': 0.404, 'key'..."
2,Queen,Another One Bites The Dust - Remastered 2011,"{'danceability': 0.933, 'energy': 0.528, 'key'..."
3,Queen,Don't Stop Me Now - Remastered 2011,"{'danceability': 0.559, 'energy': 0.868, 'key'..."
4,Queen,Under Pressure - Remastered 2011,"{'danceability': 0.671, 'energy': 0.712, 'key'..."
...,...,...,...
13872,Chris Cornell,Silence The Voices,"{'danceability': 0.429, 'energy': 0.581, 'key'..."
13873,Chris Cornell,Disappearing Act,"{'danceability': 0.312, 'energy': 0.443, 'key'..."
13874,Chris Cornell,"You Know My Name - From ""Casino Royale"" Soundt...","{'danceability': 0.371, 'energy': 0.86, 'key':..."
13875,Chris Cornell,Today - Non-LP Version,"{'danceability': 0.622, 'energy': 0.832, 'key'..."


In [42]:
# Turn the feature dictionaries into a dataframe with one column per feature;
# rows without features are left out.
playlistfeatures = pd.DataFrame.from_records(
    playlist['features'].dropna().to_list()
)

In [45]:
# Inspect the dataframe that holds one column per audio feature.
playlistfeatures

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.670,0.776,9,-5.874,1,0.0356,0.0184,0.002940,0.1280,0.703,130.128,audio_features,5RYLa5P4qweEAKq5U1gdcK,spotify:track:5RYLa5P4qweEAKq5U1gdcK,https://api.spotify.com/v1/tracks/5RYLa5P4qweE...,https://api.spotify.com/v1/audio-analysis/5RYL...,264253,4
1,0.414,0.404,0,-9.928,0,0.0499,0.2710,0.000000,0.3000,0.224,71.105,audio_features,4u7EnebtmKWzUH433cf5Qv,spotify:track:4u7EnebtmKWzUH433cf5Qv,https://api.spotify.com/v1/tracks/4u7EnebtmKWz...,https://api.spotify.com/v1/audio-analysis/4u7E...,354320,4
2,0.933,0.528,5,-6.472,0,0.1610,0.1120,0.312000,0.1630,0.754,109.967,audio_features,5vdp5UmvTsnMEMESIF2Ym7,spotify:track:5vdp5UmvTsnMEMESIF2Ym7,https://api.spotify.com/v1/tracks/5vdp5UmvTsnM...,https://api.spotify.com/v1/audio-analysis/5vdp...,214653,4
3,0.559,0.868,5,-5.276,1,0.1700,0.0475,0.000176,0.7760,0.609,156.295,audio_features,5T8EDUDqKcs6OSOwEsfqG7,spotify:track:5T8EDUDqKcs6OSOwEsfqG7,https://api.spotify.com/v1/tracks/5T8EDUDqKcs6...,https://api.spotify.com/v1/audio-analysis/5T8E...,209413,4
4,0.671,0.712,2,-7.815,1,0.0476,0.4290,0.000000,0.1030,0.462,113.805,audio_features,2fuCquhmrzHpu5xcA1ci9x,spotify:track:2fuCquhmrzHpu5xcA1ci9x,https://api.spotify.com/v1/tracks/2fuCquhmrzHp...,https://api.spotify.com/v1/audio-analysis/2fuC...,248440,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13872,0.429,0.581,2,-4.839,1,0.0297,0.2110,0.000000,0.0837,0.136,128.104,audio_features,0TrRFrKpAisDp6hAZMliLy,spotify:track:0TrRFrKpAisDp6hAZMliLy,https://api.spotify.com/v1/tracks/0TrRFrKpAisD...,https://api.spotify.com/v1/audio-analysis/0TrR...,267333,4
13873,0.312,0.443,2,-5.640,0,0.0310,0.4590,0.000000,0.1000,0.312,151.005,audio_features,6zPwnKp2kMHP4OFYfrY67q,spotify:track:6zPwnKp2kMHP4OFYfrY67q,https://api.spotify.com/v1/tracks/6zPwnKp2kMHP...,https://api.spotify.com/v1/audio-analysis/6zPw...,273400,3
13874,0.371,0.860,11,-3.263,0,0.0464,0.0226,0.000000,0.3160,0.376,137.405,audio_features,4HHIVS7mHAXqXVebo3k5Um,spotify:track:4HHIVS7mHAXqXVebo3k5Um,https://api.spotify.com/v1/tracks/4HHIVS7mHAXq...,https://api.spotify.com/v1/audio-analysis/4HHI...,240120,4
13875,0.622,0.832,6,-4.289,0,0.0375,0.2400,0.000000,0.2850,0.563,112.024,audio_features,07xh9cA0QBjWfrZ7xQ62hI,spotify:track:07xh9cA0QBjWfrZ7xQ62hI,https://api.spotify.com/v1/tracks/07xh9cA0QBjW...,https://api.spotify.com/v1/audio-analysis/07xh...,183040,4


**Concatenating the features dataframes with that of the song names and artists.**

In [48]:
#reset indices of both dataframes
playlist = playlist.reset_index(drop = True)
playlistfeatures = playlistfeatures.reset_index(drop = True)

# playlistfeatures was built only from the rows that have features, so it can
# be shorter than playlist. Keep just those rows of playlist before joining
# side by side; otherwise every song after a missing entry would be paired
# with another song's features.
songs_with_features = playlist.loc[playlist['features'].notna()].reset_index(drop = True)
assert len(songs_with_features) == len(playlistfeatures), \
    "songs and feature rows no longer line up"

#concatenate dataframes and drop the columns that are not needed
playlist_db = pd.concat([songs_with_features, playlistfeatures], axis = 1)
playlist_db = playlist_db.reset_index(drop = True)
playlist_db = playlist_db.drop(columns = ["features","type","track_href","analysis_url", 'key', 'id', 'uri', 'duration_ms','mode', 'time_signature'])
playlist_db

Unnamed: 0,artist,song,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,Queen,A Kind Of Magic - Remastered 2011,0.670,0.776,-5.874,0.0356,0.0184,0.002940,0.1280,0.703,130.128
1,Queen,Bohemian Rhapsody - Remastered 2011,0.414,0.404,-9.928,0.0499,0.2710,0.000000,0.3000,0.224,71.105
2,Queen,Another One Bites The Dust - Remastered 2011,0.933,0.528,-6.472,0.1610,0.1120,0.312000,0.1630,0.754,109.967
3,Queen,Don't Stop Me Now - Remastered 2011,0.559,0.868,-5.276,0.1700,0.0475,0.000176,0.7760,0.609,156.295
4,Queen,Under Pressure - Remastered 2011,0.671,0.712,-7.815,0.0476,0.4290,0.000000,0.1030,0.462,113.805
...,...,...,...,...,...,...,...,...,...,...,...
13872,Chris Cornell,Silence The Voices,0.429,0.581,-4.839,0.0297,0.2110,0.000000,0.0837,0.136,128.104
13873,Chris Cornell,Disappearing Act,0.312,0.443,-5.640,0.0310,0.4590,0.000000,0.1000,0.312,151.005
13874,Chris Cornell,"You Know My Name - From ""Casino Royale"" Soundt...",0.371,0.860,-3.263,0.0464,0.0226,0.000000,0.3160,0.376,137.405
13875,Chris Cornell,Today - Non-LP Version,0.622,0.832,-4.289,0.0375,0.2400,0.000000,0.2850,0.563,112.024


In [47]:
# Check the number of rows and columns of the final table.
playlist_db.shape

(13877, 11)

## Export dataframe to csv.

In [49]:
# Write the final table to disk without the index column.
playlist_db.to_csv(path_or_buf='song_recommender_playlist.csv', index=False)