In [196]:
import numpy as np
import pandas as pd
import requests
import base64
import itertools
import spotipy
from spotipy.oauth2 import SpotifyOAuth

api_keys = pd.read_csv('apikeys.csv',header=None)
api_keys = api_keys.iloc[0].to_numpy()

# Data Collection

In [242]:
#Spotify api
def refresh_key(CLIENT_ID, CLIENT_SECRET):
    # Base64 encode the client ID and client secret
    client_credentials = f"{CLIENT_ID}:{CLIENT_SECRET}"
    client_credentials_base64 = base64.b64encode(client_credentials.encode())

    # Request the access token
    token_url = 'https://accounts.spotify.com/api/token'
    headers = {
        'Authorization': f'Basic {client_credentials_base64.decode()}'
    }
    data = {
        'grant_type': 'client_credentials'
    }

    response = requests.post(token_url, data=data, headers=headers)

    if response.status_code == 200:
        access_token = response.json()['access_token']
        print("Access token obtained successfully.")
    else:
        print("Error obtaining access token.")
        exit()

    #spotify access
    sp = spotipy.Spotify(auth=access_token)
    return(sp)

sp = refresh_key(api_keys[0],api_keys[1])

Access token obtained successfully.


In [240]:
#given track id, return relevant info
def get_track_info(track_id):
    track = sp.track(track_id)
    artist_id = track['album']['artists'][0]['id']
    artist = sp.artist(artist_id)

    #split obscure genres into (hopefully) recognisable ones
    artist_genres = artist['genres']
    artist_genres = [s.split() for s in artist_genres]
    artist_genres = list(itertools.chain.from_iterable(artist_genres))
    artist_genres = list(set(artist_genres))
    #check if in top 10 categories
    top_genres = ['rap', 'pop', 'rock', 'country', 'edm', 'latin', 'k-pop', 'r&b', 'classical', 'indie']
    artist_top_genre = list(set(top_genres).intersection(artist_genres))

    if len(artist_top_genre) != 0:
        g = len(artist_top_genre) #adjust for multi-genre artists
    else: g = 1; artist_top_genre = 'other'; print(artist_genres)

    audio_features = sp.audio_features(track_id)[0]

    track_info = pd.DataFrame({
        #track info
        'track_id': [track_id] * g,
        'track_name': [track['name']] * g,
        'artist_id': [artist_id] * g,
        'artist_name': [track['album']['artists'][0]['name']] * g,
        'artist_genre': artist_top_genre,
        'release_date': [track['album']['release_date']] * g,
        'popularity': [track['popularity']] * g,

        #song data
        'explicit': track['explicit'],
        'danceability': audio_features['danceability'],
        'energy': audio_features['energy'],
        'key': audio_features['key'],
        'loudness': audio_features['loudness'] ,
        'mode': audio_features['mode'],
        'speechiness': audio_features['speechiness'],
        'acousticness': audio_features['acousticness'],
        'instrumentalness': audio_features['instrumentalness'],
        'liveness': audio_features['liveness'],
        'valence': audio_features['valence'],
        'tempo': audio_features['tempo']
    })
    return(track_info)

In [244]:
#given playlist id, return info on all tracks
def get_playlist_info(playlist_id):
    playlist = sp.playlist(playlist_id)
    length_pl = playlist['tracks']['total']

    df = get_track_info(playlist['tracks']['items'][0]['track']['id'])
    for i in range(1,length_pl):
        df = pd.concat([df, get_track_info(playlist['tracks']['items'][i]['track']['id'])])
        
    return(df.reset_index())

#get_playlist_info('1W7icVcOm0IFXdvTpcdE3D')

[]
[]


Unnamed: 0,index,track_id,track_name,artist_id,artist_name,artist_genre,release_date,popularity,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,6eUKZXaKkcviH0Ku9w2n3V,Ed Sheeran,pop,2019-06-14,42,False,0.748,0.916,6,-2.634,1,0.0583,0.102,0,0.0653,0.518,122.036
1,0,745LUoE5rQZ51Tz4WnBoNp,Try Again (feat. A7S),4cSYNpczcvTUpnPMFDLsIc,Raaban,pop,2019-05-03,29,False,0.707,0.856,0,-3.861,0,0.205,0.142,0,0.124,0.51,126.914
2,1,745LUoE5rQZ51Tz4WnBoNp,Try Again (feat. A7S),4cSYNpczcvTUpnPMFDLsIc,Raaban,edm,2019-05-03,29,False,0.707,0.856,0,-3.861,0,0.205,0.142,0,0.124,0.51,126.914
3,0,1mXuMM6zjPgjL4asbBsgnt,Firework,6jJ0s89eD6GaHleKKya26X,Katy Perry,pop,2010-08-24,68,False,0.638,0.831,8,-5.039,1,0.049,0.142,0,0.113,0.649,124.071
4,0,3OXpOuqxPaFDO9rLNn1Jvb,Cuddle Up,5nfow6tv4Dtm6K4WHzczBI,Catey Shaw,indie,2014-11-27,16,False,0.692,0.703,0,-5.195,1,0.0308,0.108,0,0.109,0.496,124.989
5,0,18W92Zm1KjLCbUIszOhpkD,I Wanna Know (feat. Bea Miller),5jAMCwdNHWr7JThxtMuEyy,NOTD,pop,2018-03-16,61,False,0.661,0.725,6,-4.859,1,0.0563,0.0253,0,0.123,0.605,119.927
6,1,18W92Zm1KjLCbUIszOhpkD,I Wanna Know (feat. Bea Miller),5jAMCwdNHWr7JThxtMuEyy,NOTD,edm,2018-03-16,61,False,0.661,0.725,6,-4.859,1,0.0563,0.0253,0,0.123,0.605,119.927
7,0,43wzy7JxMEfvCh8ZFeZKYk,Right Now,4Rxn7Im3LGfyRkY2FlHhWi,Nick Jonas,pop,2018-08-24,54,False,0.597,0.751,6,-4.982,0,0.193,0.0105,0,0.0913,0.567,103.954
8,0,47Slg6LuqLaX0VodpSCvPt,Just the Way You Are,0du5cEVh5yTK9QJze8zA0C,Bruno Mars,pop,2010-05-11,79,False,0.635,0.841,5,-5.379,1,0.0422,0.0134,0,0.0622,0.424,109.021
9,0,3AsOKLwiG1nEMCFcyiC3tM,Get What You Give,6roDXEmZ6AARdOUv6x5U2v,Felix Cartal,pop,2017-06-09,21,False,0.626,0.748,1,-5.899,1,0.0345,0.0143,0,0.123,0.222,121.067


In [295]:
#generate large dataset
#web scraping
from bs4 import BeautifulSoup
from string import digits


#web scraping
url = 'https://kworb.net/spotify/songs.html'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
titles = soup.find_all('div')[5:]
# #formatting strings
# songname = titles[1].get_text().split(" - ")
# songname[1] = songname[1].translate(str.maketrans('', '', digits)).replace(",", "")

# songs_df = pd.DataFrame({
#     'artist': [songname[0]],
#     'song_title': [songname[1]]
# })

# for i in range(2,len(titles)):
#     songname = titles[i].get_text().split(" - ")
#     songname[1] = songname[1].translate(str.maketrans('', '', digits)).replace(",", "")
#     temp_df = pd.DataFrame({
#         'artist': [songname[0]],
#         'song_title': [songname[1]]
#         })
#     songs_df = pd.concat([songs_df, temp_df])

# songs_df

In [296]:
for i in range(0,len(titles)):
    titles[i].replace("<div>", "").replace("</div>", "")


TypeError: 'NoneType' object is not callable

Sources

https://medium.com/@shruti.somankar/building-a-music-recommendation-system-using-spotify-api-and-python-f7418a21fa41

https://www.unchainedmusic.io/blog-posts/top-music-genres-in-order-the-most-popular-genres-worldwide

https://kworb.net/spotify/songs.html