In [60]:
import re
from urllib.parse import quote

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [61]:
# for Spotify API
# load local environment variables (client id, client secret) from the .env file
import os
from dotenv import load_dotenv
load_dotenv()

True

## Finding Billboard Top 50 (not 100, so we don't overload Spotify API with requests)


In [62]:
# BeautifulSoup scraping for artist names and song titles.
url = 'https://www.billboard.com/charts/hot-100/2022-05-21/'
# Fail fast on a hung connection or an HTTP error page instead of parsing it
# and silently ending up with empty result lists.
chart_response = requests.get(url, timeout=30)
chart_response.raise_for_status()
soup = BeautifulSoup(chart_response.content, 'html.parser')
# NOTE: 'a-no-trucate' is the class string the original scrape matched on;
# keep the spelling as-is.
raw_song_names = soup.find_all('h3', attrs = {'id':'title-of-a-story','class':'a-no-trucate'})
# The previous dict literal listed the 'class' key twice ('c-label' and
# 'a-no-trucate'); Python keeps only the last value, so the effective filter
# was always just 'a-no-trucate'. Spell that out instead of carrying a dead key.
raw_artists = soup.find_all('span', attrs = {'class':'a-no-trucate'})


In [63]:
def clean_artist_name(raw_artist):
  """Turn a Billboard artist credit into plain search terms for Spotify.

  Removes the collaborator separators "&", "Featuring" and a stand-alone "X",
  because they interfere with the Spotify search query. Only a stand-alone
  "X" token is removed, so names that merely contain the letter are left
  intact, and the "X" that belongs to "Lil Nas X" is kept. Runs of whitespace
  left behind by the removals are collapsed to single spaces.

  Parameters:
    raw_artist: the artist credit text as scraped from the chart page.

  Returns:
    The cleaned credit as a str.
  """
  cleaned = raw_artist.replace("&", " ").replace("Featuring", " ")
  # \bX\b matches "X" only as its own word; the lookbehind skips the one that
  # is part of the name "Lil Nas X".
  cleaned = re.sub(r"(?<!Lil Nas )\bX\b", " ", cleaned)
  return " ".join(cleaned.split())

#song titles
song_title = [str(raw_name.get_text().strip()) for raw_name in raw_song_names]

#artist names
song_artist = [clean_artist_name(raw_artist.get_text()) for raw_artist in raw_artists]

In [64]:
#consolidating song title, song_artist into new dataframe, billboard DB
# Keep the first 50 chart positions. head(50) (rather than dropping the last
# 50 rows) yields the top 50 even when the scrape did not return exactly 100
# entries, and avoids mutating the frame in place.
billboard_db = pd.DataFrame({'Titles': song_title, 'Artists': song_artist}).head(50)

In [65]:
# Display the chart table (song titles and cleaned artist names).
billboard_db

Unnamed: 0,Titles,Artists
0,First Class,Jack Harlow
1,As It Was,Harry Styles
2,Wait For U,Future Drake Tems
3,Moscow Mule,Bad Bunny
4,Titi Me Pregunto,Bad Bunny
5,Despues de La Playa,Bad Bunny
6,Heat Waves,Glass Animals
7,Big Energy,Latto
8,About Damn Time,Lizzo
9,Me Porto Bonito,Bad Bunny Chencho Corleone


## Spotify Client Credentials Flow

In [66]:
#Auth keys are stored locally in a .env file, to keep hidden.
# Token endpoint for Spotify's accounts service.
AUTH_URL = 'https://accounts.spotify.com/api/token'
# Client id / secret are read from the environment; each is None when the
# corresponding variable is not set.
CL_id, CL_secret = (os.environ.get(env_key) for env_key in ('CL_ID', 'CL_SECRET'))


In [67]:
#client credentials flow

#request authorization
# Request an access token by posting the client id/secret as form data.
auth_resp = requests.post(
    AUTH_URL,
    data={
        'grant_type': 'client_credentials',
        'client_id': CL_id,
        'client_secret': CL_secret,
    },
    timeout=30,
)
# Surface bad credentials / HTTP failures here, with the real status code,
# rather than as a KeyError on 'access_token' below.
auth_resp.raise_for_status()

# convert the response to JSON
auth_resp_data = auth_resp.json()

# save the access token
access_token = auth_resp_data['access_token']

In [68]:
# using the access token

#endpoint to hit 
# Base search URL; the search text and remaining parameters are appended to it.
endpoint = 'https://api.spotify.com/v1/search?q='

# Headers passed along with the search request: bearer token + JSON content type.
headers = {
  'Authorization': ' '.join(('Bearer', access_token)),
  'Content-Type': 'application/json',
}

In [69]:
#helper function for get_song_info to convert a duration from milliseconds to a readable minutes:seconds format
def song_time(duration):
  """Format a duration given in milliseconds as a "M:SS" string.

  Fractions of a second are truncated, not rounded, and the seconds part is
  zero-padded to two digits.
  """
  whole_seconds = int(duration / 1000)
  whole_minutes = int(whole_seconds / 60)
  leftover_seconds = whole_seconds - 60 * whole_minutes
  return f"{whole_minutes}:{leftover_seconds:02}"

In [70]:
# Function to create a dataframe based on the information that we want to receive:
# the popularity score (out of 100), the explicit flag (safe for school?), the duration in minutes:seconds, the release date, and a preview URL if you want to listen
# WARNING TO TURN YOUR AUDIO DOWN IF YOU CLICK ON PREVIEW (default sound setting is LOUD).

def get_song_info(data):
  """Build a DataFrame of track details from a search response.

  Parameters:
    data: parsed JSON (dict) with the track entries under
      data['tracks']['items'].

  Returns:
    A DataFrame with one row per track entry and the columns 'popularity',
    'release_date' (taken from the entry's album), 'explicit', 'duration'
    (the entry's 'duration_ms' formatted by song_time) and 'preview_url'.
  """
  tracks = data['tracks']['items']

  # One list per output column, each built straight from the track entries.
  return pd.DataFrame({
    'popularity': [track['popularity'] for track in tracks],
    'release_date': [track['album']['release_date'] for track in tracks],
    'explicit': [track['explicit'] for track in tracks],
    # milliseconds -> "M:SS" text via the helper
    'duration': [str(song_time(track['duration_ms'])) for track in tracks],
    'preview_url': [track['preview_url'] for track in tracks],
  })

In [71]:
# additional helper function for legibility:
# responsible for the request to Spotify API, with the values of title and artist + header
# reads data in json, passes above function to create dataframe with data

def get_spot_info(title,artist):
  """Search Spotify for one track and return its details as a DataFrame.

  Parameters:
    title: song title used in the search text.
    artist: artist name(s) used in the search text.

  Returns:
    The DataFrame produced by get_song_info for the search response
    (the request asks for at most one track).

  Raises:
    requests.HTTPError: if Spotify answers with an error status.
  """
  # Percent-encode the search text so characters such as '&', '#', '+' or '?'
  # in a title/artist cannot cut the query short or inject extra parameters.
  query = quote(title + ' ' + artist, safe='')
  r = requests.get(endpoint + query + '&type=track' + '&limit=1', headers=headers, timeout=30)
  # Report expired tokens / rate limiting directly instead of failing later
  # with a KeyError on the missing 'tracks' key.
  r.raise_for_status()
  return get_song_info(data=r.json())


In [72]:
#empty dataframe for our spotify data
# this is the heart of the program: iterate through the billboard dataframe and
# pass each song title and artist name to the Spotify search, collecting the
# 5 values (popularity, release date, explicit, duration, preview url) per song.
track_frames = []
for title, artist in zip(billboard_db['Titles'], billboard_db['Artists']):
  track_df = get_spot_info(title=title, artist=artist)
  if track_df.empty:
    # No match on Spotify: keep an all-NaN placeholder row so every chart
    # entry still contributes exactly one row. Otherwise all later rows would
    # be paired with the wrong song when joined to billboard_db by position.
    track_df = track_df.reindex([0])
  track_frames.append(track_df.head(1))

# Concatenate once instead of re-copying the growing frame on every iteration.
# The rows are deliberately stored last-song-first, because the following cell
# reverses spotify_df back into chart order.
spotify_df = (
  pd.concat(track_frames[::-1], ignore_index=True) if track_frames else pd.DataFrame()
)


In [73]:
#reordering index, as the concat reverses index.
# The lookup cell leaves the rows in reverse chart order; flip them back and
# renumber from 0.
# NOTE: not idempotent - re-running only this cell reverses the order again.
spotify_df = spotify_df.iloc[::-1].reset_index(drop=True)
spotify_df

Unnamed: 0,popularity,release_date,explicit,duration,preview_url
0,94,2022-04-08,True,2:53,https://p.scdn.co/mp3-preview/27913e771fb16c54...
1,100,2022-03-31,False,2:47,https://p.scdn.co/mp3-preview/e9216304e6456a90...
2,86,2022-04-29,True,3:09,https://p.scdn.co/mp3-preview/a225651eb324272a...
3,92,2022-05-06,True,4:05,https://p.scdn.co/mp3-preview/585736d2d4dba1ee...
4,90,2022-05-06,False,4:03,https://p.scdn.co/mp3-preview/a174cca3a21422be...
5,89,2022-05-06,False,3:50,https://p.scdn.co/mp3-preview/9dddf0cfb0bd4f88...
6,85,2020-08-07,False,3:58,
7,78,2021-09-24,True,2:53,https://p.scdn.co/mp3-preview/ab84902c58e0faf0...
8,89,2022-04-14,True,3:11,https://p.scdn.co/mp3-preview/1e881f08e10f2f8d...
9,91,2022-05-06,True,2:58,https://p.scdn.co/mp3-preview/8513b15fb5b39966...


In [74]:
#third dataframe that combines both
#third dataframe that combines both

# The two frames are joined purely by row position, so a differing row count
# would silently pair songs with the wrong Spotify data (and pad with NaN).
assert len(billboard_db) == len(spotify_df), (
  "billboard_db and spotify_df have different row counts; "
  "rows would be paired with the wrong song"
)

# merging billboard and spotify dataframes horizontally, creating a dataframe that contains
# information from both, showing the top 50 titles.
top50_df = pd.concat([billboard_db,spotify_df],axis=1)
top50_df

#another warning to turn volume down if you click the preview URL (spotify default is loud)

Unnamed: 0,Titles,Artists,popularity,release_date,explicit,duration,preview_url
0,First Class,Jack Harlow,94,2022-04-08,True,2:53,https://p.scdn.co/mp3-preview/27913e771fb16c54...
1,As It Was,Harry Styles,100,2022-03-31,False,2:47,https://p.scdn.co/mp3-preview/e9216304e6456a90...
2,Wait For U,Future Drake Tems,86,2022-04-29,True,3:09,https://p.scdn.co/mp3-preview/a225651eb324272a...
3,Moscow Mule,Bad Bunny,92,2022-05-06,True,4:05,https://p.scdn.co/mp3-preview/585736d2d4dba1ee...
4,Titi Me Pregunto,Bad Bunny,90,2022-05-06,False,4:03,https://p.scdn.co/mp3-preview/a174cca3a21422be...
5,Despues de La Playa,Bad Bunny,89,2022-05-06,False,3:50,https://p.scdn.co/mp3-preview/9dddf0cfb0bd4f88...
6,Heat Waves,Glass Animals,85,2020-08-07,False,3:58,
7,Big Energy,Latto,78,2021-09-24,True,2:53,https://p.scdn.co/mp3-preview/ab84902c58e0faf0...
8,About Damn Time,Lizzo,89,2022-04-14,True,3:11,https://p.scdn.co/mp3-preview/1e881f08e10f2f8d...
9,Me Porto Bonito,Bad Bunny Chencho Corleone,91,2022-05-06,True,2:58,https://p.scdn.co/mp3-preview/8513b15fb5b39966...
