Using Beautiful Soup, we will scrape the Billboard Year-End Hot 100 songs from 2001 to 2021 from Wikipedia.

In [1]:
# Import required packages
import os
import pickle
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as soup

In [4]:
# Scrape data
# For every year, download the Wikipedia year-end chart page and collect one
# [song title, artist(s), year] entry per chart row into `result`.
result = []
for year in range(2001, 2022):
    print("Scraping for year: {}".format(year))
    url = 'https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_' + str(year)

    # Fail fast on network problems: the timeout keeps the cell from hanging
    # indefinitely and raise_for_status() stops an HTTP error page from being parsed.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    bsoup = soup(response.text, 'html.parser')  # Create object of BeautifulSoup

    table = bsoup.find("table", class_='wikitable sortable')  # Table class is wikitable sortable
    if table is None:
        raise ValueError("No 'wikitable sortable' table found at {}".format(url))
    # <tbody> is optional in HTML source, so fall back to the table itself.
    table_body = table.find('tbody') or table
    rows = table_body.find_all('tr')  # Find all rows in table

    # For each song row for a year
    for row in rows:
        # Stripped text of every <td> cell in the row
        columns = [elem.text.strip() for elem in row.find_all("td")]
        if len(columns) < 3:
            # Rows with fewer than three <td> cells (such as header rows made
            # of <th> cells) do not carry a title and an artist; skip them.
            continue
        result.append([columns[1], columns[2], year])

Scraping for year: 2001
Scraping for year: 2002
Scraping for year: 2003
Scraping for year: 2004
Scraping for year: 2005
Scraping for year: 2006
Scraping for year: 2007
Scraping for year: 2008
Scraping for year: 2009
Scraping for year: 2010
Scraping for year: 2011
Scraping for year: 2012
Scraping for year: 2013
Scraping for year: 2014
Scraping for year: 2015
Scraping for year: 2016
Scraping for year: 2017
Scraping for year: 2018
Scraping for year: 2019
Scraping for year: 2020
Scraping for year: 2021


In [5]:
# Build the frame with its column labels in a single step; unlike assigning
# `df.columns` afterwards, this also works when `result` is empty.
df = pd.DataFrame(result, columns=["song_name", "artists_name", "year"])
df.head()

Unnamed: 0,song_name,artists_name,year
0,"""Hanging by a Moment""",Lifehouse,2001
1,"""Fallin'""",Alicia Keys,2001
2,"""All for You""",Janet,2001
3,"""Drops of Jupiter (Tell Me)""",Train,2001
4,"""I'm Real (Murder Remix)""",Jennifer Lopez featuring Ja Rule,2001


In [7]:
# Each song contains double quotes, so replacing it with ''
def remove_quotes(text):
    """Return `text` with every double-quote character (") removed."""
    pieces = text.split('"')
    return "".join(pieces)

# Strip the double quotes from every title, then show the updated frame
titles_without_quotes = df["song_name"].apply(remove_quotes)
df["song_name"] = titles_without_quotes
df

Unnamed: 0,song_name,artists_name,year
0,Hanging by a Moment,Lifehouse,2001
1,Fallin',Alicia Keys,2001
2,All for You,Janet,2001
3,Drops of Jupiter (Tell Me),Train,2001
4,I'm Real (Murder Remix),Jennifer Lopez featuring Ja Rule,2001
...,...,...,...
2095,Things a Man Oughta Know,Lainey Wilson,2021
2096,Throat Baby (Go Baby),BRS Kash,2021
2097,Tombstone,Rod Wave,2021
2098,Drinkin' Beer. Talkin' God. Amen.,Chase Rice featuring Florida Georgia Line,2021


In [8]:
# Report the frame's shape before and after discarding exact duplicate rows
shape_before = df.shape
print("Shape of DataFrame: ", shape_before)

print("Dropping duplicates:")
df = df.drop_duplicates()

shape_after = df.shape
print("Shape of DataFrame after removing duplicates: ", shape_after)

Shape of DataFrame:  (2100, 3)
Dropping duplicates:
Shape of DataFrame after removing duplicates:  (2100, 3)


In [9]:
# Refer: https://medium.com/@maxtingle/getting-started-with-spotifys-api-spotipy-197c3dc6353b
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Credentials are read from the environment so that secrets are never stored in
# the notebook. NOTE(review): a client id/secret pair used to be hard-coded in
# this cell; treat that pair as leaked and rotate it.
SPOTIPY_CLIENT_ID = os.environ.get("SPOTIPY_CLIENT_ID")
SPOTIPY_CLIENT_SECRET = os.environ.get("SPOTIPY_CLIENT_SECRET")
if not SPOTIPY_CLIENT_ID or not SPOTIPY_CLIENT_SECRET:
    raise RuntimeError(
        "Set the SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET environment "
        "variables before running this cell."
    )

# Client-credentials flow: `sp` is the Spotify client used by the cells below.
client_credentials_manager = SpotifyClientCredentials(client_id=SPOTIPY_CLIENT_ID, client_secret=SPOTIPY_CLIENT_SECRET)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [10]:
def get_artist_name(row):
    """Return the lead artist of a chart row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide an ``'artists_name'`` string, e.g.
        ``"Jennifer Lopez featuring Ja Rule"``.

    Returns
    -------
    str
        The text before the first ``"featuring"`` with surrounding whitespace
        removed, or the whole (stripped) value when ``"featuring"`` is absent.
    """
    # split() returns the whole string as its only element when the separator
    # is missing, so no separate branch is needed. strip() drops the space that
    # precedes "featuring", which would otherwise end up in the search query.
    return row['artists_name'].split("featuring", 1)[0].strip()
 
# Compute the lead artist for every row, store it as a new column, show the frame
lead_artists = df.apply(get_artist_name, axis=1)
df['artist_name'] = lead_artists
df

Unnamed: 0,song_name,artists_name,year,artist_name
0,Hanging by a Moment,Lifehouse,2001,Lifehouse
1,Fallin',Alicia Keys,2001,Alicia Keys
2,All for You,Janet,2001,Janet
3,Drops of Jupiter (Tell Me),Train,2001,Train
4,I'm Real (Murder Remix),Jennifer Lopez featuring Ja Rule,2001,Jennifer Lopez
...,...,...,...,...
2095,Things a Man Oughta Know,Lainey Wilson,2021,Lainey Wilson
2096,Throat Baby (Go Baby),BRS Kash,2021,BRS Kash
2097,Tombstone,Rod Wave,2021,Rod Wave
2098,Drinkin' Beer. Talkin' God. Amen.,Chase Rice featuring Florida Georgia Line,2021,Chase Rice


In [11]:
# Search text for Spotify: song title and lead artist separated by one space
titles = df['song_name']
lead_artist_names = df["artist_name"]
df['search_query'] = titles + " " + lead_artist_names
df.head()

Unnamed: 0,song_name,artists_name,year,artist_name,search_query
0,Hanging by a Moment,Lifehouse,2001,Lifehouse,Hanging by a Moment Lifehouse
1,Fallin',Alicia Keys,2001,Alicia Keys,Fallin' Alicia Keys
2,All for You,Janet,2001,Janet,All for You Janet
3,Drops of Jupiter (Tell Me),Train,2001,Train,Drops of Jupiter (Tell Me) Train
4,I'm Real (Murder Remix),Jennifer Lopez featuring Ja Rule,2001,Jennifer Lopez,I'm Real (Murder Remix) Jennifer Lopez


In [12]:
# One independent, initially empty accumulator list per collected track field
(ids, acousticness, artist_name, danceability, duration_ms, energy,
 instrumentalness, key_list, liveness, loudness, mode_list, pop_track,
 rel_date, speechiness, tempo, track_name, valence) = ([] for _ in range(17))

# Running counter, starts at zero
count = 0

def get_track_info(search):
    """Look up one track on Spotify and record its metadata and audio features.

    The track search, the artist lookup and the audio-features lookup are each
    memoised in the module-level dicts ``search_cache``, ``artist_cache`` and
    ``audioFeat_cache``; only cache misses reach the Spotify client ``sp``.

    On success exactly one value is appended to every module-level accumulator
    list (``ids``, ``artist_name``, ``track_name``, ``rel_date``, ``pop_track``
    and the audio-feature lists). Audio features that are unavailable are
    recorded as NaN. When the search has no hit or any lookup fails, the query
    is reported and skipped and no list is touched, so all lists always keep
    the same length.

    Parameters
    ----------
    search : str
        Free-text query passed to ``sp.search``.

    Returns
    -------
    None
    """
    global count, search_cache, audioFeat_cache, artist_cache

    # Progress marker every 200 processed queries
    if count % 200 == 0:
        print("Processing...")
    count += 1

    try:
        # Search the track (first hit only)
        if search in search_cache:
            track_results = search_cache[search]
        else:
            track_results = sp.search(q=search, type='track', limit=1, offset=0)
            search_cache[search] = track_results

        items = track_results['tracks']['items']
        if not items:
            print("No Spotify match for {!r}; skipping".format(search))
            return
        track = items[0]

        track_id = track['id']
        track_uri = track['uri']
        title = track['name']
        release_date = track['album']['release_date']
        popularity = track['popularity']

        # Artist info: the first artist credited on the track's album
        artist_uri = track['album']['artists'][0]['uri']
        if artist_uri in artist_cache:
            artist = artist_cache[artist_uri]
        else:
            artist = sp.artist(artist_uri)
            artist_cache[artist_uri] = artist
        artist_display_name = artist['name']

        # Audio features (the cached/returned value may be None)
        if track_uri in audioFeat_cache:
            audio_features = audioFeat_cache[track_uri]
        else:
            audio_features = sp.audio_features(track_uri)[0]
            audioFeat_cache[track_uri] = audio_features
    except Exception as exc:
        # Best effort: report and skip this query. `Exception` (rather than a
        # bare except) lets KeyboardInterrupt stop a long run.
        print("Skipping {!r}: {}".format(search, exc))
        return

    # Everything needed has been fetched, so append to all lists together.
    # Doing it here (and not while fetching) guarantees that a failure can
    # never leave the lists with different lengths.
    ids.append(track_id)
    artist_name.append(artist_display_name)
    track_name.append(title)
    rel_date.append(release_date)
    pop_track.append(popularity)

    features = audio_features or {}
    feature_targets = (
        ('acousticness', acousticness),
        ('danceability', danceability),
        ('duration_ms', duration_ms),
        ('energy', energy),
        ('instrumentalness', instrumentalness),
        ('key', key_list),
        ('liveness', liveness),
        ('loudness', loudness),
        ('mode', mode_list),
        ('speechiness', speechiness),
        ('tempo', tempo),
        ('valence', valence),
    )
    for feature, values in feature_targets:
        # np.nan (lower case) works on every NumPy version; the np.NaN alias
        # was removed in NumPy 2.0.
        values.append(features.get(feature, np.nan))


# Locations of the pickled lookup caches used by get_track_info()
search_cache_file = "cache/search.cache"
audioFeat_cache_file = "cache/audioFeat.cache"
artist_cache_file = "cache/artist.cache"


def _load_cache(path):
    """Return the dict pickled at `path`, or an empty dict if it is missing or unreadable."""
    if not os.path.exists(path):
        return {}
    try:
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # cache files that this notebook wrote itself.
        with open(path, 'rb') as cache_fh:
            return pickle.load(cache_fh)
    except (OSError, EOFError, pickle.UnpicklingError) as exc:
        print("Could not read cache {}: {}; starting with an empty cache".format(path, exc))
        return {}


def _save_cache(path, cache, message):
    """Pickle `cache` to `path` (creating its folder), report its size, then empty it."""
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    with open(path, 'wb') as cache_fh:
        pickle.dump(cache, cache_fh)
        print(message, len(cache))
        cache.clear()


search_cache = _load_cache(search_cache_file)
audioFeat_cache = _load_cache(audioFeat_cache_file)
artist_cache = _load_cache(artist_cache_file)

try:
    # get_track_info() is called for its side effects only (it appends to the
    # accumulator lists), so a plain loop is used instead of Series.map.
    for query in df['search_query']:
        get_track_info(query)

except Exception as e:
    print(e)

finally:
    # Always persist what was fetched, even when the run stopped early.
    _save_cache(search_cache_file, search_cache, 'Cache-1 saved with size')
    _save_cache(audioFeat_cache_file, audioFeat_cache, 'Cache-2 saved with size')
    _save_cache(artist_cache_file, artist_cache, 'Cache-3 saved with size')

Processing...
Processing...
Processing...
Processing...
Processing...
Processing...
Processing...
Processing...
Processing...
Processing...
Processing...
Cache-1 saved with size 1909
Cache-2 saved with size 1902
Cache-3 saved with size 739


In [13]:
# Assemble the accumulator lists into one table, one column per list
df1 = pd.DataFrame(dict(
    id=ids,
    artist_name=artist_name,
    key=key_list,
    mode=mode_list,
    track_name=track_name,
    rel_date=rel_date,
    popularity=pop_track,
    acousticness=acousticness,
    danceability=danceability,
    duration_ms=duration_ms,
    energy=energy,
    instrumentalness=instrumentalness,
    liveness=liveness,
    loudness=loudness,
    speechiness=speechiness,
    tempo=tempo,
    valence=valence,
))

In [14]:
# Preview the assembled Spotify track table
df1

Unnamed: 0,id,artist_name,key,mode,track_name,rel_date,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
0,0wqOReZDnrefefEsrIGeR4,Lifehouse,1.0,1.0,Hanging By A Moment,2000,68,0.00118,0.541,216067.0,0.864,0.000000,0.0896,-4.915,0.0357,124.557,0.435
1,0KQx6HOpJueiSkztcS0r7D,Alicia Keys,11.0,0.0,Fallin',2001-06-05,70,0.26300,0.652,210200.0,0.609,0.001010,0.2330,-7.519,0.0370,95.986,0.482
2,5X8kkUaUlAyAUr9TYqDFTH,Janet Jackson,2.0,1.0,All For You,2001-01-01,67,0.01740,0.753,329933.0,0.934,0.065000,0.1280,-3.011,0.0736,113.525,0.730
3,2hKdd3qO7cWr2Jo0Bcs0MA,Train,0.0,1.0,Drops of Jupiter (Tell Me),2001-03-27,80,0.15300,0.481,259933.0,0.638,0.000000,0.1540,-5.862,0.0276,79.064,0.497
4,4onVfPBjiLokGWsGRdPH7v,Ja Rule,2.0,1.0,I'm Real - Murder Remix,2001-10-02,49,0.40100,0.700,252933.0,0.631,0.000000,0.1190,-6.085,0.1070,83.403,0.558
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2092,5QS8PNEWbqTEZyQ6e9ZbJf,Lainey Wilson,3.0,1.0,Things A Man Oughta Know,2021-02-19,72,0.51300,0.659,203373.0,0.683,0.000005,0.1330,-5.623,0.0312,139.931,0.397
2093,15C4TnrrVdym7UykxQIOTZ,BRS Kash,1.0,0.0,Throat Baby (Go Baby) (with DaBaby & City Girl...,2021-01-21,63,0.00984,0.878,211610.0,0.475,0.000000,0.1670,-8.420,0.3240,131.988,0.397
2094,3zc8VZEpM1onYV4FWGdFvm,Rod Wave,8.0,1.0,Tombstone,2021-03-26,73,0.59700,0.550,160078.0,0.637,0.000000,0.1290,-5.212,0.1630,84.448,0.535
2095,1UYfAU2bwgjaM5rIIPQleC,Chase Rice,2.0,1.0,Drinkin' Beer. Talkin' God. Amen. (feat. Flori...,2020-11-30,67,0.18500,0.627,160839.0,0.678,0.000000,0.3740,-4.691,0.0294,100.032,0.724


In [16]:
# Make sure the output folder exists first; to_csv() does not create missing directories.
os.makedirs("data", exist_ok=True)
df1.to_csv("data/billboard_scraped.csv")