## Using Beautiful Soup to Scrape the Billboard Global 200 Chart

In [1]:
from bs4 import BeautifulSoup
import requests #to load the page 
import csv
import time
import pandas as pd
import datetime
from datetime import timedelta

from dateutil import parser
from tqdm import tqdm #good to have

import selenium
from selenium import webdriver ## webdriver is the tool to interact with the webpage 

In [2]:
# Column accumulator for the scraped chart rows: one empty list per column.
# It is filled page by page during scraping and converted to a DataFrame at the end.
# NOTE: the name shadows Python's built-in `dict` type; other cells refer to it by
# this name, so it is kept as is.
dict = {
    column: []
    for column in (
        "id",
        "Song title",
        "Artist",
        "Current Ranking",
        "Number of weeks on billboard",
        "Peak Ranking",
        "Date",
    )
}

In [3]:
# Build the list of weekly dates in 2022: start at 2022-01-01 and step 7 days at a
# time (these dates are Saturdays), stopping as soon as the date leaves 2022.
dates = []
date = datetime.date(2022, 1, 1)
while date.year == 2022:
    dates.append(date)
    date += timedelta(weeks=1)


In [4]:
#define helper functions

#converts datetime to required date format for url
def convert_date(date):
    """Format ``date`` as the ``year-month-day/`` segment appended to a chart URL.

    Year, month and day are each zero-padded to at least two digits, joined
    with hyphens, and followed by a trailing slash (e.g. ``2022-01-08/``).
    """
    return f"{date.year:02d}-{date.month:02d}-{date.day:02d}/"

#updates dictionary of songs

def update_dict(d, ids, titles, artists, rankings, weeks, peaks, dates):
    """Append one batch of scraped values to the column lists stored in ``d``.

    ``d`` maps each column name to a list; every argument after it is an
    iterable whose items are appended to the matching column. ``d`` is
    modified in place and nothing is returned.
    """
    batch = {
        "id": ids,
        "Song title": titles,
        "Artist": artists,
        "Current Ranking": rankings,
        "Number of weeks on billboard": weeks,
        "Peak Ranking": peaks,
        "Date": dates,
    }
    for column, values in batch.items():
        d[column].extend(values)

def scrape_page(url,date, id):
    """Scrape one weekly chart page and append its rows to the module-level ``dict``.

    The page is expected to list 200 songs. The first song is rendered with
    different CSS classes than the remaining 199, so titles, artists and
    statistics are each collected in two steps (first row, then the rest).

    Parameters
    ----------
    url : str
        Address of the chart page to download.
    date :
        Value stored in the "Date" column for every row of this page.
    id : int
        Offset for the "id" column; rows receive ``id + 1`` .. ``id + 200``.

    Returns
    -------
    None
        The scraped columns are appended to the global ``dict`` through
        ``update_dict``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    ValueError
        If the page does not contain the expected elements. Nothing is
        appended to ``dict`` in that case, so its columns stay aligned.
    """
    chart_size = 200     # number of songs expected on one chart page
    stats_per_row = 6    # stat spans matched per song after the first; only offsets 1 and 2 are used
    timeout_seconds = 30

    def text_of(tag):
        # get_text also handles tags with nested markup, where `.string` would be None.
        return tag.get_text(strip=True)

    def fail(problem):
        raise ValueError(f"Unexpected page layout at {url}: {problem}")

    #get page; fail fast on HTTP errors instead of parsing an error page
    response = requests.get(url, timeout=timeout_seconds)
    response.raise_for_status()
    page = BeautifulSoup(response.text, 'html.parser')

    #get titles (first row, then the rest)
    first_title = page.find("h3", attrs = {"class":"c-title a-font-primary-bold-l a-font-primary-bold-m@mobile-max lrv-u-color-black u-color-white@mobile-max lrv-u-margin-r-150"})
    first_title_link = first_title.find("a") if first_title is not None else None
    if first_title_link is None:
        fail("title of the first song not found")
    titles = [text_of(first_title_link)]
    scrape_titles = page.find_all("h3", attrs = {"class": "c-title a-no-trucate a-font-primary-bold-s u-letter-spacing-0021 lrv-u-font-size-18@tablet lrv-u-font-size-16 u-line-height-125 u-line-height-normal@mobile-max a-truncate-ellipsis u-max-width-330 u-max-width-230@tablet-only"})
    titles.extend(text_of(tag) for tag in scrape_titles)
    if len(titles) != chart_size:
        fail(f"expected {chart_size} titles, found {len(titles)}")

    #get artists (first row, then the rest)
    first_artist = page.find("span", attrs = {"class": "c-label a-no-trucate a-font-primary-s lrv-u-font-size-14@mobile-max u-line-height-normal@mobile-max u-letter-spacing-0021 lrv-u-display-block a-truncate-ellipsis-2line u-max-width-330 u-max-width-230@tablet-only u-font-size-20@tablet"})
    if first_artist is None:
        fail("artist of the first song not found")
    artists = [text_of(first_artist)]
    scrape_artists = page.find_all("span", attrs = {"class": "c-label a-no-trucate a-font-primary-s lrv-u-font-size-14@mobile-max u-line-height-normal@mobile-max u-letter-spacing-0021 lrv-u-display-block a-truncate-ellipsis-2line u-max-width-330 u-max-width-230@tablet-only"})
    artists.extend(text_of(tag) for tag in scrape_artists)
    if len(artists) != chart_size:
        fail(f"expected {chart_size} artists, found {len(artists)}")

    #get peak and weeks for first song: entry 1 is stored as peak, entry 2 as weeks
    first_stats = [text_of(tag) for tag in page.find_all("span", attrs = {"class": "c-label a-font-primary-bold-l a-font-primary-m@mobile-max u-font-weight-normal@mobile-max lrv-u-padding-tb-050@mobile-max u-font-size-32@tablet"})]
    if len(first_stats) < 3:
        fail(f"expected at least 3 statistics for the first song, found {len(first_stats)}")
    peaks = [first_stats[1]]
    weeks = [first_stats[2]]

    #get peak and weeks for rest of the songs: within each group of
    #`stats_per_row` spans, offset 1 is stored as peak and offset 2 as weeks
    rest_stats = [text_of(tag) for tag in page.find_all("span", attrs = {"class":"c-label a-font-primary-m lrv-u-padding-tb-050@mobile-max"})]
    required = (chart_size - 2) * stats_per_row + 3  # highest index read below, plus one
    if len(rest_stats) < required:
        fail(f"expected at least {required} statistics for the remaining songs, found {len(rest_stats)}")
    for start in range(0, (chart_size - 1) * stats_per_row, stats_per_row):
        peaks.append(rest_stats[start + 1])
        weeks.append(rest_stats[start + 2])

    # Columns that do not come from the page itself.
    ids = list(range(id + 1, id + chart_size + 1))
    rankings = list(range(1, chart_size + 1))
    dates = [date] * chart_size

    update_dict(dict, ids, titles, artists, rankings, weeks, peaks, dates)

In [5]:
# Scrape the chart for every date in `dates`, appending each week's rows to `dict`.

# Start from empty columns so that re-running this cell does not append
# duplicate rows (with repeated ids) to the accumulator.
for column in dict.values():
    column.clear()

counter = 0  # id offset: each page contributes 200 rows
for date in dates:
    url = "https://www.billboard.com/charts/billboard-global-200/" + convert_date(date)
    print(url)
    # scrape_page downloads and parses the page itself, so no separate request is made here.
    scrape_page(url, date, counter)
    counter += 200

https://www.billboard.com/charts/billboard-global-200/2022-01-01/
https://www.billboard.com/charts/billboard-global-200/2022-01-08/
https://www.billboard.com/charts/billboard-global-200/2022-01-15/
https://www.billboard.com/charts/billboard-global-200/2022-01-22/
https://www.billboard.com/charts/billboard-global-200/2022-01-29/
https://www.billboard.com/charts/billboard-global-200/2022-02-05/
https://www.billboard.com/charts/billboard-global-200/2022-02-12/
https://www.billboard.com/charts/billboard-global-200/2022-02-19/
https://www.billboard.com/charts/billboard-global-200/2022-02-26/
https://www.billboard.com/charts/billboard-global-200/2022-03-05/
https://www.billboard.com/charts/billboard-global-200/2022-03-12/
https://www.billboard.com/charts/billboard-global-200/2022-03-19/
https://www.billboard.com/charts/billboard-global-200/2022-03-26/
https://www.billboard.com/charts/billboard-global-200/2022-04-02/
https://www.billboard.com/charts/billboard-global-200/2022-04-09/
https://ww

In [6]:
# Turn the accumulated columns into a DataFrame and write it to billboard.csv
# in the current working directory.
data = pd.DataFrame(data=dict)
data.to_csv(path_or_buf="billboard.csv")