# Data Collection: 

## 1. Wikipedia scraper for top 100 song charts, 1958-2021

1. Crawls and scrapes Billboard Top 100 song charts from Wikipedia
2. Generates a csv file with 6350 songs

In [9]:
import requests
from bs4 import BeautifulSoup
import csv
import re

"""
    Crawl and scrape Wikipedia for Billboard top 100 songs from 1958 to 2021.
    We start from 1958 because that is when Billboard officially launched their
    top 100 list. While crawling, script writes to csv file the following features:
        no. = rank of song in given year
        year = year of the Billboard year-end chart the song appears on
        song = title of song
        artist(s) = name of song artist(s)
        song_wiki_url = link to Wikipedia page for song
        artist_wiki_url = link to Wikipedia page for artist(s)
        
    Results in a csv file with 6350 songs. 
"""

# Setting starting link for web crawl and domain base.
# The 1957 page is only the entry point: its navigation table links to the
# other year-end chart pages, and the crawl keeps only years after 1957.
starting_url = "https://en.wikipedia.org/wiki/Billboard_year-end_top_50_singles_of_1957"
base_link = "https://en.wikipedia.org"

# Listing the desired features to be obtained (also used as the csv header row)
col_names = ["no.", "year", "song", "artist(s)", "song_wiki_url", "artist_wiki_url"]

# Identify the scraper to the server. The dict must be passed as `headers=`;
# passing it as `params=` would append it to the URL as a query string and
# leave the library's default User-Agent in place.
headers = {'User-Agent':'web scraper for class for hyap@uchicago.edu'}
response = requests.get(starting_url, headers=headers)
# Surface an HTTP error here instead of as a confusing parse failure later.
response.raise_for_status()
content = response.content
soup = BeautifulSoup(content, 'html.parser', from_encoding='UTF-8')



In [10]:
# From table on starting url page, use regex to find all Billboard years HTML list elements.
# Only list items whose text is exactly four digits (i.e. a year) are kept.
billboard_years = soup.find('table', class_="nowraplinks").find_all('li', string=re.compile(r"^[\d]{4}$"))

# Opens csv file in writing mode.
# newline="" is what the csv module requires so it can control line endings
# itself (otherwise blank rows appear on Windows); the explicit encoding keeps
# non-ASCII titles and names from depending on the platform default.
with open("top100_1958_to_2021.csv", "w", newline="", encoding="utf-8") as f:
    csv_writer = csv.writer(f, delimiter=",")
    csv_writer.writerow(col_names)
    # Crawl through table of Billboard year pages
    for year_pg in billboard_years:
        year = year_pg.text
        # Only charts after 1957 are collected.
        if int(year) <= 1957:
            continue

        # Get relative link from page and convert to absolute link.
        # A list item without a usable link cannot be crawled, so skip it.
        anchor = year_pg.find('a', href=True)
        if anchor is None:
            print("Skipping " + year + ": no link found in navigation table")
            continue
        link = anchor['href']
        url = base_link + link

        # Crawl link and find relative content from the top 100 table.
        # The User-Agent dict goes in `headers=`, not `params=` (query string).
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        content = response.content
        soup = BeautifulSoup(content, 'html.parser', from_encoding='UTF-8')
        hot_table = soup.find('table', class_='wikitable')
        if hot_table is None:
            print("Skipping " + year + ": no wikitable found at " + url)
            continue

        for row in hot_table.find_all("tr"):
            # Non-empty text lines of the row: expected to be rank, title, artist(s)
            row_cont = row.text.strip('\n').split('\n')
            row_cont = [i for i in row_cont if len(i) > 0]
            cols = row.find_all('td')
            # Checks the numbers of columns as method for ensuring the desired information is being pulled
            # (rows without exactly three <td> cells, such as header rows, are skipped)
            if len(cols) != 3:
                continue
            song_links = [base_link + a["href"] for a in cols[1].find_all('a', href=True)]
            artist_links = [base_link + a["href"] for a in cols[2].find_all('a', href=True)]
            row_cont.insert(1, year)
            # Each link list is kept as ONE field; csv.writer stores it as the
            # string form of the Python list.
            row_cont = row_cont + [song_links] + [artist_links]
            # Write only rows that line up with the six header columns.
            if len(row_cont) == 6:
                csv_writer.writerow(row_cont)