In [None]:
# import libraries we'll need

import os
import re
from getpass import getpass

import requests
from bs4 import BeautifulSoup

In [None]:
# get the contents of the candidate playlist page

resp = requests.get(
    "https://www.nytimes.com/interactive/2019/08/19/us/politics/presidential-campaign-songs-playlists.html",
    timeout=30,  # don't let the notebook hang forever if the server never answers
)

# stop right here on a 4xx/5xx answer instead of parsing an error page further down
resp.raise_for_status()

In [None]:
# convert it to string and just make sure it worked
html_str = resp.text

# peek at the start of the page only -- displaying the whole document would
# bury the rest of the notebook under one huge output
html_str[:500]

In [None]:
# parse the raw HTML with Beautiful Soup, using Python's built-in "html.parser"

document = BeautifulSoup(markup=html_str, features="html.parser")

In [None]:
# collect every <span> carrying the "song-title" CSS class, then display
# the result to see what the tags look like

title_tags = document.find_all("span", class_="song-title")

title_tags

In [None]:
# now let's make a dictionary in the format {song title: artist}
# (the dict is keyed by title, so a title that appears more than once
# keeps only the artist from its last occurrence)

song_dict = {}

for title in title_tags:
    # get the text associated w/ the tag; unlike .string, get_text() still
    # works when the tag contains nested markup, and strip=True trims stray
    # whitespace that would otherwise break exact comparisons later on
    song_title = title.get_text(" ", strip=True)
    # while we're looking at that title, look for the next sibling
    artist_tag = title.find_next_sibling('span')
    # skip malformed entries instead of crashing on a missing title/artist
    if not song_title or artist_tag is None:
        continue
    # add it to the dict (and be sure to add only the text)
    artist_name = artist_tag.get_text(" ", strip=True)
    song_dict[song_title] = artist_name

song_dict

In [None]:
# now it's time to put our API knowledge to use

# first, load your API key to authorize your request when using the API.
# Never paste the token into the notebook itself: anyone who can see the file
# can use it (revoke any token that was previously pasted here).
# Set the GENIUS_ACCESS_TOKEN environment variable before starting Jupyter,
# or type the token at the hidden prompt below when the variable isn't set.
client_access_token = os.environ.get("GENIUS_ACCESS_TOKEN") or getpass("Genius API access token: ")

In [None]:
# now, iterate through and get our lyrics URLs

url_dict = {}

# Remember this is the URL for a search via the Genius API
# (https, so the access token is not sent over the wire in clear text):
genius_search_url = 'https://api.genius.com/search'

for song_title, song_artist in song_dict.items():
    # here's the API call; handing the query to `params` lets requests
    # URL-encode titles that contain spaces, '&', '#', '+', etc.
    resp = requests.get(
        genius_search_url,
        params={'q': song_title, 'access_token': client_access_token},
        timeout=30,
    )
    # fail loudly on a bad token / rate limit instead of hitting a confusing
    # KeyError on data['response'] below
    resp.raise_for_status()
    data = resp.json()

    # now search for match w/ artist
    for song in data['response']['hits']:
        if song['result']['primary_artist']['name'] == song_artist:
            # if there's a match, get the url and add it to the
            # url dict in format {song title: url}
            url_dict[song_title] = song['result']['url']

            # just give a status update
            print(f"Matched artist: {song_artist} and title: {song_title}")

            # break out of for loop since we've got a match
            break
    else:
        # no hit had exactly this artist name; say so rather than silently
        # dropping the song
        print(f"No match for artist: {song_artist} and title: {song_title}")

url_dict

In [None]:
# now we've got our URL dict so we can start scraping lyrics!

# the lyrics are stored in a subdir called "lyrics"; create it up front
# (no error if it already exists) so nobody has to make the folder by hand
lyrics_dir = "lyrics"
os.makedirs(lyrics_dir, exist_ok=True)

for item in url_dict:
    song_title = item
    song_artist = song_dict[item]   # note that later in the semester, we'll learn
                                    # another method of storing multidimensional data
                                    # for now, though, we'll just keep these two dicts
    song_url = url_dict[item]

    # get the contents of the lyrics page; skip this song (rather than
    # aborting the whole run) if the page can't be fetched
    resp = requests.get(song_url, timeout=30)
    if not resp.ok:
        print("Could not fetch: " + song_url)
        continue
    html_str = resp.text

    # turn it into a BS object and grab the first <p> element, which is
    # where this notebook expects the lyrics to be
    document = BeautifulSoup(html_str, "html.parser")
    lyrics_tag = document.find('p')
    if lyrics_tag is None:
        # page has no <p> at all -- nothing to save for this song
        print("No lyrics found at: " + song_url)
        continue
    lyrics = lyrics_tag.text

    # let's do some quick cleaning (raw strings, so the regex backslashes
    # are not treated as invalid string escapes)
    cleaner_lyrics = re.sub(r"\[.*\]\n", "", lyrics) # remove square brackets and contents
    cleanest_lyrics = re.sub(r"^[\n]{3}", "", cleaner_lyrics) # remove the three opening newlines

    # store it in a file

    # we'll use the final part of the URL as a filename
    print(song_url)
    filename = song_url.replace("https://genius.com/","")
    print(filename)
    filename = filename.replace("-lyrics","") + ".txt"
    print(filename)

    path = os.path.join(lyrics_dir, filename)

    # explicit utf-8 so lyrics with accents / curly quotes can be written on
    # any platform; write() stores the string in one call
    with open(path, "w", encoding="utf-8") as file:
        file.write(cleanest_lyrics)
        print("Wrote lyrics to: " + filename)

Hurray! We got our lyrics!