## Lyrics Scraping Start to Finish ##

For the last quiz, you likely did something like this...

In [None]:
# import libraries we'll need

import requests
from bs4 import BeautifulSoup

In [None]:
# get the contents of the candidate playlist page
# - the timeout keeps the notebook from hanging forever if the server never answers
# - raise_for_status() stops right here on an HTTP error, so we never go on to
#   parse an error page as if it were the playlist

resp = requests.get(
    "https://www.nytimes.com/interactive/2019/08/19/us/politics/presidential-campaign-songs-playlists.html",
    timeout=30,
)
resp.raise_for_status()

In [None]:
# pull the body of the response out as a string
html_str = resp.text

# a bare name on the last line of a notebook cell is displayed as the cell's
# output, so this lets us eyeball the HTML and make sure the download worked
html_str

In [None]:
# parse the raw HTML with Beautiful Soup, using Python's built-in "html.parser"
# backend, so that we can search the page by tag and attribute

document = BeautifulSoup(markup=html_str, features="html.parser")

In [None]:
# collect every <span> whose CSS class is "song-title", then display the
# result so we can see what the tags look like

title_tags = document.find_all("span", class_="song-title")

title_tags

In [None]:
# now loop through the title tags, looking for the associated artist
# store both in a dictionary in the format {song title: artist}

song_dict = {}

for title in title_tags:
    # while we're looking at that title, look for the next sibling <span>,
    # which holds the artist
    artist_tag = title.find_next_sibling('span')

    # find_next_sibling() returns None when there is no such sibling;
    # skip the title rather than crash on it
    if artist_tag is None:
        continue

    # get_text() always gives us a plain Python string, even when the tag has
    # nested markup (where .string would be None), and it doesn't keep a
    # reference to the whole parsed page the way a NavigableString does
    song_title = title.get_text()

    # add it to the dict (and be sure to add only the string)
    song_dict[song_title] = artist_tag.get_text()

song_dict

In [None]:
# now it's time to hook in the API

# before anything else, define your API key to authorize your request when using the API.
# to keep the real token out of the notebook (and out of anything you share or commit),
# it is read from the GENIUS_ACCESS_TOKEN environment variable when that is set;
# otherwise, replace the placeholder string below with your token

import os

client_access_token = os.environ.get("GENIUS_ACCESS_TOKEN", " INSERT API KEY HERE!!! ")  # Replace with your token

In [None]:
# now, iterate through our song_dict and get our lyrics URLs
# store them in another dict, in the format {song title: lyrics url}, for future reference
# (though for the purposes of the quiz, you could just print out the URL inside the loop and be done)

url_dict = {}

# Remember this is the URL for a search via the Genius API
# (https, so the access token is not sent over the network in clear text)
genius_search_url = 'https://api.genius.com/search'

for song_title, song_artist in song_dict.items():

    # hand the query to requests via params= so it gets URL-encoded for us;
    # a title containing "&", "#", "?" or spaces would otherwise corrupt the URL
    search_params = {'q': song_title, 'access_token': client_access_token}

    # here's the API call (with a timeout so a stalled request can't hang the notebook)
    resp = requests.get(genius_search_url, params=search_params, timeout=30)

    # stop with a clear HTTP error (e.g. a rejected token) rather than
    # failing further down on an unexpected payload
    resp.raise_for_status()
    data = resp.json()

    # now search for match w/ artist
    for song in data['response']['hits']:
        if song['result']['primary_artist']['name'] == song_artist:
            # if there's a match, get the url
            lyrics_url = song['result']['url']

            # here you could have just printed out the info at this point for Quiz 2
            # but in this case we're going to add it to the url dict
            # so that we can use it again later
            url_dict[song_title] = lyrics_url

            # also give a status update so we can make sure we're on the right track...
            print("Matched! Artist: " + song_artist + " and title: " + song_title)

            # break out of for loop if we've got a match
            break

url_dict

In [None]:
# now we've got our URL dict so we can start scraping lyrics!

import os
import re

# scraping the lyrics used to be as simple as this one line:
#     lyrics = document.find('p').text
#
# now it's more complicated and in order to automate it, we need... regex!
# what we're looking for is something called "Lyrics__Container-xx-xxxxxxx-x xxxxxx"
# we can abstract this into Lyrics__Container and then any chars that follow, so...

# let's do it -- here are some clues!

# ^ matches the start of a string
# . for any character except for end-of-line
# * for matching the "." zero or more times; and then
# $ matches the end of the line

# NOTE: the pattern below is a placeholder -- until you replace it with your
# own regex, no lyrics div will match and the files written will be empty.
# (the patterns are compiled once, out here, because they are the same for every song)
lyrics_class_pattern = re.compile(" INSERT REGEX HERE!!! ")

# raw strings (r"...") so that Python passes the backslashes through to the
# regex engine untouched instead of warning about an invalid escape sequence
bracket_pattern = re.compile(r"\[.*\]")
leading_newlines_pattern = re.compile(r"^[\n]{2}")

# store the lyrics in a dir called "my-lyrics" (relative to the directory the
# notebook runs in); create it if it doesn't exist yet so open() below can't
# fail on a missing directory
lyrics_dir = "my-lyrics"
os.makedirs(lyrics_dir, exist_ok=True)

# url_dict maps {song title: lyrics url}; the artist for each title is still
# in song_dict if you need it (later in the semester, we'll learn another
# method of storing multidimensional data -- for now we just keep two dicts)
for song_url in url_dict.values():

    # now this is just what you did for your first quiz
    # get the contents of the lyrics page
    print("Requesting: " + song_url)
    resp = requests.get(song_url, timeout=30)

    # one bad page shouldn't abort the whole batch: report it and move on
    if not resp.ok:
        print("Skipping (HTTP " + str(resp.status_code) + "): " + song_url)
        continue

    html_str = resp.text

    # turn it into a BS object and get the lyrics divs
    document = BeautifulSoup(html_str, "html.parser")
    lyrics_divs = document.find_all("div", attrs={"class": lyrics_class_pattern})

    # pull the text out of each div, one line of lyrics per line of text
    lyrics = [div.get_text(separator='\n') for div in lyrics_divs]

    # let's do some quick cleaning, just like we did before... regex again
    cleanest_lyrics = []

    for chunk in lyrics:
        # remove square brackets as we did last class
        cleaner_chunk = bracket_pattern.sub("", chunk)
        # remove leading newlines as we did last class
        cleanest_chunk = leading_newlines_pattern.sub("", cleaner_chunk)
        # append the final version to the lyrics list
        cleanest_lyrics.append(cleanest_chunk)

    # store it in a file

    # we'll use the final part of the URL as a filename
    filename = song_url.replace("https://genius.com/", "")
    filename = filename.replace("-lyrics", "") + ".txt"

    path = os.path.join(lyrics_dir, filename)

    # - encoding="utf-8": lyrics often contain accented letters and curly
    #   quotes, which the platform's default encoding may not be able to write
    # - join with "\n": the chunks don't end in a newline, so writing them back
    #   to back would glue the last line of one onto the first line of the next
    with open(path, "w", encoding="utf-8") as file:
        file.write("\n".join(cleanest_lyrics))
        print("Wrote lyrics to: " + filename)

Hurray! We got our lyrics!

PS: The regex tweet I wanted to share with you the other day:

<img src="http://lklein.com/wp-content/uploads/2019/09/Screen-Shot-2019-09-18-at-11.21.11-AM.png" alt="Screenshot of the regex tweet">