# Final Project - Lainie Cederholm, Will Novak, Ian Pompliano

**Part 1: Scrape top 50 songs of each year (1960 to present) from Billboard Year-End Hot 100 singles (Wikipedia)**

In [8]:
import os

import requests
from bs4 import BeautifulSoup

# function to scrape top 50 songs for a given year
def scrapeBillboardYear(year, limit=50):
    """Scrape the top rows of a Billboard Year-End Hot 100 table from Wikipedia.

    Input
    -----
        year : int
            Year inserted into the Wikipedia article URL.
        limit : int or None
            Maximum number of data rows to return (default 50).
            None returns every data row of the table.

    Output
    ------
        topSongs : list of list of str
            One inner list per table row, holding the stripped text of each
            cell of that row, in page order.

    Raises
    ------
        requests.HTTPError
            If the page request comes back with an error status.
        IndexError
            If the page has no table with class "wikitable".
    """
    url = f"https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_{year}"

    # a timeout keeps one stalled request from hanging the whole notebook, and
    # failing on an HTTP error stops us from parsing an error page as chart data
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # takes HTML content of web-page and parses it using parser provided by BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # takes first "wikitable" from given page (the table we are interested in)
    songsTable = soup.find('table', class_='wikitable')
    if songsTable is None:
        raise IndexError(f"no 'wikitable' table found at {url}")

    # excludes first row containing headers, then keeps at most `limit` rows
    rows = songsTable.find_all('tr')[1:][:limit]

    # each song is the list of its cell texts
    return [
        [cell.get_text(strip=True) for cell in row.find_all(['td', 'th'])]
        for row in rows
    ]

In [16]:
# helper that gathers the scraped song rows of one whole decade
def scrapeBillboardDecade(startYear):
    """Return the rows of scrapeBillboardYear for the ten years beginning at
    startYear, concatenated in year order."""
    return [
        songInfo
        for year in range(startYear, startYear + 10)
        for songInfo in scrapeBillboardYear(year)
    ]


# create lists that hold all song/artist data for each decade
# (one page request per year, so this cell takes a while to run)
list1960s = scrapeBillboardDecade(1960)
list1970s = scrapeBillboardDecade(1970)
list1980s = scrapeBillboardDecade(1980)
list1990s = scrapeBillboardDecade(1990)
list2000s = scrapeBillboardDecade(2000)
list2010s = scrapeBillboardDecade(2010)

In [18]:
# sanity check: dump everything scraped for the 1960s; in the output each
# entry appears as a list of strings of the form [rank, title, artist]
print(list1960s)

[['1', '"Theme from A Summer Place"', 'Percy Faith'], ['2', '"He\'ll Have to Go"', 'Jim Reeves'], ['3', '"Cathy\'s Clown"', 'The Everly Brothers'], ['4', '"Running Bear"', 'Johnny Preston'], ['5', '"Teen Angel"', 'Mark Dinning'], ['6', '"I\'m Sorry"', 'Brenda Lee'], ['7', '"It\'s Now or Never"', 'Elvis Presley'], ['8', '"Handy Man"', 'Jimmy Jones'], ['9', '"Stuck on You"', 'Elvis Presley'], ['10', '"The Twist"', 'Chubby Checker'], ['11', '"Everybody\'s Somebody\'s Fool"', 'Connie Francis'], ['12', '"Wild One"', 'Bobby Rydell'], ['13', '"Greenfields"', 'The Brothers Four'], ['14', '"What in the World\'s Come Over You"', 'Jack Scott'], ['15', '"El Paso"', 'Marty Robbins'], ['16', '"Alley Oop"', 'The Hollywood Argyles'], ['17', '"My Heart Has a Mind of Its Own"', 'Connie Francis'], ['18', '"Sweet Nothin\'s"', 'Brenda Lee'], ['19', '"Itsy Bitsy Teenie Weenie Yellow Polka Dot Bikini"', 'Brian Hyland'], ['20', '"Only the Lonely"', 'Roy Orbison'], ['21', '"Where or When"', 'Dion and the Belmo

**Part 2: Now that our song/artist data is stored in one list per decade, we can retrieve the lyrics for each song from the Genius API.**

In [43]:
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords

# English stop words from NLTK, filtered out of every cleaned word list
stop_words = set(stopwords.words('english'))


def get_clean_words(text, exclude_words=None):
    """Given some text, return a list of clean words.

    The text is tokenized, lower-cased and stripped of ASCII punctuation;
    tokens that are not purely alphabetic, that are stop words or excluded
    words, or that are a single letter are dropped. Word order is kept.

    Input
    -----
        text : str
        exclude_words : list, optional
            Extra words to drop (e.g. the song title). They get the same
            lower-casing and punctuation stripping as the text before the
            comparison. Defaults to no extra words.

    Output
    ------
        words : list

    Example
    -------
        >>> get_clean_words("Iron Man saves the day!", exclude_words=['Iron', 'Man'])
        ['saves', 'day']
    """
    # Translation table that deletes every ASCII punctuation character
    table = str.maketrans('', '', string.punctuation)

    def normalize(token):
        # Same treatment for text tokens and exclude words: lower case, no punctuation
        return token.lower().translate(table)

    # Build the set of words to drop once, not once per token
    blocked = stop_words | {normalize(w) for w in (exclude_words or [])}

    # Extract and normalize words
    words = [normalize(token) for token in word_tokenize(text)]

    # Keep alphabetic words longer than one letter that are not blocked
    return [
        w for w in words
        if w.isalpha() and len(w) > 1 and w not in blocked
    ]

In [46]:
import lyricsgenius

# create an instance of the Genius API; the access token is read from the
# GENIUS_ACCESS_TOKEN environment variable so the secret is not stored in the notebook
# NOTE(review): the token that used to be hard-coded here has been shared with
# the notebook and should be revoked and regenerated
geniusToken = os.environ.get("GENIUS_ACCESS_TOKEN")
if not geniusToken:
    raise RuntimeError("Set the GENIUS_ACCESS_TOKEN environment variable to your Genius API access token")
genius = lyricsgenius.Genius(geniusToken)

# TODO: loop through each song in the decade lists and search for its lyrics, e.g.
#   for songInfo in list1960s:
#       title = songInfo[1]
#       artist = songInfo[2]
#       song = genius.search_song(title, artist)

# try the pipeline on a single song first: the first row scraped for the year 2000
test = scrapeBillboardYear(2000)

# scraped titles are wrapped in literal double quotes (e.g. '"Breathe"');
# strip them so the search is not run for ""Breathe""
title = test[0][1].strip('"')
artist = test[0][2]
song = genius.search_song(title, artist)

# search_song gives back None when nothing matches; fail with a clear message
# instead of an AttributeError on song.lyrics
if song is None:
    raise LookupError(f"Genius returned no song for {title!r} by {artist!r}")

# NOTE(review): the cleaned output still contains section labels such as
# 'intro', 'verse' and 'chorus'; consider Genius(..., remove_section_headers=True)
lyrics = get_clean_words(song.lyrics)
print(lyrics)
print()
print(song.lyrics)


Searching for ""Breathe"" by Faith Hill...
Done.
['contributorsbreathe', 'lyrics', 'intro', 'feel', 'magic', 'floating', 'air', 'gets', 'way', 'watch', 'sunlight', 'dance', 'across', 'face', 'ive', 'never', 'swept', 'away', 'verse', 'thoughts', 'seem', 'settle', 'breeze', 'lying', 'wrapped', 'arms', 'whole', 'world', 'fades', 'away', 'thing', 'hear', 'beating', 'heart', 'chorus', 'feel', 'breathe', 'washing', 'suddenly', 'melting', 'nothing', 'left', 'prove', 'baby', 'need', 'caught', 'touch', 'slow', 'steady', 'rush', 'baby', 'nt', 'way', 'love', 'supposed', 'feel', 'breathe', 'breathe', 'verse', 'way', 'know', 'heart', 'waking', 'walls', 'come', 'tumbling', 'closer', 'ever', 'felt', 'know', 'know', 'need', 'words', 'right', 'might', 'also', 'like', 'chorus', 'feel', 'breathe', 'washing', 'suddenly', 'melting', 'nothing', 'left', 'prove', 'baby', 'need', 'caught', 'touch', 'slow', 'steady', 'rush', 'baby', 'nt', 'way', 'love', 'supposed', 'feel', 'breathe', 'breathe', 'instrumental', 