# Final Project - Lainie Cederholm, Will Novak, Ian Pompliano

**Part 1: Scrape top 50 songs of each year (1960 to present) from Billboard Year-End Hot 100 singles (Wikipedia)**

In [45]:
import requests
from bs4 import BeautifulSoup

# function to scrape top 50 songs for a given year
def scrapeBillboardYear(year):
    url = f"https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_{year}"
    response = requests.get(url)
    
    # takes HTML content of web-page and parses it using parser provided by BeautifulSoup 
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # takes first table from given page (the table we are interested in)
    tables = soup.find_all('table', class_='wikitable')
    songsTable = tables[0]
    
    # excludes first row containing headers
    rows = songsTable.find_all('tr')[1:]
    
    # initialize list of top songs
    topSongs = []
    
    # iterates through first 50 songs and appends song list
    for row in rows[:5]:  
            columns = row.find_all(['td', 'th'])
            songInfo = [col.get_text(strip=False) for col in columns]
            topSongs.append(songInfo)
            
    return topSongs

In [46]:
# create lists that will hold all song data for each decade
list1960s = []
list1970s = []
list1980s = []
list1990s = []
list2000s = []
list2010s = []

# append respective lists with song/artist data 
for year in range(1960, 1970):
    yearData = scrapeBillboardYear(year)
    for data in yearData:
        list1960s.append(data)
for year in range(1970, 1980):
    yearData = scrapeBillboardYear(year)
    for data in yearData:
        list1970s.append(data)
for year in range(1980, 1990):
    yearData = scrapeBillboardYear(year)
    for data in yearData:
        list1980s.append(data)
for year in range(1990, 2000):
    yearData = scrapeBillboardYear(year)
    for data in yearData:
        list1990s.append(data)
for year in range(2000, 2010):
    yearData = scrapeBillboardYear(year)
    for data in yearData:
        list2000s.append(data)
for year in range(2010, 2020):
    yearData = scrapeBillboardYear(year)
    for data in yearData:
        list2010s.append(data)

In [47]:
print(list2000s)

[['1', '"Breathe"', 'Faith Hill\n'], ['2', '"Smooth"', 'Santana featuring Rob Thomas\n'], ['3', '"Maria Maria"', 'Santana featuring The Product G&B\n'], ['4', '"I Wanna Know"', 'Joe\n'], ['5', '"Everything You Want"', 'Vertical Horizon\n'], ['1', '"Hanging by a Moment"', 'Lifehouse\n'], ['2', '"Fallin\'"', 'Alicia Keys\n'], ['3', '"All for You"', 'Janet Jackson\n'], ['4', '"Drops of Jupiter (Tell Me)"', 'Train\n'], ['5', '"I\'m Real (Murder Remix)"', 'Jennifer Lopez featuring Ja Rule\n'], ['1', '"How You Remind Me"', 'Nickelback\n'], ['2', '"Foolish"', 'Ashanti\n'], ['3', '"Hot in Herre"', 'Nelly\n'], ['4', '"Dilemma"', 'Nelly featuring Kelly Rowland\n'], ['5', '"Wherever You Will Go"', 'The Calling\n'], ['1', '"In da Club"', '50 Cent\n'], ['2', '"Ignition"', 'R. Kelly\n'], ['3', '"Get Busy"', 'Sean Paul\n'], ['4', '"Crazy in Love"', 'Beyoncé featuring Jay-Z\n'], ['5', '"When I\'m Gone"', '3 Doors Down\n'], ['1', '"Yeah!"', 'Usher featuring Lil Jon and Ludacris\n'], ['2', '"Burn"', 'Us

**Part 2: Now that we have our song/artist data neatly stored in lists for each decade, we can retrieve lyrics for each song from the genius API.**

In [48]:
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords
import re

def clean_string(text):
    # Remove the first line
    cleaned_text = '\n'.join(text.split('\n')[1:])
    
    # Remove characters within brackets
    cleaned_text = re.sub(r'\[.*?\]', '', cleaned_text)
    
    return cleaned_text

In [51]:
import lyricsgenius

# create an instance of the Genius API (using given access token)
genius = lyricsgenius.Genius("5b5PhvCQxiBbHpKVRH4V7_O4bb-o6UNhUkDg35VvSxmFa-wdoGTsgCinzcrLiIn1", timeout=30)

string1960s = ""
string1970s = ""
string1980s = ""
string1990s = ""
string2000s = ""
string2010s = ""

for i in range(len(list1960s)):
    if len(list1960s[i]) != 3:
        list1960s[i].append(list1960s[i-1][2])
    song = genius.search_song(list1960s[i][1], list1960s[i][2])
    try:
        lyricsClean = clean_string(song.lyrics)
    except:
        continue
    string1960s += lyricsClean
    
for i in range(len(list1970s)):
    if len(list1970s[i]) != 3:
        list1970s[i].append(list1970s[i-1][2])
    song = genius.search_song(list1970s[i][1], list1970s[i][2])
    try:
        lyricsClean = clean_string(song.lyrics)
    except:
        continue
    string1970s += lyricsClean
    
for i in range(len(list1980s)):
    if len(list1980s[i]) != 3:
        list1980s[i].append(list1980s[i-1][2])
    song = genius.search_song(list1980s[i][1], list1980s[i][2])
    try:
        lyricsClean = clean_string(song.lyrics)
    except:
        continue
    string1980s += lyricsClean
    
for i in range(len(list1990s)):
    if len(list1990s[i]) != 3:
        list1990s[i].append(list1990s[i-1][2])
    song = genius.search_song(list1990s[i][1], list1990s[i][2])
    try:
        lyricsClean = clean_string(song.lyrics)
    except:
        continue
    string1990s += lyricsClean
    
for i in range(len(list2000s)):
    if len(list2000s[i]) != 3:
        list2000s[i].append(list2000s[i-1][2])
    song = genius.search_song(list2000s[i][1], list2000s[i][2])
    try:
        lyricsClean = clean_string(song.lyrics)
    except:
        continue
    string2000s += lyricsClean
    
for i in range(len(list2010s)):
    if len(list2010s[i]) != 3:
        list2010s[i].append(list2010s[i-1][2])
    song = genius.search_song(list2010s[i][1], list2010s[i][2])
    try:
        lyricsClean = clean_string(song.lyrics)
    except:
        continue
    string2010s += lyricsClean

Searching for ""Theme from A Summer Place"" by Percy Faith
...
Done.
Searching for ""He'll Have to Go"" by Jim Reeves
...
Done.
Searching for ""Cathy's Clown"" by The Everly Brothers
...
Done.
Searching for ""Running Bear"" by Johnny Preston
...
Done.
Searching for ""Teen Angel"" by Mark Dinning
...
Done.
Searching for ""Tossin' and Turnin'"" by Bobby Lewis
...
Done.
Searching for ""I Fall to Pieces"" by Patsy Cline
...
Done.
Searching for ""Michael"" by The Highwaymen
...
Done.
Searching for ""Crying"" by Roy Orbison
...
Done.
Searching for ""Runaway"" by Del Shannon
...
Done.
Searching for ""Stranger on the Shore"" by Acker Bilk
...
Specified song does not contain lyrics. Rejecting.
Searching for ""I Can't Stop Loving You"" by Ray Charles
...
Done.
Searching for ""Mashed Potato Time"" by Dee Dee Sharp
...
Done.
Searching for ""Roses Are Red (My Love)"" by Bobby Vinton
...
Done.
Searching for ""The Stripper"" by David Rose
...
Specified song does not contain lyrics. Rejecting.
Searchi

Done.
Searching for ""Careless Whisper"" by Wham!
...
Done.
Searching for ""Like a Virgin"" by Madonna
...
Done.
Searching for ""Wake Me Up Before You Go-Go"" by Wham!
...
Done.
Searching for ""I Want to Know What Love Is"" by Foreigner
...
Done.
Searching for ""I Feel for You"" by Chaka Khan
...
Done.
Searching for ""That's What Friends Are For"" by Dionne and Friends (Dionne Warwick, Gladys Knight, Elton John and Stevie Wonder)
...
Done.
Searching for ""Say You, Say Me"" by Lionel Richie
...
Done.
Searching for ""I Miss You"" by Klymaxx
...
Done.
Searching for ""On My Own"" by Patti LaBelle and Michael McDonald
...
Done.
Searching for ""Broken Wings"" by Mr. Mister
...
Done.
Searching for ""Walk Like An Egyptian"" by The Bangles
...
Done.
Searching for ""Alone"" by Heart
...
Done.
Searching for ""Shake You Down"" by Gregory Abbott
...
Done.
Searching for ""I Wanna Dance with Somebody (Who Loves Me)"" by Whitney Houston
...
Done.
Searching for ""Nothing's Gonna Stop Us Now"" by Starsh

Done.
Searching for ""California Gurls"" by Katy Perry featuring Snoop Dogg
...
Done.
Searching for ""OMG"" by Usher featuring will.i.am
...
Done.
Searching for ""Rolling in the Deep"" by Adele
...
Done.
Searching for ""Party Rock Anthem"" by LMFAO featuring Lauren Bennett and GoonRock
...
Done.
Searching for ""Firework"" by Katy Perry
...
Done.
Searching for ""E.T."" by Katy Perry featuring Kanye West
...
Done.
Searching for ""Give Me Everything"" by Pitbull featuring Ne-Yo, Afrojack and Nayer
...
Done.
Searching for ""Somebody That I Used to Know"" by Gotye featuring Kimbra
...
Done.
Searching for ""Call Me Maybe"" by Carly Rae Jepsen
...
Done.
Searching for ""We Are Young"" by Fun featuring Janelle Monáe
...
Done.
Searching for ""Payphone"" by Maroon 5 featuring Wiz Khalifa
...
Done.
Searching for ""Lights"" by Ellie Goulding
...
Done.
Searching for ""Thrift Shop"" by Macklemore & Ryan Lewis featuring Wanz
...
Done.
Searching for ""Blurred Lines"" by Robin Thicke featuring T.I. and 

In [54]:
# write each decade string to a text file
stringsByDecade = {
    "string1960s": string1960s,
    "string1970s": string1970s,
    "string1980s": string1980s,
    "string1990s": string1990s,
    "string2000s": string2000s,
    "string2010s": string2010s
}

for decade, text in stringsByDecade.items():
    file_name = f"{decade}.txt"
    with open(file_name, "w") as file:
        file.write(text) 