# Create Lyrics Database
This file articulates the way the database of lyrics was compiled. All lyrics were scraped from [AZLyrics.com](www.azlyrics.com), while the artists and specific songs selected came from [IMDB](https://www.imdb.com/list/ls058480497/) and [Wikipedia](https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_1960).

## Import Libraries & Define Critical Functions
The three primary functions used in this file were:
- **get_soup:** Default function for getting html data from a website and preparing it for parsing.
- **get_artist_songs:** Given an artist name, this function attempts to retrieve the list of songs with the corresponding URL's for lyrics.
- **get_lyrics:** Uses links for songs to get lyrics.

In [21]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re
import os
import time
import json

In [2]:
def get_soup(url):
    html_page = requests.get(url) #Make a get request to retrieve the page
    soup = BeautifulSoup(html_page.content, 'html.parser') #Pass the page contents to beautiful soup for parsing
    return soup


def get_artist_songs(artist):
    # Remove non-alphanumeric characters from artist name
    clean_name = re.sub(r'\W+', '', artist.lower())
    
    # Create directory for saving song data
    new_dir = f'lyrics/{artist}'
    try:
        os.makedirs(new_dir)
    except:
        pass
    
    # Scrape artist page for song names and links
    url = f'https://www.azlyrics.com/{clean_name[0]}/{clean_name}.html'
    soup = get_soup(url)
    
    song_tags = soup.find_all('div', class_='listalbum-item')
    
    songs = []
    song_links = []
    for song in song_tags:
        for a in song.find_all('a', href=True):
            song_link = f"https://www.azlyrics.com{a['href'][2:]}"
            if '+' not in song_link:
                songs.append(song.text)
                song_links.append(song_link)
            
    return songs, song_links


def get_lyrics(song, link):
    soup = get_soup(link)
    lyrics = soup.find_all('div', class_=None)[1].text.strip()
    return lyrics

## Get List of Top 100 Artists from IMDB

In [3]:
# Scrape IMDB
url = 'https://www.imdb.com/list/ls058480497/'
soup = get_soup(url)

In [4]:
# Extract list of artist names to list
artists = []
for item in soup.find_all("h3", class_="lister-item-header"):
    txt = item.text.split('. ')[1].strip().replace('.', '')
    if txt[:4] == 'The ':
        txt = txt[4:]
    artists.append(txt)
    
artists[:5]

['Hank Williams',
 'Frank Sinatra',
 'Bo Diddley',
 'Ray Charles',
 'Little Richard']

## Format Artist Names for AZLyrics Search
The code below cycles through each artist and each song, saving each set of song lyrics to its own text file. The lyrics folder contains folders for each artist, which contains a random selection of song lyrics for each artist (approx 15% of all artist songs).

In [5]:
for artist in artists:
    # Get song names and links for given artist
    songs, song_links = get_artist_songs(artist)
    
    # Pause for a few seconds to not overwhelm azlyrics server
    time.sleep(np.random.randint(2, 8) + np.random.random())
    
    for song, link in zip(songs, song_links):
        # Select random number to determine whether or not to get songs for given song
        n = np.random.random()
        if n >.85 and len(os.listdir(f'lyrics/{artist}')) < 20:
            
            # Write lyrics to file if not error
            try:
                lyrics = get_lyrics(song, link)
                with open(f"lyrics/{artist}/{song.lower().replace(' ', '_')}.txt", 'w') as f:
                    f.write(lyrics)
            except:
                pass
            
            # Pause before checking next song/artist
            time.sleep(np.random.randint(2, 8) + np.random.random())
            
print('Done.')

Done.


## Clean Up
In some cases, the artist name from IMDB does not match with AZLyrics formatting, so no data is gathered, or very limited data is gathered. The code below removes directory folders where artist data was too limited for practical use.

In [6]:
for artist in artists:
    try:
        if len(os.listdir(f'lyrics/{artist}')) < 2:
            os.rmdir(f'lyrics/{artist}')
    except:
        pass

## Get Top 10 Billboard Songs for Each Year Since 1960
The data in Wikipedia is not formatted the same way for each year, so code will have to be modified for different time periods.

### 1960 - 1981

In [43]:
# Create dictionary for storing song and artist data
billboard_dict = {
    'Rank': [],
    'Year': [],
    'Song': [],
    'Artist': []
}

# Get top 10 songs with artists for each year from wikipedia
for yr in range(1960, 1982):
    url = f'https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_{yr}'
    soup = get_soup(url)
    
    # Song & artist are 2nd and 3rd columns in each row of table data
    raw = soup.find_all('td')[:30]
    data = [raw[i].text for i in range(len(raw)) if i%3!=0]

    # Iteratively append data to dictionary
    i=0
    for n in range(10):
        billboard_dict['Rank'].append(i+1)
        billboard_dict['Year'].append(yr)
        billboard_dict['Song'].append(data[n+i].replace('"', '').strip())
        billboard_dict['Artist'].append(data[n+i+1].replace('"', '').strip())
        i+=1
        
    # Pause before scraping next page
    time.sleep(5)

### 1982-2019 (excl. 2012-13)

In [49]:
# Get top 10 songs with artists for each year from wikipedia
for yr in [i for i in range(1982, 2020) if i not in [2012, 2013]]:
    url = f'https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_{yr}'
    soup = get_soup(url)
    
    # Song & artist are 2nd and 3rd columns in each row of table data
    raw = soup.find_all('td')[:30]
    data = [raw[i].find_next('a').text for i in range(len(raw))]

    # Iteratively append data to dictionary
    for n in range(10):
        if n%2==0:
            billboard_dict['Rank'].append(n+1)
            billboard_dict['Year'].append(yr)
            billboard_dict['Song'].append(data[n].replace('"', '').strip())
        else:
            billboard_dict['Artist'].append(data[n].replace('"', '').strip())
        i+=1
        
    # Pause before scraping next page
    time.sleep(5)

# Convert to dataframe and save as CSV
billboard_df = pd.DataFrame.from_dict(billboard_dict).sort_values(['Rank', 'Year'])
print(f'Record Count: {len(billboard_df)}')
billboard_df.head()

Record Count: 400


Unnamed: 0,Rank,Year,Song,Artist
0,1,1960,Theme from A Summer Place,Percy Faith
10,1,1961,Tossin' and Turnin',Bobby Lewis
20,1,1962,Stranger on the Shore,Acker Bilk
30,1,1963,Surfin' U.S.A.,The Beach Boys
40,1,1964,I Want to Hold Your Hand,The Beatles


In [8]:
errors=0
for i in range(len(billboard_df)):
    t0 = time.time()
    Artist = billboard_df.Artist[i]
    Artist = Artist if Artist[:4] != 'The ' else Artist[4:]
    artist = re.sub(r'\W+', '', Artist.lower())
    
    Song = billboard_df.Song[i]
    song = re.sub(r'\W+', '', Song.lower())
    
    url = f"https://www.azlyrics.com/lyrics/{artist}/{song}.html"
    try:
        lyrics = get_lyrics(Song, url)
        if len(lyrics)>50:
            new_dir = f'lyrics/{Artist}'
            time.sleep(np.random.randint(2, 5) + np.random.random())
            try:
                os.makedirs(new_dir)
            except:
                pass

            with open(f"lyrics/{Artist}/{Song.lower().replace(' ', '_')}.txt", 'w') as f:
                        f.write(lyrics)
        
    except:
        errors+=1
        print('-----------Error-----------')
        if errors == 10:
            break
    
    j+=1
    t1 = time.time()
    print(f'{j}. {Artist}: {Song} ({t1-t0:.1f}): {url}')

print('Done')

Done


## Create JSON for Database Structure

In [24]:
db_dict = {}

for artist in os.listdir('lyrics/'):
    if artist != '.DS_Store':
        db_dict[artist]={'song_names': [], 'file_names': []}
        for file in os.listdir(f'lyrics/{artist}'):
            song_name = file.replace('.txt', '').replace('_', ' ').title()
            db_dict[artist]['file_names'].append(file)
            db_dict[artist]['song_names'].append(song_name)
            
with open("lyrics.json", "w") as outfile:  
    json.dump(db_dict, outfile)