# SongLyrics.com Lyrics Scraper

**By Josiah Nielsen**

**For: Collecting and Analyzing Big Data**

This notebook contains code for scraping lyrics from multiple genres from SongLyrics.com using Beautiful Soup. 

Despite attempting various methods to avoid being blocked by the server, the largest number of songs I was able to scrape was ~4000. Because of this, I use a supplementary dataset from Kaggle to train my classification models (in the second notebook).

In [None]:
#Import Dependencies
import pandas as pd
import requests
from bs4 import BeautifulSoup
import urllib

In [None]:
#Request the genre index page
#A timeout stops the cell from hanging forever, and raise_for_status() surfaces a
#blocked or failed request here instead of letting later cells parse an error page
page = requests.get("http://www.songlyrics.com/musicgenres.php", timeout=30)
page.raise_for_status()

In [None]:
#Parse the downloaded page body with the html.parser backend
soup = BeautifulSoup(markup=page.content, features='html.parser')

In [None]:
#Collect a (genre name, link suffix) pair from every genre cell on the page
links = [
    (cell.get_text(), cell.find('a').get('href'))
    for cell in soup.find_all(class_='td-item td-text-big')
]

In [None]:
#Genres to scrape; each label must match the scraped text exactly
#(the site renders R&B as 'R&B;', as the output below shows)
genres = [
    'Folk',
    'Rock',
    'R&B;',
    'Hip Hop/Rap',
    'Pop',
    'Jazz',
]
#Show every scraped (genre name, link suffix) pair
links

[('Acoustic', 'acoustic-lyrics.php'),
 ('Alternative', 'alternative-lyrics.php'),
 ('Folk', 'folk-lyrics.php'),
 ('Country', 'country-music-lyrics.php'),
 ('Rock', 'rock-lyrics.php'),
 ('Christian', 'christian-lyrics.php'),
 ('R&B;', 'r-and-b-lyrics.php'),
 ('Hip Hop/Rap', 'hip-hop-rap-lyrics.php'),
 ('Dance', 'dance-lyrics.php'),
 ('Latin', 'latin-lyrics.php'),
 ('Blues', 'blues-lyrics.php'),
 ('Electronic', 'electronic-lyrics.php'),
 ('Funk', 'funk-lyrics.php'),
 ('Jazz', 'jazz-lyrics.php'),
 ('Pop', 'pop-lyrics.php'),
 ('Soul', 'soul-lyrics.php'),
 ('Soundtrack', 'soundtrack-lyrics.php'),
 ('Adult Contemporary', 'adult-contemporary-lyrics.php'),
 ('Classical', 'classical-lyrics.php'),
 ('Reggae', 'reggae-lyrics.php'),
 ('World', 'world-lyrics.php'),
 ('New Age', 'new-age-lyrics.php'),
 ('African', 'african-lyrics.php'),
 ('Ska', 'ska-lyrics.php'),
 ('Avant-Garde', 'avant-garde-lyrics.php'),
 ("Children's Music", 'childrens-music-lyrics.php'),
 ('Holiday', 'holiday-lyrics.php'),
 ('C

In [None]:
#Generate the link suffixes for each genre on songlyrics.com
#Rebuilt from scratch on every run, so re-running the cell never accumulates duplicates
new_genre_link = [entry for entry in links if entry[0] in genres]
#The display cells read genre_links while lyrics_scraper reads new_genre_link,
#so bind both names to the same list
genre_links = new_genre_link

In [None]:
#Drop the last three entries of genre_links.
#NOTE(review): the slice removes three more entries every time this cell runs, and the
#saved output of the next cell still lists six genres - confirm this trim is still wanted.
genre_links = genre_links[:-3]

In [None]:
#Inspect the selected (genre name, link suffix) pairs
genre_links

[('Folk', 'folk-lyrics.php'),
 ('Rock', 'rock-lyrics.php'),
 ('R&B;', 'r-and-b-lyrics.php'),
 ('Hip Hop/Rap', 'hip-hop-rap-lyrics.php'),
 ('Jazz', 'jazz-lyrics.php'),
 ('Pop', 'pop-lyrics.php')]

In [None]:
#Normalise the scraped genre labels: 'R&B;' -> 'R&B' and 'Hip Hop/Rap' -> 'Hip-Hop'
label_fixes = {'R&B;': 'R&B', 'Hip Hop/Rap': 'Hip-Hop'}
#Match on the label instead of on list position, so a different genre selection or
#ordering cannot rename the wrong entry; slice assignment updates the list in place
new_genre_link[:] = [(label_fixes.get(name, name), href) for name, href in new_genre_link]

In [None]:
#Inspect the genre links again after the label changes in the previous cell
genre_links

[('Folk', 'folk-lyrics.php'),
 ('Rock', 'rock-lyrics.php'),
 ('R&B', 'r-and-b-lyrics.php'),
 ('Hip-Hop', 'hip-hop-rap-lyrics.php'),
 ('Jazz', 'jazz-lyrics.php'),
 ('Pop', 'pop-lyrics.php')]

In [None]:
#Generate links for accessing top albums from each genre
album_links = []
#Only the second 'tracklist' table on the page (index 1) is read
for table_index, table in enumerate(soup.find_all(class_='tracklist')):
    if table_index != 1:
        continue
    for i in table.find_all(class_="td-item td-last"):
        print (i)
        print ('\n')
        #Tuple layout: (genre label, text of the last <span>, link title, link href)
        #NOTE(review): the genre label is hard-coded to 'pop' whatever page soup holds
        album_links.append(('pop', i.find_all('span')[-1].get_text(), i.find('a').get('title'), i.find('a').get('href')))

<td class="td-item td-last"><h3><a href="http://www.songlyrics.com/ed-sheeran/-divide/" title="÷ (Divide) Album">÷ (Divide) <span>Album</span></a></h3><span>Ed Sheeran</span></td>


<td class="td-item td-last"><h3><a href="http://www.songlyrics.com/bruno-mars/24k-magic/" title="24K Magic Album">24K Magic <span>Album</span></a></h3><span>Bruno Mars</span></td>


<td class="td-item td-last"><h3><a href="http://www.songlyrics.com/soundtrack/beauty-and-the-beast-ost/" title="Beauty and the Beast OST Album">Beauty and the Beast OST <span>Album</span></a></h3><span>Soundtrack</span></td>


<td class="td-item td-last"><h3><a href="http://www.songlyrics.com/the-shins/heartworms/" title="Heartworms Album">Heartworms <span>Album</span></a></h3><span>The Shins</span></td>


<td class="td-item td-last"><h3><a href="http://www.songlyrics.com/aaron-watson/vaquero/" title="Vaquero Album">Vaquero <span>Album</span></a></h3><span>Aaron Watson</span></td>


<td class="td-item td-last"><h3><a href="http:

In [None]:
#Inspect the collected (genre label, artist, album title, album URL) tuples
album_links

[('pop',
  'Ed Sheeran',
  '÷ (Divide) Album',
  'http://www.songlyrics.com/ed-sheeran/-divide/'),
 ('pop',
  'Bruno Mars',
  '24K Magic Album',
  'http://www.songlyrics.com/bruno-mars/24k-magic/'),
 ('pop',
  'Soundtrack',
  'Beauty and the Beast OST Album',
  'http://www.songlyrics.com/soundtrack/beauty-and-the-beast-ost/'),
 ('pop',
  'The Shins',
  'Heartworms Album',
  'http://www.songlyrics.com/the-shins/heartworms/'),
 ('pop',
  'Aaron Watson',
  'Vaquero Album',
  'http://www.songlyrics.com/aaron-watson/vaquero/'),
 ('pop',
  'Jus Jack',
  'Stargazing Album',
  'http://www.songlyrics.com/jus-jack/stargazing/'),
 ('pop',
  'Justin Hurwitz',
  'La La Land (Original Motion Picture Score) Album',
  'http://www.songlyrics.com/justin-hurwitz/la-la-land-original-motion-picture-score/'),
 ('pop',
  'Chris Stapleton ',
  'Traveller Album',
  'http://www.songlyrics.com/chris-stapleton/traveller/'),
 ('pop',
  'J.J. Hairston',
  'You Deserve It Album',
  'http://www.songlyrics.com/j-j-hai

In [None]:
#Define function that compiles all of the scraping steps into one
def lyrics_scraper(genre_links=None, min_length=300, timeout=30, delay=0.0):
    """Scrape lyrics for every album listed on each genre page.

    For each ``(genre label, link suffix)`` pair the genre page is downloaded,
    the albums in its second ``tracklist`` table are collected, each album page
    is opened to list its songs, and each song page is opened to read the text
    of the element with id ``songLyricsDiv``.

    Rows or pages that lack the expected elements (no link, no tracklist table,
    no lyrics element) are skipped instead of aborting the whole scrape.

    Parameters
    ----------
    genre_links : iterable of (str, str), optional
        Genre label and link suffix pairs. Defaults to the notebook-level
        ``new_genre_link`` list.
    min_length : int, optional
        Lyrics are kept only when they are longer than this many characters.
    timeout : float or None, optional
        Timeout in seconds passed to every ``requests.get`` call.
    delay : float, optional
        Seconds to sleep before each request; ``0`` disables the pause.

    Returns
    -------
    list of list
        One ``[genre, artist, album, song, lyrics]`` row per kept song.
    """
    import time
    from urllib.parse import urljoin

    base_url = 'http://www.songlyrics.com/'
    if genre_links is None:
        genre_links = new_genre_link

    def fetch_soup(url):
        """Download ``url`` and parse the response body with html.parser."""
        if delay:
            time.sleep(delay)
        response = requests.get(url, timeout=timeout)
        return BeautifulSoup(response.content, 'html.parser')

    song_matrix = []
    for genre_entry in genre_links:
        genre_name, genre_suffix = genre_entry[0], genre_entry[1]
        print ()
        genre_soup = fetch_soup(urljoin(base_url, genre_suffix))

        #Only the second 'tracklist' table on a genre page is read
        tables = genre_soup.find_all(class_='tracklist')
        album_cells = tables[1].find_all(class_="td-item td-last") if len(tables) > 1 else []

        album_link = []
        for cell in album_cells:
            anchor = cell.find('a')
            spans = cell.find_all('span')
            if anchor is None or not spans or not anchor.get('href'):
                continue
            album_link.append((genre_name, spans[-1].get_text(), anchor.get('title') or '', anchor.get('href')))

        for genre, artist, album_title, album_url in album_link:
            album_soup = fetch_soup(urljoin(base_url, album_url))
            track_table = album_soup.find(class_='tracklist')
            if track_table is None:
                continue
            for track_row in track_table.find_all('tr'):
                anchor = track_row.find('a')
                if anchor is None or not anchor.get('href'):
                    continue
                song_title = anchor.get('title') or ''
                #urljoin resolves both site-relative ('/...') and absolute song links
                song_soup = fetch_soup(urljoin(base_url, anchor.get('href')))
                lyrics_div = song_soup.find(id='songLyricsDiv')
                if lyrics_div is None:
                    continue
                lyric = lyrics_div.get_text()
                if len(lyric) > min_length:
                    song_matrix.append([
                        genre,
                        artist,
                        album_title.split('Album')[0],
                        song_title.split('Lyrics')[0],
                        lyric,
                    ])
    return song_matrix

In [None]:
#Generate matrix of scraped lyrics
#Runs the whole scrape: lyrics_scraper requests every genre, album and song page in turn
matrix = lyrics_scraper()

In [None]:
#Number of song rows returned by the scraper
len(matrix)

4221

In [None]:
#Build a DataFrame with one row per scraped song and a named column per field
data = pd.DataFrame(
    data=matrix,
    columns=['genre', 'artist', 'album', 'song', 'lyrics'],
)

In [None]:
#Keep only the first row for each distinct song title
data = data.drop_duplicates(subset=['song'], keep='first')

In [None]:
#Count the non-null values in every column for each genre
data.groupby(by='genre').count()

Unnamed: 0_level_0,artist,album,song,lyrics
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Folk,478,478,478,478
Hip-Hop,118,118,118,118
Jazz,1583,1583,1583,1583
Pop,66,66,66,66
R&B,1135,1135,1135,1135
Rock,121,121,121,121


In [None]:
#Preview the first rows instead of rendering the whole DataFrame
data.head(10)

Unnamed: 0,genre,artist,album,song,lyrics
0,Folk,Regina Spektor,Far,Machine,"My eyes are bifocal, my hands are sub jointed\..."
1,Folk,Regina Spektor,Far,Eet,It's like forgetting the words to your favorit...
2,Folk,Regina Spektor,Far,"The Flowers - live, begin to hope tour",The flowers you gave me are rotting\nAnd still...
3,Folk,Regina Spektor,Far,Man Of A Thousand Faces,The man of a thousand faces\nSits down at the ...
4,Folk,Regina Spektor,Far,Man of a Thousand Faces,The man of a thousand faces\nSits down at the ...
5,Folk,Regina Spektor,Far,Blue Lips,"He stumbled into faith and thought, God this i..."
6,Folk,Regina Spektor,Far,Human Of The Year,"Hello, hello, calling a Carl Prejektorinski\nT..."
7,Folk,Regina Spektor,Far,Two Birds,Two birds on a wire\nOne tries to fly away and...
8,Folk,Regina Spektor,Far,Dance Anthem Of The 80s,You are so sweet\nDancing to the beat\n\nThere...
9,Folk,Regina Spektor,Far,Wallet,"I found a wallet, I found a wallet\nInside wer..."


In [None]:
#Save the de-duplicated lyrics to Scraped_Lyrics.csv in the working directory
data.to_csv("Scraped_Lyrics.csv")