# Scrape azlyrics.com

This notebook scrapes azlyrics.com for all of the lyrics by a given artist. The functions defined here need only the URL of the artist's page. In this notebook I scrape Beatles lyrics, which are analyzed in the lyric modelling and exploration notebook in this repository.

Last run with Python 3 on February 22, 2017

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import pickle
import re
import difflib


In [7]:
# Site root; song pages are addressed relative to it.
base_url = 'http://www.azlyrics.com'

# Artist index page listing every Beatles album and song.
beatles_url = 'http://www.azlyrics.com/b/beatles.html'

# Browser-like request headers, selected by index when a page is fetched.
headers = [
    {'User-Agent': agent}
    for agent in (
        'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0',
        'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
    )
]


In [8]:
# Beautiful Soup Functions

def get_bs(url, header_index=0, timeout=30):
    """Download ``url`` and parse the response body with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    header_index : int, optional
        Index into the module-level ``headers`` list selecting which
        request headers to send. Defaults to the first entry so callers
        that pass only a URL keep working.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up.
        Without a timeout a stalled connection would block the notebook
        indefinitely.

    Returns
    -------
    BeautifulSoup
        The page parsed with the built-in ``"html.parser"``.

    Raises
    ------
    IndexError
        If ``header_index`` is outside the ``headers`` list.
    requests.exceptions.RequestException
        If the request fails or times out.
    """
    response = requests.get(url, headers=headers[header_index], timeout=timeout)
    return BeautifulSoup(response.text, "html.parser")


def album_song_urls(songs, albums):
    """Collect the song URL tails found under the first ``len(albums)`` sections.

    ``songs`` is walked in order. An element that has an ``'href'`` key
    contributes that href with its first two characters removed. An
    element without one (``KeyError`` on lookup) is treated as the start
    of a new section; once more sections have been seen than there are
    entries in ``albums``, collection stops.

    Parameters
    ----------
    songs : sequence
        Items supporting ``item['href']`` that raise ``KeyError`` when
        the attribute is missing (presumably BeautifulSoup ``<a>`` tags;
        verify against the caller).
    albums : sequence
        One entry per section to include. Only its length is used.

    Returns
    -------
    list of str
        URL tails in page order.
    """
    url_list = []
    sections_seen = 0
    for anchor in songs:
        try:
            href = anchor['href']
        except KeyError:
            # A link-less anchor opens the next section; stop at the
            # first section beyond the albums we were given.
            if sections_seen >= len(albums):
                break
            sections_seen += 1
            continue
        url_list.append(href[2:])
    return url_list


def get_lyrics(url_tail, base_url=base_url, header_index=0):
    """Fetch one song page and extract its title, writers and lyric lines.

    Parameters
    ----------
    url_tail : str
        Path of the song page, appended to ``base_url``.
    base_url : str, optional
        Site root the path is appended to.
    header_index : int, optional
        Which entry of the module-level ``headers`` list to send with
        the request (forwarded to ``get_bs``).

    Returns
    -------
    list
        ``[title, writers, lyrics]`` where ``title`` is a string,
        ``writers`` is a list of names (``['NA']`` when the page has no
        "Writer(s):" line) and ``lyrics`` is the lyric text split on
        newlines.
    """
    url = base_url + url_tail
    # get_bs needs to know which headers to send; pass the index explicitly.
    soup = get_bs(url, header_index)
    # The title is the second sibling after the 'ringtone' element; the
    # slice drops its first and last character (presumably surrounding
    # quotes - confirm against the page markup).
    title = soup.find(class_='ringtone').nextSibling.nextSibling.text[1:-1]
    try:
        # Raw string so that \( and \) reach the regex engine unchanged.
        credit = soup.find_all(string=re.compile(r'^Writer\(s\):'))[0]
        # Skip the 11-character "Writer(s): " prefix, then trim each name.
        writers = [name.strip() for name in credit[11:].split(",")]
    except IndexError:
        writers = ['NA']
    # NOTE(review): the lyrics are taken from the 7th <div> of the main
    # column, which depends on the site's layout - confirm if the page changes.
    lyrics = soup.find(class_="col-xs-12 col-lg-8 text-center").find_all('div')[6].text.split("\n")
    return [title, writers, lyrics]


In [10]:
# Getting the urls of each song

beatles_soup = get_bs(beatles_url,0)

# Inputs for album_song_urls. They were not defined anywhere in the
# notebook, so this cell failed on a fresh kernel.
# NOTE(review): assumes the artist page keeps its listing in an element
# with id 'listAlbum', with one class-'album' element per section and
# href-less <a> tags separating the sections - confirm against the live page.
album_listing = beatles_soup.find(id='listAlbum')
songs = album_listing.find_all('a')
album_year = [work.text for work in album_listing.find_all(class_='album')]

beatles_urls = album_song_urls(songs,album_year)
#print(beatles_urls)


In [14]:
# Acquiring lyrics from each song page

# One [title, writers, lyrics] entry per song URL, in the order the URLs were collected.
beatles_lyrics = [get_lyrics(url_tail) for url_tail in beatles_urls]


In [None]:
# Lyric Parsing

def parse_stanzas_lines(artist_lyrics):
    """Flatten scraped songs into one row per non-empty lyric line.

    Each song is indexed as ``song[0]`` (title) and ``song[2]`` (list of
    lyric lines). The first two entries of the line list are skipped. An
    empty line starts a new stanza; every other line yields a row
    ``[title, 'stanza_N', 'stanza_line_N', 'song_line_N', text]`` where
    ``text`` is lower-cased, has every character that is neither a word
    character nor a space replaced by a space, and is stripped.
    """
    not_word_or_space = re.compile(r'[^\w ]')
    rows = []
    for song in artist_lyrics:
        stanza_no = 1
        line_in_stanza = 1
        line_in_song = 1
        for raw in song[2][2:]:
            if not raw:
                # Blank line: move to the next stanza and restart its counter.
                stanza_no += 1
                line_in_stanza = 1
                continue
            cleaned = not_word_or_space.sub(' ', raw.lower()).strip()
            rows.append([
                song[0],
                'stanza_{}'.format(stanza_no),
                'stanza_line_{}'.format(line_in_stanza),
                'song_line_{}'.format(line_in_song),
                cleaned,
            ])
            line_in_stanza += 1
            line_in_song += 1
    return rows


In [11]:
# Parsing beatles lyrics to put in Pandas DataFrame...

beatles_lyrics_parsed = parse_stanzas_lines(beatles_lyrics)


lyrics_df_cols = ['song','stanza','stanza_line','song_line','lyric']
# Build the frame from the rows parsed just above. The original passed
# `beatles_parsed`, a name this notebook never defines.
df_lyrics = pd.DataFrame(beatles_lyrics_parsed,columns=lyrics_df_cols)


df_lyrics.head()

Unnamed: 0,song,stanza,stanza_line,song_line,lyric
0,I Saw Her Standing There,stanza_1,stanza_line_1,song_line_1,1 2 3 4
1,I Saw Her Standing There,stanza_2,stanza_line_1,song_line_2,well she was just 17
2,I Saw Her Standing There,stanza_2,stanza_line_2,song_line_3,you know what i mean
3,I Saw Her Standing There,stanza_2,stanza_line_3,song_line_4,and the way she looked was way beyond compare
4,I Saw Her Standing There,stanza_2,stanza_line_4,song_line_5,so how could i dance with another ooh


### Adding csv of songwriter information

In [13]:
# Songwriter table shipped alongside the notebook; the file is read as ISO-8859-1.
df_writers = pd.read_csv(
    'beatles_songwriters.csv',
    encoding='iso-8859-1',
)

df_writers.head()

Unnamed: 0,album,release_date,song,writer,lead_singer
0,Please Please Please Me,March 22 1963,I Saw Her Standing There,McCartney,McCartney
1,Please Please Please Me,March 22 1963,Misery,Lennon/McCartney,Lennon/McCartney
2,Please Please Please Me,March 22 1963,Anna (Go to Him),Alexander,Lennon
3,Please Please Please Me,March 22 1963,Chains,Goffin/King,Harrison
4,Please Please Please Me,March 22 1963,Boys,Dixon/Farrell,Starr


In [15]:
### Fuzzy Match both dataframes

# Titles the scraped song names are matched against.
writer_titles = df_writers['song'].tolist()


def _closest_writer_title(title):
    """Return the songwriter-table title closest to ``title``.

    Falls back to ``title`` itself when difflib finds no close match,
    instead of raising IndexError on an empty result.
    """
    candidates = difflib.get_close_matches(title, writer_titles, n=1)
    return candidates[0] if candidates else title


# Match each distinct title once rather than once per lyric line.
title_map = {title: _closest_writer_title(title)
             for title in df_lyrics['song'].unique()}
df_lyrics['song'] = df_lyrics['song'].map(title_map)

# Join on the song title. Titles left unmatched above have no partner in
# df_writers and therefore drop out of this inner merge.
df = df_writers.merge(df_lyrics, on='song')
df = df.loc[df["lead_singer"]!='Instrumental']


df.head()

Unnamed: 0,album,release_date,song,writer,lead_singer,stanza,stanza_line,song_line,lyric
0,Please Please Please Me,March 22 1963,I Saw Her Standing There,McCartney,McCartney,stanza_1,stanza_line_1,song_line_1,1 2 3 4
1,Please Please Please Me,March 22 1963,I Saw Her Standing There,McCartney,McCartney,stanza_2,stanza_line_1,song_line_2,well she was just 17
2,Please Please Please Me,March 22 1963,I Saw Her Standing There,McCartney,McCartney,stanza_2,stanza_line_2,song_line_3,you know what i mean
3,Please Please Please Me,March 22 1963,I Saw Her Standing There,McCartney,McCartney,stanza_2,stanza_line_3,song_line_4,and the way she looked was way beyond compare
4,Please Please Please Me,March 22 1963,I Saw Her Standing There,McCartney,McCartney,stanza_2,stanza_line_4,song_line_5,so how could i dance with another ooh


In [16]:
# Pickling

# Persist the merged frame so it can be reloaded without re-scraping.
with open('df_beatles.pkl', 'wb') as pickle_out:
    pickle.dump(df, pickle_out)

#with open("df_beatles.pkl", 'rb') as picklefile: 
#    df = pickle.load(picklefile)

In [17]:
# Other helper functions that are not used above (but could be helpful)

# Some of them are followed by a commented-out example of how they could be used




def artist_albums(soup):
    """List the album labels found on an artist page.

    Looks up the element with id ``'listAlbum'``, takes every descendant
    with class ``'album'`` whose text starts with ``'album'``, and
    returns that text with its first seven characters removed
    (presumably the ``'album: '`` prefix - confirm against the page).
    """
    entries = soup.find(id='listAlbum').find_all(class_='album')
    return [entry.text[7:] for entry in entries if entry.text.startswith('album')]

#beatles_albums = artist_albums(beatles_soup)
#beatles_albums = beatles_albums[:-1]
#print(beatles_albums)



def artist_songs(soup):
    """Return the ``<a>`` elements under id ``'listAlbum'`` that carry an href.

    Anchors without an ``href`` attribute are left out; the remaining
    elements are returned in document order.
    """
    anchors = soup.find(id='listAlbum').find_all('a')
    return [anchor for anchor in anchors if anchor.has_attr('href')]

#beatles_songs = artist_songs(beatles_soup)
#print(beatles_songs)



def parse_lines(artist_lyrics):
    """Flatten scraped songs into ``[label, text]`` pairs, one per non-empty line.

    For each song, ``song[0]`` is the title and ``song[2]`` the list of
    lyric lines, whose first two entries are skipped. Empty lines are
    ignored. ``label`` is ``'<title> - line N'`` with N counting
    non-empty lines from 1; ``text`` is the lower-cased line with every
    character that is neither a word character nor a space removed, then
    stripped.
    """
    not_word_or_space = re.compile(r'[^\w ]')
    parsed = []
    for song in artist_lyrics:
        non_blank = (text for text in song[2][2:] if text)
        for position, text in enumerate(non_blank, start=1):
            label = song[0]+' - line {}'.format(position)
            parsed.append([label, not_word_or_space.sub('', text.lower()).strip()])
    return parsed

#beatles_lyrics_parsed = parse_lines(beatles_lyrics)


def parse_stanzas(artist_lyrics):
    """Group each song's lyric lines into stanzas separated by blank lines.

    For each song, ``song[0]`` is the title and ``song[2]`` the list of
    lyric lines, whose first two entries are skipped. Lines are joined
    with spaces until an empty line is met, at which point a row
    ``[title, 'stanza_N', text]`` is emitted. ``text`` is lower-cased,
    has every character that is neither a word character nor a space
    replaced by a space, and is stripped.

    A stanza still pending when the lines run out is emitted as well, so
    lyrics that do not end with a blank line keep their last stanza.

    Returns
    -------
    list of list
        One ``[title, stanza_label, text]`` row per stanza, in order.
    """
    stanzas = []
    for song in artist_lyrics:
        stz_num = 1
        stz = ''
        for line in song[2][2:]:
            if not line:
                stanzas.append([song[0],
                                'stanza_{}'.format(stz_num),
                                re.sub(r'[^\w ]', ' ', stz.lower()).strip()])
                stz = ''
                stz_num += 1
            else:
                stz+=line+' '
        if stz:
            # No closing blank line: flush the stanza that would
            # otherwise be silently dropped.
            stanzas.append([song[0],
                            'stanza_{}'.format(stz_num),
                            re.sub(r'[^\w ]', ' ', stz.lower()).strip()])
    return stanzas

#beatles_lyrics_parsed = parse_stanzas(beatles_lyrics)
