In [1]:
import pandas as pd
import numpy as np
import requests
import urllib
import re
import string
from bs4 import BeautifulSoup

In [9]:
#Open the page with all albums.
main_page = 'https://www.azlyrics.com/a/atmosphere.html'
try:
    content = urllib.request.urlopen(main_page).read()
    main_soup = BeautifulSoup(content, 'html.parser')
except Exception as e:
    print('Error'+str(e))
#Pull the list of albums on the page and the associated IDs.
album_list = main_soup.find(id='listAlbum')
albums = [name.string.strip('\"') for name in album_list.findAll('b')]
album_ids = [number.get('id') for number in album_list.findAll(['a']) if len(number)==0]

In [23]:
#Grab the year for each album as well
year_re = re.compile('''\((\d{4})\)''')
years = re.findall(year_re, str(album_list))

In [25]:
#Create a dataframe containing each album and the ID.
albumdf = pd.DataFrame(data={'AlbumName': albums, 'AlbumID': album_ids, 'Year': years})
albumdf

Unnamed: 0,AlbumName,AlbumID,Year
0,Overcast!,7391,1997
1,Lucy Ford: The Atmosphere EPs,5001,2001
2,God Loves Ugly,5005,2002
3,Seven's Travels,3262,2003
4,Headshots: SE7EN,7592,2005
5,You Can't Imagine How Much Fun We're Having,5146,2005
6,Happy Clown Bad Dub 8 / Fun EP,7187,2006
7,Sad Clown Bad Summer #9,6298,2007
8,Sad Clown Bad Fall #10,7048,2007
9,Sad Clown Bad Winter #11,7050,2007


In [43]:
#Need to implement loop/vectorize (if possible) so it can be done on every album.
album_dicts = dict.fromkeys(albumdf['AlbumID'])
albumdf['AlbumID']

0      7391
1      5001
2      5005
3      3262
4      7592
5      5146
6      7187
7      6298
8      7048
9      7050
10     6710
11     6684
12     7049
13     7568
14    10118
15    15442
16    31840
17    43755
18    60625
Name: AlbumID, dtype: object

In [46]:
#Need to implement loop/vectorize (if possible) so it can be done on every album.
album_dicts = dict.fromkeys(albumdf['AlbumID'])
for i,j in enumerate(albumdf['AlbumID']):
    #Find the album.
    findalbum = album_list.findAll(class_='album')[i]
    #Initialize song list.
    songs = []
    #Find where the song titles begin.
    title = findalbum.findNext('a', text=True)
    #Loop through the list until reaching the next album, where there is an ID field.
    if i < len(albumdf)-1:
        while title.get('id') != albumdf['AlbumID'][i+1]:
            songs.append(title.text)
            title = title.findNext('a')
        album_dicts[j] = songs
    else:
        while title.text != 'Sleeping On The Bright Side':
            songs.append(title.text)
            title = title.findNext('a')
        album_dicts[j] = songs

Found: <div class="album">album: <b>"Overcast!"</b> (1997)</div>
Found: <div class="album">EP: <b>"Lucy Ford: The Atmosphere EPs"</b> (2001)</div>
Found: <div class="album">album: <b>"God Loves Ugly"</b> (2002)</div>
Found: <div class="album">album: <b>"Seven's Travels"</b> (2003)</div>
Found: <div class="album">compilation: <b>"Headshots: SE7EN"</b> (2005)</div>
Found: <div class="album">album: <b>"You Can't Imagine How Much Fun We're Having"</b> (2005)</div>
Found: <div class="album">EP: <b>"Happy Clown Bad Dub 8 / Fun EP"</b> (2006)</div>
Found: <div class="album">EP: <b>"Sad Clown Bad Summer #9"</b> (2007)</div>
Found: <div class="album">EP: <b>"Sad Clown Bad Fall #10"</b> (2007)</div>
Found: <div class="album">EP: <b>"Sad Clown Bad Winter #11"</b> (2007)</div>
Found: <div class="album">album: <b>"Strictly Leakage"</b> (2007)</div>
Found: <div class="album">album: <b>"When Life Gives You Lemons, You Paint That Shit Gold"</b> (2008)</div>
Found: <div class="album">EP: <b>"Sad Clown 

In [47]:
#Add song counts to the albumdf.
albumdf['Songs'] = albumdf['AlbumID'].map(len)

In [48]:
#Get list of all songs.
all_songs = [name for sublist in album_dicts.values() for name in sublist]
songdf = pd.DataFrame({'SongName': all_songs})
songdf['SongName']

0                                         1597
1                            Brief Description
2                               Current Status
3                                Complications
4                                      4:30 AM
5                                       Adjust
6                                         Clay
7                           Sound Is Vibration
8                                    Multiples
9                                    Scapegoat
10                                         WND
11                         Multiples (Reprise)
12                                    Caved In
13                       Cuando Limpia El Humo
14                                The Outernet
15                                      Primer
16                           Between The Lines
17                                  Like Today
18                         Tears For The Sheep
19                         Guns And Cigarettes
20            Don't Ever Fucking Question That
21           

In [49]:
#Function to clean up the song names.
def clean_names(string):
    pattern = re.compile('[\W_]+')
    string = pattern.sub('', string.lower())
    return string

In [50]:
#Function to match albums and songs.
album_id_values = albumdf['AlbumID'].tolist()
def matchAlbum(name):
    for k in album_id_values:
        if name in album_dicts[k]:
            return k

In [51]:
#Create a new column with the cleaned up names.
songdf['CleanedName'] = songdf['SongName'].map(clean_names)
#Create column of album IDs.
songdf['AlbumID'] = songdf['SongName'].map(matchAlbum)
#And another for the different song pages.
songdf['SongPage'] = songdf['CleanedName'].apply(
    lambda name: 'https://www.azlyrics.com/lyrics/atmosphere/'+name+'.html')
songdf

Unnamed: 0,SongName,CleanedName,AlbumID,SongPage
0,1597,1597,7391,https://www.azlyrics.com/lyrics/atmosphere/159...
1,Brief Description,briefdescription,7391,https://www.azlyrics.com/lyrics/atmosphere/bri...
2,Current Status,currentstatus,7391,https://www.azlyrics.com/lyrics/atmosphere/cur...
3,Complications,complications,7391,https://www.azlyrics.com/lyrics/atmosphere/com...
4,4:30 AM,430am,7391,https://www.azlyrics.com/lyrics/atmosphere/430...
5,Adjust,adjust,7391,https://www.azlyrics.com/lyrics/atmosphere/adj...
6,Clay,clay,7391,https://www.azlyrics.com/lyrics/atmosphere/cla...
7,Sound Is Vibration,soundisvibration,7391,https://www.azlyrics.com/lyrics/atmosphere/sou...
8,Multiples,multiples,7391,https://www.azlyrics.com/lyrics/atmosphere/mul...
9,Scapegoat,scapegoat,7391,https://www.azlyrics.com/lyrics/atmosphere/sca...


In [10]:
#Next, get a lexicon for every song by iterating through the pages

In [10]:
def lyricRetrieval(givenurl):
    songurl = givenurl
    try:
        song_content = urllib.request.urlopen(songurl).read()
        song_soup = BeautifulSoup(song_content, 'html.parser')
    except Exception as e:
        print('Error'+str(e))
    mess = song_soup.findAll('b', text=True)[1].findNext('div').text
    pattern = re.compile('[\W_]+')
    new_mess = pattern.sub(' ', mess.lower().replace("'", ""))
    return new_mess

In [12]:
#Faster way to do this below, it crashes the site/gets me IP blocked. Needed to slow down the requests.
#songdf['Lyrics'] = songdf['SongPage'].map(lyricRetrieval)

#Try this instead
import time

lyrics = []
for add in songdf['SongPage']:
    print(add)
    lyrics.append(lyricRetrieval(add))
    time.sleep(1+np.random.randn())

https://www.azlyrics.com/lyrics/atmosphere/likeafire.html
Error''


UnboundLocalError: local variable 'song_soup' referenced before assignment

Both methods above resulted in IP blocking from multiple locations (tried on several IP addresses). Went to Genius and their API with the data from here. Used the same songs and albums retrieved from AZLyrics.

In [56]:
#Use this to retrieve the relevant song page where the lyrics are
def songInfoRetrieve(title, artist_name):
    site_url = 'http://api.genius.com'
    headers = {'Authorization': 'Bearer ' + 'IfomKosHgz_77SarGL77k2HXxxSzTQs9Q4ru6GvcgYGCfYwkqhFNdSHwD1icClqh'}
    search_url = site_url + '/search'
    params = {'q': title + ' ' + artist_name}
    response = requests.get(search_url, params=params, headers=headers)

    json = response.json()
    remote_song_info = None

    for hit in json['response']['hits']:
        if artist_name.lower() in hit['result']['primary_artist']['name'].lower():
            remote_song_info = hit
            break
    if remote_song_info:
        song_url = remote_song_info['result']['url']
        print(song_url)
        return (song_url, response)
    else:
        return ('No URL found', response)

In [57]:
#Use this to retrieve the lyrics and clean up any non-alphanumeric characters
pattern = re.compile('[\W_]+')
def scrapeSongUrl(url):
    page = requests.get(url)
    html = BeautifulSoup(page.text, 'html.parser')
    messy_lyrics = html.find('div', class_='lyrics').get_text()
    #pattern = re.compile('[\W_]+')
    lyrics = pattern.sub(' ', messy_lyrics.lower().replace("\'", ""))
    return lyrics

In [58]:
#Ok, have the lyric retrieval working. Need to setup storage, and cleaning.

#Use earlier df to keep the songs and album ids together.
geniusdf = songdf[['SongName', 'AlbumID']]

In [59]:
#Create status code and url columns. Can check and remove any non-existent urls based on status codes.
geniusdf['SongResponse'] = geniusdf['SongName'].apply(lambda title: songInfoRetrieve(title, 'Atmosphere')[1].status_code)
geniusdf['SongURL'] = geniusdf['SongName'].apply(lambda title: songInfoRetrieve(title, 'Atmosphere')[0])

https://genius.com/Atmosphere-1597-lyrics
https://genius.com/Atmosphere-brief-description-lyrics
https://genius.com/Atmosphere-current-status-lyrics
https://genius.com/Atmosphere-complications-lyrics
https://genius.com/Atmosphere-4-30-am-lyrics
https://genius.com/Atmosphere-adjust-lyrics
https://genius.com/Atmosphere-clay-lyrics
https://genius.com/Atmosphere-sound-is-vibration-lyrics
https://genius.com/Atmosphere-multiples-lyrics
https://genius.com/Atmosphere-scapegoat-lyrics
https://genius.com/Atmosphere-wnd-lyrics
https://genius.com/Atmosphere-multiples-reprise-lyrics
https://genius.com/Atmosphere-caved-in-lyrics
https://genius.com/Atmosphere-cuando-limpia-el-humo-lyrics
https://genius.com/Atmosphere-the-outernet-lyrics
https://genius.com/Atmosphere-primer-lyrics
https://genius.com/Atmosphere-between-the-lines-lyrics
https://genius.com/Atmosphere-like-today-lyrics
https://genius.com/Atmosphere-tears-for-the-sheep-lyrics
https://genius.com/Atmosphere-guns-and-cigarettes-lyrics
https:/

https://genius.com/Atmosphere-in-her-music-box-lyrics
https://genius.com/Atmosphere-vanity-sick-lyrics
https://genius.com/Atmosphere-keyboard-lyrics
https://genius.com/Atmosphere-less-one-lyrics
https://genius.com/Atmosphere-good-daddy-lyrics
https://genius.com/Atmosphere-carry-me-home-lyrics
https://genius.com/Atmosphere-happy-mess-lyrics
https://genius.com/Atmosphere-not-another-day-lyrics
https://genius.com/Atmosphere-cmon-lyrics
https://genius.com/Atmosphere-they-always-know-lyrics
https://genius.com/Atmosphere-the-ropes-lyrics
https://genius.com/Atmosphere-white-noise-lyrics
https://genius.com/Atmosphere-feel-good-hit-of-the-summer-part-2-lyrics
https://genius.com/Atmosphere-mothers-day-lyrics
https://genius.com/Atmosphere-millie-fell-off-the-fire-escape-lyrics
https://genius.com/Atmosphere-until-the-nipples-gone-lyrics
https://genius.com/Atmosphere-the-major-leagues-lyrics
https://genius.com/Atmosphere-scalp-lyrics
https://genius.com/Atmosphere-the-best-day-lyrics
https://genius.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


https://genius.com/Atmosphere-1597-lyrics
https://genius.com/Atmosphere-brief-description-lyrics
https://genius.com/Atmosphere-current-status-lyrics
https://genius.com/Atmosphere-complications-lyrics
https://genius.com/Atmosphere-4-30-am-lyrics
https://genius.com/Atmosphere-adjust-lyrics
https://genius.com/Atmosphere-clay-lyrics
https://genius.com/Atmosphere-sound-is-vibration-lyrics
https://genius.com/Atmosphere-multiples-lyrics
https://genius.com/Atmosphere-scapegoat-lyrics
https://genius.com/Atmosphere-wnd-lyrics
https://genius.com/Atmosphere-multiples-reprise-lyrics
https://genius.com/Atmosphere-caved-in-lyrics
https://genius.com/Atmosphere-cuando-limpia-el-humo-lyrics
https://genius.com/Atmosphere-the-outernet-lyrics
https://genius.com/Atmosphere-primer-lyrics
https://genius.com/Atmosphere-between-the-lines-lyrics
https://genius.com/Atmosphere-like-today-lyrics
https://genius.com/Atmosphere-tears-for-the-sheep-lyrics
https://genius.com/Atmosphere-guns-and-cigarettes-lyrics
https:/

https://genius.com/Atmosphere-in-her-music-box-lyrics
https://genius.com/Atmosphere-vanity-sick-lyrics
https://genius.com/Atmosphere-keyboard-lyrics
https://genius.com/Atmosphere-less-one-lyrics
https://genius.com/Atmosphere-good-daddy-lyrics
https://genius.com/Atmosphere-carry-me-home-lyrics
https://genius.com/Atmosphere-happy-mess-lyrics
https://genius.com/Atmosphere-not-another-day-lyrics
https://genius.com/Atmosphere-cmon-lyrics
https://genius.com/Atmosphere-they-always-know-lyrics
https://genius.com/Atmosphere-the-ropes-lyrics
https://genius.com/Atmosphere-white-noise-lyrics
https://genius.com/Atmosphere-feel-good-hit-of-the-summer-part-2-lyrics
https://genius.com/Atmosphere-mothers-day-lyrics
https://genius.com/Atmosphere-millie-fell-off-the-fire-escape-lyrics
https://genius.com/Atmosphere-until-the-nipples-gone-lyrics
https://genius.com/Atmosphere-the-major-leagues-lyrics
https://genius.com/Atmosphere-scalp-lyrics
https://genius.com/Atmosphere-the-best-day-lyrics
https://genius.

In [65]:
#Check if any songs are missing from the API
geniusdf[geniusdf['SongURL'] == 'No URL found']

#Remove them from the dataset
geniusdf.drop(list(geniusdf[geniusdf['SongURL'] == 'No URL found'].index), inplace=True)
geniusdf.reset_index(drop=True, inplace=True)

In [66]:
#Every song will have a url, so we can now grab the data from each page.
geniusdf['SongLyrics'] = geniusdf['SongURL'].map(scrapeSongUrl)

In [67]:
geniusdf

Unnamed: 0,SongName,AlbumID,SongResponse,SongURL,SongLyrics
0,1597,7391,200,https://genius.com/Atmosphere-1597-lyrics,slug henceforth step within my psychoanalysis...
1,Brief Description,7391,200,https://genius.com/Atmosphere-brief-descriptio...,sample have you heard it sing along if you di...
2,Current Status,7391,200,https://genius.com/Atmosphere-current-status-l...,beyond now known as musab i peep rap city hig...
3,Complications,7391,200,https://genius.com/Atmosphere-complications-ly...,intro spawn spoken yo this is dedicated to th...
4,4:30 AM,7391,200,https://genius.com/Atmosphere-4-30-am-lyrics,el p so tootsie roll the fuck back to your se...
5,Adjust,7391,200,https://genius.com/Atmosphere-adjust-lyrics,verse 1 slug i woke up on the wrong side of m...
6,Clay,7391,200,https://genius.com/Atmosphere-clay-lyrics,verse 1 slug when i first landed the damage w...
7,Sound Is Vibration,7391,200,https://genius.com/Atmosphere-sound-is-vibrati...,slug im sparked waiting for the dark to hit c...
8,Multiples,7391,200,https://genius.com/Atmosphere-multiples-lyrics,sample lionel hampton right now we gonna get ...
9,Scapegoat,7391,200,https://genius.com/Atmosphere-scapegoat-lyrics,verse 1 slug its the caffeine the nicotine th...


In [68]:
#Save the data
geniusdf.to_csv('atmosphere_song_lyrics_2019', header=True)
albumdf.to_csv('atmosphere_album_data_2019', header=True)

Successfully retrieved all the songs and the lyrics from Genius.com. Now to explore and see how we can apply NLP to the data.