# In this notebook I use my dictionary of folk artists mapped to songs in order to scrape the song lyrics for each song and artist.

I'll start by importing the required packages.

In [1]:
import csv
import json
import os
import re
from time import sleep

import numpy as np
import pandas as pd
import requests
from autoextract.sync import request_raw

## Formatting, Editing, Filtering for Lyric Webscraping

In [2]:
# file containing artists - songs mapping
folk_songs_json = "Folk-Artists-Songs-Mapping.json"

# Load the mapping into a dictionary. The encoding is passed explicitly
# because open() otherwise falls back to the platform's locale encoding,
# which can mis-decode non-ASCII characters in artist or song names.
with open(folk_songs_json, encoding="utf-8") as file:
    folk_songs_dict = json.load(file)
    

In [3]:
# checking the keys (artist names); their order matters later on, because the
# keys are replaced by position with the link formatted artist names
folk_songs_dict.keys()

dict_keys(['John Denver', 'Kris Kristofferson', 'Leonard Cohen', 'Paul Simon', 'Cat Stevens (Yusuf)', 'Tracy Chapman', 'Van Morrison', 'James Blunt', 'John Prine', 'Gordon Lightfoot', 'The Tragically Hip', 'George Ezra', 'Woody Guthrie', 'k.d. lang', 'Nick Drake', 'Aimee Mann', 'Passenger', 'Arlo Guthrie', 'Bon Iver', 'Lianne La Havas', 'Bob Dylan', 'Joni Mitchell', 'Neil Young', 'James Taylor', 'Simon & Garfunkel', 'The Lumineers', 'Mumford & Sons', 'Fleet Foxes', 'The Tallest Man On Earth', 'The Head And The Heart', 'Beirut', 'Feist', 'The Decemberists', 'The Civil Wars', 'Band Of Horses', 'Damien Rice', 'Joan Baez', 'Suzanne Vega', 'Carole King', "Sinead O'Connor", 'Nina Simone', 'Judy Collins', 'Arcade Fire', 'The Shins', 'The Smiths', 'Sufjan Stevens', 'Broken Social Scene', 'Pixies', 'girl in red', 'St. Vincent', 'Florence + The Machine', 'Neutral Milk Hotel', 'The Kooks', 'Lord Huron', 'Phoebe Bridgers', 'Mac DeMarco', 'R.E.M.', 'The New Pornographers', 'Toro Y Moi', 'City And C

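Now I have to reformat the artist names in the dictionary into URL-appropriate names. The list below has to stay in the same order as the dictionary keys shown above, because the new names are matched to the old ones by position.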

In [4]:
# Link formatted artist names, written in exactly the same order as the keys
# of folk_songs_dict: the two are paired purely by position further down.
folk_artists_for_link = [
    'johndenver', 'kriskristofferson', 'cohen', 'paulsimon', 'catstevens',
    'tracychapman', 'vanmorrison', 'jamesblunt', 'johnprine',
    'gordonlightfoot', 'tragicallyhip', 'georgeezra', 'woodyguthrie',
    'kdlang', 'nickdrake', 'aimeemann', 'passenger', 'arloguthrie',
    'boniver', 'liannelahavas', 'dylan', 'jonimitchell', 'young',
    'jamestaylor', 'simongarfunkel', 'lumineers', 'mumfordsons',
    'fleetfoxes', 'tallestmanonearth', 'headandtheheart', 'beirut', 'feist',
    'decemberists', 'civilwars', 'bandofhorses', 'rice', 'joanbaez', 'vega',
    'caroleking', 'sineadoconnor', 'ninasimone', 'judycollins', 'arcadefire',
    'shins', 'smiths', 'sufjanstevens', 'brokensocialscene', 'pixies',
    'girlinred', 'stvincent', 'florencethemachine', 'neutralmilkhotel',
    'kooks', 'lordhuron', 'phoebebridgers', 'macdemarco', 'rem',
    'newpornographers', 'toroymoi', 'cityandcolour', 'greatlakeswimmers',
    'menitrust', 'national', 'tomodell', 'ohwonder', 'ofmontreal',
    'wainwright', 'fatherjohnmisty',
]

# zip() silently stops at the shorter of its inputs, so a missing or extra
# name in the list would drop artists (or shift every following song list to
# the wrong artist) without raising anything. Fail loudly instead.
if len(folk_artists_for_link) != len(folk_songs_dict):
    raise ValueError(
        "folk_artists_for_link has {} names but folk_songs_dict has {} artists".format(
            len(folk_artists_for_link), len(folk_songs_dict)
        )
    )

# A repeated link name would make two artists collapse into a single key.
if len(set(folk_artists_for_link)) != len(folk_artists_for_link):
    raise ValueError("folk_artists_for_link contains a duplicated artist name")

# changing keys of dictionary to link formatted versions from the list above
folk_songs_dict = dict(zip(folk_artists_for_link, folk_songs_dict.values()))
folk_songs_dict.keys()

dict_keys(['johndenver', 'kriskristofferson', 'cohen', 'paulsimon', 'catstevens', 'tracychapman', 'vanmorrison', 'jamesblunt', 'johnprine', 'gordonlightfoot', 'tragicallyhip', 'georgeezra', 'woodyguthrie', 'kdlang', 'nickdrake', 'aimeemann', 'passenger', 'arloguthrie', 'boniver', 'liannelahavas', 'dylan', 'jonimitchell', 'young', 'jamestaylor', 'simongarfunkel', 'lumineers', 'mumfordsons', 'fleetfoxes', 'tallestmanonearth', 'headandtheheart', 'beirut', 'feist', 'decemberists', 'civilwars', 'bandofhorses', 'rice', 'joanbaez', 'vega', 'caroleking', 'sineadoconnor', 'ninasimone', 'judycollins', 'arcadefire', 'shins', 'smiths', 'sufjanstevens', 'brokensocialscene', 'pixies', 'girlinred', 'stvincent', 'florencethemachine', 'neutralmilkhotel', 'kooks', 'lordhuron', 'phoebebridgers', 'macdemarco', 'rem', 'newpornographers', 'toroymoi', 'cityandcolour', 'greatlakeswimmers', 'menitrust', 'national', 'tomodell', 'ohwonder', 'ofmontreal', 'wainwright', 'fatherjohnmisty'])

In the following nested for loop I reformat the song names (lowercasing them and removing spaces, punctuation, and symbols) so that each song name is formatted appropriately for building the URLs.

In [5]:

# Characters that may not appear in the url form of a song name: most are
# simply dropped, while "$" becomes "s" and "&" becomes "and".
url_char_table = str.maketrans({
    "(": "", ")": "", ".": "", ",": "", "'": "", "/": "", "?": "",
    "#": "", "$": "s", "&": "and", ":": "", "-": "",
})

for artist_key, song_titles in folk_songs_dict.items():
    # spaces are removed and the name lowercased first, then the table above
    # takes care of the punctuation and symbols
    url_titles = [
        title.replace(" ", "").lower().translate(url_char_table)
        for title in song_titles
    ]

    # dict.fromkeys keeps only the first occurrence of each name, in its
    # original position, so duplicate song names are not added twice
    folk_songs_dict[artist_key] = list(dict.fromkeys(url_titles))

Quickly checking that the formatting looks right...

In [6]:
# display the full mapping of link formatted artist names to formatted song names
folk_songs_dict

{'johndenver': ['theloveofcommonpeople',
  'catchanotherbutterfly',
  'daydream',
  'theballadofspiroagnew',
  'circus',
  'whenimsixtyfour',
  'rhymesandreasons',
  'yellowcat',
  'isabel',
  'youdunstompedonmyheart',
  'myoldman',
  'iwishiknewhowitwouldfeeltobefree',
  'todayisthefirstdayoftherestofmylife',
  'takemetotomorrow',
  'followme',
  'forestlawn',
  'aspenglow',
  'amsterdam',
  'anthemrevelation',
  'stickysummerweather',
  'carolinainmymind',
  'jimmynewman',
  'molly',
  'trembleifyoumust',
  'sailawayhome',
  'thenighttheydroveolddixiedown',
  'mrbojangles',
  'iwishicouldhavebeentherewoodstock',
  'whosegardenwasthis',
  'thegameisover',
  'eleanorrigby',
  'oldfolks',
  'goldenslumbers',
  'sweetsweetlife',
  'jinglebells',
  'poemsprayersandpromises',
  'letitbe',
  'mysweetlady',
  'woodenindian',
  'junk',
  'gospelchanges',
  'takemehomecountryroads',
  'iguesshedratherbeincolorado',
  'sunshineonmyshoulders',
  'aroundandaround',
  'fireandrain',
  'thebox',
  

## Lyric webscraping

In [7]:
# url to scrape the lyrics from; the two {} placeholders are filled in with
# the link formatted artist name and song name (in that order) via .format()
base_url = "https://www.azlyrics.com/lyrics/{}/{}.html"


In the code below I use Zyte's AutoExtract package to scrape the song lyrics from AZLyrics.

In [28]:
# The API key is read from the environment rather than written into the
# notebook, where it would leak to everyone the notebook is shared with.
# Set ZYTE_AUTOEXTRACT_KEY before starting the kernel.
autoextract_api_key = os.environ.get("ZYTE_AUTOEXTRACT_KEY")
if not autoextract_api_key:
    raise RuntimeError(
        "Set the ZYTE_AUTOEXTRACT_KEY environment variable to your AutoExtract API key"
    )

# creating the empty dictionary for artists, songs, and lyrics
lyrics_dict = {'lyrics': [], 'song': [], 'artist': []}
# (artist, song, error) for every song whose lyrics could not be collected,
# so the misses can be inspected or retried after the loop has finished
missed_songs = []

# looping through each artist
for artist, artist_songs in folk_songs_dict.items():
    # looping through each song
    for song in artist_songs:

        # changing the url each time with the next song and artist
        final_url = base_url.format(artist, song)
        query = [{
            'url': final_url,
            'pageType': 'article'
        }]

        # the webscraping part. I use a try statement so
        # that the loop does not stop if it fails to collect some lyrics.
        # Only Exception is caught: a bare except would also swallow
        # KeyboardInterrupt and make this long loop impossible to stop.
        try:
            results = request_raw(query, api_key=autoextract_api_key)

            # getting just the article body (the lyrics themselves)
            lyric = results[0]['article']['articleBody']

            # splitting the lyrics into an array of lyric lines
            cleaned_array = lyric.split('\n')
        except Exception as error:
            missed_songs.append((artist, song, repr(error)))
            print("Missed one, but I'll keep going...", final_url, repr(error))
            continue

        # appending the data to the dictionary
        lyrics_dict['lyrics'].append(cleaned_array)
        lyrics_dict['song'].append(song)
        lyrics_dict['artist'].append(artist)

        # confirmation and update statement
        print("# lyrics acquired: ", len(lyrics_dict['song']))




# lyrics acquired:  1
# lyrics acquired:  2
# lyrics acquired:  3
# lyrics acquired:  4
# lyrics acquired:  5
# lyrics acquired:  6
# lyrics acquired:  7
# lyrics acquired:  8
# lyrics acquired:  9
# lyrics acquired:  10
# lyrics acquired:  11
# lyrics acquired:  12
# lyrics acquired:  13
# lyrics acquired:  14
# lyrics acquired:  15
# lyrics acquired:  16
# lyrics acquired:  17
# lyrics acquired:  18
# lyrics acquired:  19
# lyrics acquired:  20
# lyrics acquired:  21
# lyrics acquired:  22
# lyrics acquired:  23
# lyrics acquired:  24
# lyrics acquired:  25
# lyrics acquired:  26
# lyrics acquired:  27
# lyrics acquired:  28
# lyrics acquired:  29
# lyrics acquired:  30
# lyrics acquired:  31
# lyrics acquired:  32
# lyrics acquired:  33
# lyrics acquired:  34
# lyrics acquired:  35
# lyrics acquired:  36
# lyrics acquired:  37
# lyrics acquired:  38
# lyrics acquired:  39
# lyrics acquired:  40
# lyrics acquired:  41
# lyrics acquired:  42
# lyrics acquired:  43
# lyrics acquired:  

Now I will transform the dictionary into a dataframe in order to take a look.

In [32]:
# build the dataframe one column at a time from the scraped data, keeping
# the column order lyrics, song, artist
lyric_columns = ('lyrics', 'song', 'artist')
folk_lyrics_df = pd.DataFrame({column: lyrics_dict[column] for column in lyric_columns})

In [33]:
# show the dataframe: one row per song whose lyrics were collected
# NOTE(review): the saved output has an "Unnamed: 0" column, which the frame
# built from lyrics_dict would not have -- it looks like the output came from
# a frame re-read from the csv file; confirm by re-running on a fresh kernel
folk_lyrics_df

Unnamed: 0,lyrics,song,artist
0,[Living on free food tickets. Water in the mil...,theloveofcommonpeople,johndenver
1,"[Do you remember days not so very long ago, wh...",catchanotherbutterfly,johndenver
2,"[Yes, I been dreaming about you every day, eac...",daydream,johndenver
3,[I'll sing you a song of Spiro Agnew and all t...,theballadofspiroagnew,johndenver
4,"[If I look like home to you, if I am your sign...",circus,johndenver
...,...,...,...
7699,"[Ancient holy wars, dead religions, holocausts...",thisatombombandme,fatherjohnmisty
7700,[It's less hard than it should be to find a de...,thisisamerica,fatherjohnmisty
7701,"[I found you once, and I'll find you again, Yo...",tor,fatherjohnmisty
7702,"[Flyin' on past in your voodoo mask, High on y...",tos,fatherjohnmisty


Looks great! Now I'll save it to a csv file so I can open it later for data cleaning.

In [37]:
# Where the scraped lyrics are saved (relative to the current working directory).
lyrics_csv_path = 'Desktop/lyric_files/folk_lyrics.csv'

# to_csv does not create missing folders, so make sure the target folder
# exists first; otherwise the scraped data is lost to an OSError.
os.makedirs(os.path.dirname(lyrics_csv_path), exist_ok=True)

# The index is written as well (pandas default), so the file starts with an
# unnamed index column when it is read back later.
folk_lyrics_df.to_csv(path_or_buf=lyrics_csv_path)

------------------
