# In this notebook I use my dictionary of pop artists mapped to songs in order to scrape the song lyrics for each song and artist.

I'll start by importing the required packages.

In [1]:
import requests
from time import sleep
import csv
import json
import re
import pandas as pd
import numpy as np

## Formatting, Editing, Filtering for Lyric Webscraping

In [2]:
# file containing artists - songs mapping
folk_songs_json = "Pop-Artists-Songs-Mapping.json"
folk_songs_dict = {}

with open(folk_songs_json) as file:
    pop_songs_dict = json.load(file)
    

In [3]:
# checking the keys (artist names)
pop_songs_dict.keys()

dict_keys(['Taylor Swift', 'Lady Gaga', 'Beyonce (Beyoncé)', 'Ariana Grande', 'Justin Bieber', 'Katy Perry', 'Michael Jackson', 'Billie Eilish', 'Madonna', 'Selena Gomez', 'Rihanna', 'Miley Cyrus', 'Dua Lipa', 'Demi Lovato', 'Lana Del Rey', 'Bruno Mars', 'Britney Spears', 'Ed Sheeran', 'Shawn Mendes', 'Adele', 'Halsey', 'Justin Timberlake', 'Maroon 5', 'Pink (P!nk)', 'Sam Smith', 'One Direction', 'Lorde', 'Ellie Goulding', 'Beach Boys', 'Elton John', 'The Beatles', 'Ava Max', 'Harry Styles', 'Sia', 'Christina Aguilera', 'Khalid', 'Imagine Dragons', 'Backstreet Boys', 'Jennifer Lopez', 'Janet Jackson', 'Cher', 'Whitney Houston', 'Jackson 5', 'Prince', 'Tina Turner', 'Kelly Clarkson', "'N Sync", 'Gwen Stefani', 'Black Eyed Peas', 'Jonas Brothers', 'Usher', 'Sheryl Crow', 'New Kids On The Block', 'Phil Collins', 'Fergie', 'Amy Winehouse', 'ABBA', 'Jessica Simpson', 'Nick Jonas', 'George Michael', 'Avril Lavigne', 'John Mayer', 'Kesha (Ke$ha)', 'Toto', 'Lizzo', 'Carly Rae Jepsen', 'Twenty 

Now I have to reformat the artist names in the dictionary to the url appropriate names...

In [4]:
pop_artists_for_link = ['taylorswift', 'ladygaga', 'knowles', 'arianagrande', 'justinbieber',
                    'katyperry', 'jackson', 'billieeilish', 'madonna', 'selenagomez',
                    'rihanna', 'mileycyrus', 'dualipa', 'demilovato', 'lanadelrey',
                    'brunomars', 'spears', 'edsheeran', 'shawnmendes', 'adele', 
                    'halsey', 'timberlake', 'maroon5', 'pink', 
                    'samsmith', 'onedirection', 'lorde', 'elliegoulding', 'beachboys', 
                    'john', 'beatles', 'avamax', 'harrystyles', 'sia', 'aguilera',
                    'khalid', 'imaginedragons', 'bsb', 'lopez', 'janete',
                    'cher', 'houston', 'jackson5', 'prince', 'turner', 'clarkson',
                    "nsync", 'gwenstefani', 'blackeyedpeas', 'jonasbrothers', 'usher',
                    'crow', 'newkids', 'collins',
                    'fergie', 'amywinehouse', 'abba', 'simpson', 'nickjonas', 'michael',
                    'lavigne', 'mayer', 'keha', 'toto', 'lizzo', 
                    'carlyraejepsen', 'twentyonepilots', 'fifthharmony', 'robthomas',
                    'chainsmokers', 'jamesblunt', 'bedingfieldn', 'charlieputh', 
                    'ritaora', 'charlixcx', 'fun', 'niallhoran', 'lewiscapaldi',
                    'dnce', 'lukasgraham', 'tovelo', 'jamesarthur', 'banks', 'jessiereyez',
                    'maggierogers', 'mabel', 'avamax', 'kiiara', 'ellamai', 'annemarie',
                    'alessiacara', 'juliamichaels', 'torikelly', 'knowless']

  
# changing keys of dictionary to formatting for link
pop_songs_dict = dict(zip(pop_artists_for_link, list(pop_songs_dict.values())))
pop_songs_dict.keys()

dict_keys(['taylorswift', 'ladygaga', 'knowles', 'arianagrande', 'justinbieber', 'katyperry', 'jackson', 'billieeilish', 'madonna', 'selenagomez', 'rihanna', 'mileycyrus', 'dualipa', 'demilovato', 'lanadelrey', 'brunomars', 'spears', 'edsheeran', 'shawnmendes', 'adele', 'halsey', 'timberlake', 'maroon5', 'pink', 'samsmith', 'onedirection', 'lorde', 'elliegoulding', 'beachboys', 'john', 'beatles', 'avamax', 'harrystyles', 'sia', 'aguilera', 'khalid', 'imaginedragons', 'bsb', 'lopez', 'janete', 'cher', 'houston', 'jackson5', 'prince', 'turner', 'clarkson', 'nsync', 'gwenstefani', 'blackeyedpeas', 'jonasbrothers', 'usher', 'crow', 'newkids', 'collins', 'fergie', 'amywinehouse', 'abba', 'simpson', 'nickjonas', 'michael', 'lavigne', 'mayer', 'keha', 'toto', 'lizzo', 'carlyraejepsen', 'twentyonepilots', 'fifthharmony', 'robthomas', 'chainsmokers', 'jamesblunt', 'bedingfieldn', 'charlieputh', 'ritaora', 'charlixcx', 'fun', 'niallhoran', 'lewiscapaldi', 'dnce', 'lukasgraham', 'tovelo', 'jamesa

In the following nested for loop I reformat the song names (removing punctuation and symbols) in order for the song name to be appropriately formatted for looping through the urls.

In [5]:

for key in pop_songs_dict:
    
    pop_songs = []
    
    for x in pop_songs_dict[key]:
        x = x.replace(" ", "").lower()
        x = x.replace("(", "")
        x = x.replace(")", "")
        x = x.replace(".", "")
        x = x.replace(",", "")
        x = x.replace("'", "")
        x = x.replace("/", "")
        x = x.replace("?", "")
        x = x.replace("#", "")
        x = x.replace("$", "s")
        x = x.replace("&", "and")
        x = x.replace(":", "")
        x = x.replace("-", "")
        
        # if statement to prevent duplicate song names from being added
        if x not in pop_songs:
                pop_songs.append(x)
            
        
    pop_songs_dict[key] = pop_songs

Quickly taking a look that the formatting looks right...

In [6]:
pop_songs_dict

{'taylorswift': ['timmcgraw',
  'picturetoburn',
  'teardropsonmyguitar',
  'aplaceinthisworld',
  'coldasyou',
  'theoutside',
  'tiedtogetherwithasmile',
  'staybeautiful',
  'shouldvesaidno',
  'maryssongohmymymy',
  'oursong',
  'imonlymewhenimwithyoudeluxeedition',
  'invisibledeluxeedition',
  'aperfectlygoodheartdeluxeedition',
  'lastchristmas',
  'christmaseswhenyouweremine',
  'santababy',
  'silentnight',
  'christmasmustbesomethingmore',
  'whitechristmas',
  'jumpthenfallplatinumedition',
  'untouchableplatinumedition',
  'foreverandalwayspianoversionplatinumedition',
  'comeinwiththerainplatinumedition',
  'superstarplatinumedition',
  'theothersideofthedoorplatinumedition',
  'fearless',
  'fifteen',
  'lovestory',
  'heystephen',
  'whitehorse',
  'youbelongwithme',
  'breathe',
  'tellmewhy',
  'yourenotsorry',
  'thewayilovedyou',
  'foreverandalways',
  'thebestday',
  'change',
  'mine',
  'sparksfly',
  'backtodecember',
  'speaknow',
  'dearjohn',
  'mean',
  'the

In [7]:
# url to scrape the lyrics from
base_url = "https://www.azlyrics.com/lyrics/{}/{}.html"



In the code below I use Zyte's autoextract package in order to scrape the song lyrics from AZ-lyrics.

In [9]:
from autoextract.sync import request_raw

# creating the empty dictionary for artists, songs, and lyrics
lyrics_dict = {'lyrics': [], 'song': [], 'artist': []}

# looping through each artist
for artists in pop_songs_dict:
    
    # looping through each song
    for songs in pop_songs_dict[artists]:
        
        # changing the url each time with the next song and artist
        final_url = base_url.format(artists, songs)
        
         # the webscraping part. I use a try statement so 
        # that the loop does not stop if it fails to collect some lyrics
        try:
            query = [{
                'url': final_url,
                'pageType': 'article'
            }]
            results = request_raw(query, api_key='e5e16db9e885448eac36b4d6f130cd61')
            
            # getting just the article body (the lyrics themselves)
            lyric = results[0]['article']['articleBody']
            
            # splitting the lyrics into an array of lyric lines
            cleaned_array = lyric.split('\n')
            
            # appending the data to the dictionary
            lyrics_dict['lyrics'].append(cleaned_array)
            lyrics_dict['song'].append(songs)
            lyrics_dict['artist'].append(artists)
            
            # confirmation and update statement
            print("# lyrics acquired: ", len(lyrics_dict['song']))
        except: 
            print("Missed one, but I'll keep going...")



# lyrics acquired:  1
# lyrics acquired:  2
# lyrics acquired:  3
# lyrics acquired:  4
# lyrics acquired:  5
# lyrics acquired:  6
# lyrics acquired:  7
# lyrics acquired:  8
# lyrics acquired:  9
# lyrics acquired:  10
# lyrics acquired:  11
Missed one, but I'll keep going...
Missed one, but I'll keep going...
Missed one, but I'll keep going...
# lyrics acquired:  12
# lyrics acquired:  13
# lyrics acquired:  14
# lyrics acquired:  15
# lyrics acquired:  16
# lyrics acquired:  17
Missed one, but I'll keep going...
Missed one, but I'll keep going...
Missed one, but I'll keep going...
Missed one, but I'll keep going...
Missed one, but I'll keep going...
Missed one, but I'll keep going...
# lyrics acquired:  18
# lyrics acquired:  19
# lyrics acquired:  20
# lyrics acquired:  21
# lyrics acquired:  22
# lyrics acquired:  23
# lyrics acquired:  24
# lyrics acquired:  25
# lyrics acquired:  26
# lyrics acquired:  27
Missed one, but I'll keep going...
# lyrics acquired:  28
# lyrics acquir

In [10]:
# assigning the dictionary to a dataframe in order to take a quick look at the data
pop_lyrics_df = pd.DataFrame({'lyrics':lyrics_dict['lyrics'], 'song':lyrics_dict['song'], 'artist':lyrics_dict['artist']})

In [11]:
pop_lyrics_df

Unnamed: 0,lyrics,song,artist
0,"[He said the way my blue eyes shined, Put thos...",timmcgraw,taylorswift
1,"[State the obvious, I didn't get my perfect fa...",picturetoburn,taylorswift
2,"[Taylor Swift Lyrics, , ""Teardrops On My Guita...",teardropsonmyguitar,taylorswift
3,"[Taylor Swift Lyrics, , ""A Place In This World...",aplaceinthisworld,taylorswift
4,"[You have a way of coming easily to me, And wh...",coldasyou,taylorswift
...,...,...,...
6963,"[Sorry I'm not so merry, But I feel like this ...",loneliesttimeofyear,mabel
6964,"[Straight up, Tell me everything you've been t...",myboymytown,mabel
6965,"[[Yungen:], Young Mabes, Yungen, One shot, , L...",oneshotremix,mabel
6966,"[I'm thinking of you, I'm thinking of you, I'm...",thinkingofyou,mabel


Finally, I will save the DataFrame to a csv file so that I can import it later in another notebook for the data cleaning process.

In [12]:
pop_lyrics_df.to_csv(path_or_buf='Desktop/lyric_files/pop_lyrics.csv')