## Scrape of Lyrics from Top Tracks of Top Artists

### Imports

In [1]:
import pandas as pd
import re
from multiprocessing import cpu_count


from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from webdriver_manager.firefox import GeckoDriverManager
from joblib import Parallel, delayed
from os.path import isfile

import time
import json
import warnings
warnings.filterwarnings('ignore')


#### Driver Options

In [2]:
# Install geckodriver through webdriver_manager and wrap the returned path in a
# Firefox service object.
# NOTE(review): PATH is not referenced by any other code visible in this
# notebook -- confirm whether it was meant to be passed to webdriver.Firefox.
PATH=FirefoxService(GeckoDriverManager().install())
# Firefox executable handed to every driver instance created by the scraper.
binary = FirefoxBinary(r'C:\Program Files\Mozilla Firefox\Firefox.exe')

# Browser options shared by all driver instances; the headless and incognito
# settings below are currently disabled.
opciones=Options()
#opciones.headless = True
#opciones.add_argument('--incognito')

[WDM] - Downloading: 16.9kB [00:00, 17.2MB/s]                   


### Load artist file and clean data

In [3]:
# Read the JSON mapping that is later indexed by artist name to find each
# artist's top-tracks parquet file.
with open('../data/tops/_paths.json') as paths_file:
    artist_paths = json.load(paths_file)

In [4]:
# Rename keys that contain accents, commas or ampersands to plain-ASCII,
# underscore-separated names. The artist key is reused both as the lyrics file
# name and as the Genius URL slug, where underscores become hyphens and every
# other non-alphanumeric character (including spaces) is deleted. A key with
# spaces would therefore collapse into one word and never match a Genius page,
# so 'Simon_and_Garfunkel' is used rather than 'Simon and Garfunkel'.
for old_name, new_name in (
    ('Beyoncé', 'Beyonce'),
    ('Earth,_Wind_&_Fire', 'Earth_Wind_And_Fire'),
    ('Simon_&_Garfunkel', 'Simon_and_Garfunkel'),
):
    # Guarded so that re-running the cell after a successful rename is a no-op
    # instead of raising KeyError.
    if old_name in artist_paths:
        artist_paths[new_name] = artist_paths.pop(old_name)

## Main function together with helper functions

#### Lyrics extracted from Genius (the AZLyrics alternative is commented out)

In [6]:
# Disabled AZLyrics alternative, kept for reference:
# url = 'https://www.azlyrics.com/lyrics/'
# lyricAZ_XPATH = '/html/body/div[2]/div[2]/div[2]/div[5]'
# Shared record of failures: maps an artist to the last track whose lyrics
# could not be scraped (each new failure overwrites the previous entry).
fail_dic = {}
# Prefix of every song-page URL built by the scraper.
url2 = 'https://genius.com/'
# Absolute XPath of the page element whose text is read as the song lyrics.
genius_XPATH = '/html/body/div[1]/main/div[2]/div[2]/div[2]/div/div[2]' #Example: https://genius.com/Beyonce-drunk-in-love-lyrics

def lyric_path(artist): # Builds the target path of an artist's lyrics file
    """Return the relative path of the JSON lyrics file for ``artist``."""
    path_parts = ('../data/lyrics/', artist, '.json')
    return ''.join(path_parts)

def load_lyrics(artist): # Loads the stored lyrics data of an artist
    """Return the lyrics record saved for ``artist``.

    When the artist's lyrics file does not exist yet, a fresh record with
    empty 'songs_considered' and 'distinct_words' lists is returned instead.
    """
    if not isfile(lyric_path(artist)):
        return {'songs_considered' : [] , 'distinct_words': []}
    with open(lyric_path(artist), 'r') as lyrics_file:
        return json.load(lyrics_file)

def initialize_dataframe(df,artist): # Prepares the dataframe and skips work already done
    """Select the tracks of ``artist`` that still need a unique-word count.

    If ``df`` has no 'Unique_Word_Count' column, the column is added to ``df``
    in place (filled with ``pd.NA``) and every row is considered pending.
    Otherwise only the rows whose count is missing are kept.

    Returns a ``(pending_rows, lyrics_record)`` tuple, or ``None`` when the
    column exists and no row is missing a count -- callers must check for
    ``None`` before unpacking.
    """
    if 'Unique_Word_Count' not in df.columns:
        # First run for this artist: nothing has been counted yet.
        df['Unique_Word_Count'] = pd.NA
        return df , load_lyrics(artist)

    pending = df[df['Unique_Word_Count'].isna()]
    if len(pending) == 0 :
        print(artist, 'has already been fully uploaded')
        return None
    return pending , load_lyrics(artist)


def song_lyric_count(artist):
    '''
    Scrape the Genius lyrics of every pending track of ``artist`` and count
    the unique words of each song.

    For each track that still lacks a count, the song page is opened in a new
    Firefox instance, the text of the lyrics element is split on whitespace,
    possessive "'s" suffixes and non-alphanumeric characters are removed, and
    the distinct remaining words are collected. The track name and its word
    list are appended to the artist's lyrics record, and the number of
    distinct words is stored in the 'Unique_Word_Count' column.

    Side effects: rewrites the artist's parquet file (all rows, including the
    ones counted on earlier runs), rewrites the artist's lyrics JSON file, and
    records the last failed track of the artist in ``fail_dic``.

    Returns False when exactly 10 tracks failed (presumably the whole top
    list -- confirm the list size), True otherwise, including when the artist
    was already fully processed.
    '''
    all_hits = pd.read_parquet(artist_paths[artist])
    prepared = initialize_dataframe(all_hits, artist)
    if prepared is None:
        # Every track already has a count; nothing to scrape or rewrite.
        return True
    pending_hits, total_unique_lyrics = prepared
    if 'Unique_Word_Count' not in all_hits.columns:
        all_hits['Unique_Word_Count'] = pd.NA
    path_to_lyrics = lyric_path(artist)

    # Underscores become hyphens; any other non-alphanumeric character is dropped.
    artist_slug = re.sub('[_]', '-', re.sub('[^A-Za-z0-9-_]+', '', artist)).lower()

    # Keep the real row label of each pending track so the count is written to
    # the matching row of the full frame, even when earlier rows were already
    # counted and filtered out.
    song_urls = []
    for row_label, track in pending_hits['Track_Name'].items():
        track_slug = re.sub('[_ ]', '-', re.sub('[^A-Za-z0-9-_ ]+', '', track)).lower()
        song_urls.append((url2 + artist_slug + '-' + track_slug + '-lyrics', track, row_label))

    fail_counter = 0
    for song_url, track, row_label in song_urls:
        driver = webdriver.Firefox(firefox_binary=binary,options = opciones)
        try:
            driver.get(song_url)
            time.sleep(2)  # give the page time to render the lyrics
            lyrics = driver.find_element(By.XPATH, genius_XPATH).text.split()
            unique_lyrics = set()
            for token in lyrics:
                word = re.sub('[^A-Za-z0-9]+', '', re.sub('\'s', '', token))
                # Tokens made only of punctuation clean down to '', which is
                # not a word and must not be counted.
                if word:
                    unique_lyrics.add(word)

            total_unique_lyrics['songs_considered'].append(track)
            total_unique_lyrics['distinct_words'].append(list(unique_lyrics))

            all_hits.at[row_label, 'Unique_Word_Count'] = len(unique_lyrics)
            print('Uploaded: ',artist, track )
        except Exception:
            # Best effort: a missing page or element only skips this track.
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            # NOTE: only the last failed track per artist is kept in fail_dic.
            fail_dic[artist] = track
            fail_counter +=1
            print('Search failed: ',artist, track )
        finally:
            # Always close the browser, whether the scrape succeeded or not.
            driver.quit()

    # Persist the full frame so rows counted on earlier runs are not lost.
    all_hits.to_parquet(artist_paths[artist],index=False)
    with open(path_to_lyrics, 'w') as fp:
        json.dump(total_unique_lyrics, fp)

    print('Uploaded', artist, 'with',fail_counter, 'fails')
    if fail_counter == 10:
        return False
    return True

In [7]:
def main(cpus): # Parallelized main function
    """Run ``song_lyric_count`` for every artist key in ``artist_paths``.

    The calls are dispatched through a joblib pool of ``cpus`` workers that
    prefers threads. The per-artist return values are discarded.
    """
    worker_pool = Parallel(n_jobs=cpus, verbose=False,prefer="threads")
    scrape_jobs = (delayed(song_lyric_count)(artist_name) for artist_name in artist_paths)
    worker_pool(scrape_jobs)

In [8]:
# Use four workers fewer than the CPU count, but never fewer than two.
worker_count = max(cpu_count() - 4, 2)
main(worker_count)

Uploaded:  The_Beatles Come Together
Uploaded:  Elvis_Presley Can't Help Falling in Love
Uploaded:  Queen Bohemian Rhapsody
Uploaded:  Elton_John Your Song
Uploaded:  Rihanna Umbrella
Uploaded:  Madonna Hung Up
Uploaded:  Led_Zeppelin Stairway to Heaven
Uploaded:  Michael_Jackson Billie Jean
Uploaded:  Rihanna Don't Stop the Music
Uploaded:  The_Beatles Let It Be
Uploaded:  Elton_John Tiny Dancer
Uploaded:  Madonna Like a Prayer
Uploaded:  Queen Another One Bites the Dust
Uploaded:  Elvis_Presley Jailhouse Rock
Uploaded:  Led_Zeppelin Immigrant Song
Uploaded:  Rihanna Disturbia
Uploaded:  Madonna Material Girl
Uploaded:  The_Beatles Eleanor Rigby
Uploaded:  Queen Don't Stop Me Now
Uploaded:  Michael_Jackson Beat It
Uploaded:  Elton_John I'm Still Standing
Uploaded:  Elvis_Presley Suspicious Minds
Uploaded:  Led_Zeppelin Whole Lotta Love
Uploaded:  The_Beatles Yesterday
Uploaded:  Queen We Will Rock You
Uploaded:  Madonna Like a Virgin
Uploaded:  Led_Zeppelin Black Dog
Uploaded:  Elton_