## Scraping the top 10 songs of each artist from Last.fm

### Imports

In [17]:
import pandas as pd

from multiprocessing import cpu_count


from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from webdriver_manager.firefox import GeckoDriverManager
from joblib import Parallel, delayed
from os.path import isfile

import time
import json
import warnings
warnings.filterwarnings('ignore')


#### Driver Options

In [18]:
# Geckodriver service object; webdriver_manager fetches the driver on install()
# (see the "[WDM] - Downloading" output of this cell).
# NOTE(review): PATH is never passed to webdriver.Firefox(...) in top_10_scrapper —
# confirm whether the driver is meant to be started with it.
PATH=FirefoxService(GeckoDriverManager().install())
# Firefox executable handed to every driver instance (hard-coded Windows install path).
binary = FirefoxBinary(r'C:\Program Files\Mozilla Firefox\Firefox.exe')

# Browser options shared by every driver instance; both tweaks below are currently disabled.
opciones=Options()
#opciones.headless = True
#opciones.add_argument('--incognito')

[WDM] - Downloading: 16.9kB [00:00, 17.3MB/s]                   


#### Driver Shorthand *(Deprecated)*

In [19]:
# #find element
# f = lambda x, y=driver : y.find_element(By.XPATH, x)
# #find elements
# fs = lambda x, y=driver : y.find_elements(By.XPATH, x)
# #find and prepare to click
# fc = lambda x, y=driver : y.find_element(By.XPATH, x).click()

#### Scraping Utilities

In [20]:
# Base URL of an artist page; the scraper appends the artist name with '+' between words.
url = 'https://www.last.fm/music/'

# XPath selectors used by the scraper
# Button clicked to dismiss the cookie banner
cookies_XPATH = '//*[@id="onetrust-accept-btn-handler"]'
# Second button in the header of the top-tracks section, clicked first to change the period
change_period_XPATH = '//*[@id="top-tracks"]/div[1]/button[2]'
# Sixth entry of the top-tracks sort list, clicked after the period button
# (presumably the "all time" option — confirm against the live page)
all_songs_XPATH = '//*[@id="top-tracks-sort"]/li[6]/a'
# Every <tr> inside the body of the top-tracks table
row_XPATH = '//*[@id="top-tracks"]/div[2]/table/tbody//tr'

In [21]:
# Cleaning functions (for use in the scraper)
def trim_row(row:list):
    '''
    Helper for the scraper.

    Keeps only the cells of a table row that end up in the dataframe:
    positions 0, 4 and 7, returned in that order as a new list.
    Raises IndexError when the row has fewer than 8 cells.
    '''
    wanted_positions = (0, 4, 7)
    return [row[position] for position in wanted_positions]


def row_clean(row:list):
    '''
    Helper for the scraper.

    Converts the three text cells of a trimmed row into typed values:
    the first cell becomes an int, the second is kept as-is, and the
    third has its thousands separators (commas) removed before its
    first whitespace-separated token is converted to an int.
    '''
    rank = int(row[0])
    track_name = row[1]
    # e.g. '1,234,567 listeners' -> '1234567 listeners' -> 1234567
    listeners_text = row[2].replace(',', '')
    listeners = int(listeners_text.split()[0])
    return [rank, track_name, listeners]

### Scraper

In [22]:
# Shared state filled in by the scraper
top_10_paths = {}  # artist name -> path of its parquet file
fail_dict = {}  # artists whose scrape was given up on after repeated failed attempts
cols = ['Rank', 'Track_Name', 'Listeners']  # column names of the per-artist dataframe


def top_10_scrapper(artist:str, error_count = 0):
    '''
    Scrapes the artist's last.fm page for its top-tracks table and saves
    the result to parquet.

    Method:
    1) Navigates the website (accepts cookies, switches the table period)
    2) Cleans the table rows and loads them into a pandas dataframe
    3) Exports to .parquet; the file path is recorded in top_10_paths

    Parameters:
    artist      -- artist name; '/' and ' ' are replaced by '_' for the
                   file name, and words are joined with '+' for the URL
    error_count -- number of cookie-banner failures already counted; the
                   scrape is abandoned once this reaches 3

    Returns None. An artist whose file already exists is skipped. An artist
    whose cookie banner could not be clicked in time is recorded in
    fail_dict. Any other error (page load, missing element, unexpected row
    layout) propagates to the caller, but the browser is always closed first.
    '''

    artist = artist.replace('/',' ') # '/' (e.g. AC/DC) cannot be part of a file name
    artist = artist.replace(' ','_') # For filemaking
    art = url+artist.replace('_','+') # URL search

    path = '../data/tops/' + artist + '.parquet' # Path
    top_10_paths[artist] = path # Store path

    if isfile(path):
        print(artist, 'already exists in database')
        return

    while True:
        # A fresh browser per attempt; the finally clause below guarantees it
        # is closed on every exit path so no Firefox process is left behind.
        driver = webdriver.Firefox(firefox_binary=binary,options = opciones)
        try:
            driver.get(art)
            # Give the page (and its cookie banner) time to load
            time.sleep(7)
            try:
                # Accept cookies
                driver.find_element(By.XPATH, cookies_XPATH).click()
            except Exception:
                # Exception (not a bare except) so Ctrl-C still interrupts the run
                error_count += 1
                if error_count < 3:
                    print('cookie click error for artist: ',artist)
                    continue # retry with a new browser
                print('Error handling (cookies): ', artist)
                fail_dict[artist] = 1
                return

            # Switch the table period (presumably to "all time" — see all_songs_XPATH)
            time.sleep(1)
            driver.find_element(By.XPATH, change_period_XPATH).click()
            time.sleep(1)
            driver.find_element(By.XPATH, all_songs_XPATH).click()
            time.sleep(1)

            # Read every table row: keep the wanted cells, take their text, clean it
            elements = [
                row_clean([cell.text for cell in trim_row(table_row.find_elements(By.XPATH, 'td'))])
                for table_row in driver.find_elements(By.XPATH, row_XPATH)
            ]
            time.sleep(2)
        finally:
            driver.quit()
        break

    # Insert into dataframe and add artist name for safekeeping
    df = pd.DataFrame(elements,columns=cols)
    df['Artist'] = artist

    # To parquet for safekeeping
    df.to_parquet( path ,index=False)

    print('Uploaded ' + artist)
    return

In [23]:
# Load the 'Artist' column of the artist list; these are the names to scrape.
# Forward slashes keep the relative path valid on every OS and match the
# other data paths used in this notebook ('../data/tops/...').
artists = pd.read_parquet('../data/wikitop100list/artists.parquet')['Artist']
artists.head()

0        The Beatles
1      Elvis Presley
2    Michael Jackson
3         Elton John
4              Queen
Name: Artist, dtype: object

In [24]:
def main(cpus):
    '''
    Runs top_10_scrapper once for every entry of `artists`, spread over
    `cpus` joblib workers with the thread-based backend preferred.
    The scraper results are not collected; nothing is returned.
    '''
    pool = Parallel(n_jobs=cpus, verbose=False, prefer="threads")
    scrape_jobs = (delayed(top_10_scrapper)(name) for name in artists)
    pool(scrape_jobs)

In [25]:
# Use all but four of the machine's CPUs as workers, but never fewer than two.
main(max(cpu_count()-4,2))

The_BeatlesMichael_Jackson already exists in database
Elvis_Presley already exists in database
Elton_John already exists in database
Queen already exists in database
Madonna already exists in database
Led_Zeppelin already exists in database
Rihanna already exists in database
Pink_Floyd already exists in database
Eminem already exists in database
 already exists in database
Mariah_Carey already exists in database
Taylor_Swift already exists in database
Beyoncé already exists in database
Whitney_Houston already exists in database
Eagles already exists in database
Celine_Dion already exists in database
The_Rolling_Stones already exists in database
AC_DC already exists in database
Drake already exists in database
Garth_Brooks already exists in database
Kanye_West already exists in database
Billy_Joel already exists in database
Justin_Bieber already exists in database
Ed_Sheeran already exists in database
Bruno_Mars already exists in database
Bruce_Springsteen already exists in database
Fra

In [26]:
# Persist the artist -> parquet path mapping next to the data for easy access later
with open('../data/tops/_paths.json', 'w') as fp:
    serialized_paths = json.dumps(top_10_paths)
    fp.write(serialized_paths)