In [3]:
import jl_io as io

In this file we store our private [Scraper API](https://www.scraperapi.com/) key. You can make 1,000 requests for free if you want to test it.

In [4]:
!ls ../data/private

scraper_api_key.pickle


In [5]:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
from functools import reduce
import time
import random

class ScraperApiSongExtractor:
    """Scrape song links from Ultimate Guitar's "explore" chord listing.

    Every request is routed through ScraperAPI and rendered with a headless
    Chrome driver. The driver is created in ``__init__``; call ``close()``
    (or use the instance as a context manager) to release it when done.
    """

    BASE_ULTIMATE_GUITAR_URL = "https://www.ultimate-guitar.com/explore?&type[]=Chords"
    MAX_NUM_PAGES = 20
    # Page source observed when the request comes back with an empty document;
    # get_links_single_page treats it as a denied request.
    EMPTY_PAGE_SOURCE = '<html><head></head><body></body></html>'
    # CSS classes carried by the song anchors in the listing markup.
    SONG_LINK_CLASS = "_2KJtL _1mes3 kWOod"

    def __init__(self, apiKey):
        """Build the ScraperAPI base URL for ``apiKey`` and start the Chrome driver."""
        random.seed()

        # NOTE(review): the target URL (and the filters appended to it later) is
        # passed to ScraperAPI un-encoded, so its own query parameters may be read
        # as ScraperAPI parameters -- confirm against the ScraperAPI docs.
        self.BASE_URL = f"http://api.scraperapi.com?api_key={apiKey}&url={self.BASE_ULTIMATE_GUITAR_URL}"
        self.driver = self.create_chrome_driver()

    def __enter__(self):
        """Return the extractor itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Release the browser on leaving a ``with`` block; exceptions propagate."""
        self.close()
        return False

    def close(self):
        """Quit the Chrome driver, if one is still open. Safe to call repeatedly."""
        driver = getattr(self, 'driver', None)
        if driver is not None:
            driver.quit()
            self.driver = None

    def get_all_songs(self, genre, style, decade):
        """Return one song dict per link found for the given filters.

        ``genre``, ``style`` and ``decade`` are mappings with a ``'pattern'``
        entry (the URL fragment used for filtering) and a ``'name'`` entry
        (copied into each resulting song dict).
        """
        links = self.get_all_filter_song_links(genre['pattern'], style['pattern'], decade['pattern'])
        return [self.link_to_song_dict(link, genre, style, decade) for link in links]

    def link_to_song_dict(self, link, genre, style, decade):
        """Convert one anchor element into a song record.

        Reads the anchor's first content node as the song name and its
        ``href`` attribute as the URL; the filter names are attached as-is.
        """
        return {
            "name": link.contents[0],
            "url": link['href'],
            "genre": genre["name"],
            "style": style["name"],
            "decade": decade["name"]
        }

    def get_all_filter_song_links(self, genreFilter, styleFilter, decadeFilter):
        """Return the links of pages 1..MAX_NUM_PAGES as one flat list, in page order."""
        return [
            link
            for page in range(1, self.MAX_NUM_PAGES + 1)
            for link in self.get_links_single_page(genreFilter, styleFilter, decadeFilter, f'&page={page}')
        ]

    def get_links_single_page(self, genreFilter, styleFilter, decadeFilter, pageFilter):
        """Load one listing page and return its song anchors.

        The filter fragments and ``pageFilter`` are appended to ``BASE_URL``
        in that order.

        Raises:
            Exception: if the driver ends up with an empty document.
        """
        # pageFilter must be part of the URL; without it every call fetches
        # the same first page and the combined result is full of duplicates.
        self.driver.get(f'{self.BASE_URL}{genreFilter}{styleFilter}{decadeFilter}{pageFilter}')

        page_source = self.driver.page_source
        if page_source == self.EMPTY_PAGE_SOURCE:
            raise Exception('Denegation error')

        return self._parse_song_links(page_source)

    def _parse_song_links(self, page_source):
        """Return every ``<a>`` element in ``page_source`` carrying the song-link classes."""
        soup = BeautifulSoup(page_source, 'lxml')
        return soup.find_all('a', {"class": self.SONG_LINK_CLASS})

    def create_chrome_driver(self):
        """Create a headless, incognito Chrome driver that ignores certificate errors."""
        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--incognito')
        options.add_argument('--headless')

        # NOTE(review): passing the chromedriver path positionally depends on the
        # installed Selenium version -- confirm before upgrading Selenium.
        driver = webdriver.Chrome("./chromedriver", options=options)
        return driver


In [6]:
# Load the genre, style and decade filter definitions saved under ../data.
# Entries are later passed to get_all_songs, which reads their 'name' and
# 'pattern' keys.
genres = io.from_file('../data/genres')
styles = io.from_file('../data/styles')
decades = io.from_file('../data/decades')

In [41]:
# Build the extractor with the private ScraperAPI key read from disk, so the key
# never appears in the notebook. Constructing it also starts a Chrome driver.
# NOTE(review): the key file is a .pickle -- presumably io.from_file unpickles it;
# only load pickle files you trust.
extractor = ScraperApiSongExtractor(io.from_file('../data/private/scraper_api_key.pickle'))

In [44]:
# Scrape the listing for the first genre / style / decade combination.
# This issues one request per page (MAX_NUM_PAGES in total) through ScraperAPI.
songs = extractor.get_all_songs(genres[0],styles[0],decades[0])

In [43]:
# Inspect the scraped records: one dict per song link, with name, url, genre,
# style and decade.
songs

[{'name': 'All I Want',
  'url': 'https://tabs.ultimate-guitar.com/tab/kodaline/all-i-want-chords-1180259',
  'genre': 'Rock',
  'style': 'Pop Rock',
  'decade': '2010s'},
 {'name': 'Radioactive',
  'url': 'https://tabs.ultimate-guitar.com/tab/imagine-dragons/radioactive-chords-1171909',
  'genre': 'Rock',
  'style': 'Pop Rock',
  'decade': '2010s'},
 {'name': 'Say Something (ver\xa03)',
  'url': 'https://tabs.ultimate-guitar.com/tab/a_great_big_world/say_something_chords_1443639',
  'genre': 'Rock',
  'style': 'Pop Rock',
  'decade': '2010s'},
 {'name': 'Counting Stars',
  'url': 'https://tabs.ultimate-guitar.com/tab/onerepublic/counting-stars-chords-1233464',
  'genre': 'Rock',
  'style': 'Pop Rock',
  'decade': '2010s'},
 {'name': 'Million Reasons',
  'url': 'https://tabs.ultimate-guitar.com/tab/lady-gaga/million-reasons-chords-1884102',
  'genre': 'Rock',
  'style': 'Pop Rock',
  'decade': '2010s'},
 {'name': 'Believer (ver\xa02)',
  'url': 'https://tabs.ultimate-guitar.com/tab/ima