# Ultimate Guitar Scraping. Second part: extracting song URLs

In [15]:
from selenium import webdriver
from bs4 import BeautifulSoup
import time

class PageLinksExtractor:
    """Collect the song anchors from one fixed Ultimate Guitar "explore" page.

    The page is loaded in a headless Chrome driven by Selenium and the
    resulting HTML is parsed with BeautifulSoup.
    """

    # Explore page filtered with ``subgenres[]=24`` and ``type[]=Chords``.
    PAGE_URL = "https://www.ultimate-guitar.com/explore?&subgenres[]=24&type[]=Chords"
    # ``class`` attribute value used to select the song anchors. It looks
    # auto-generated, so it presumably changes when the site is redeployed.
    SONG_LINK_CLASS = "_2KJtL _1mes3 kWOod"

    def __init__(self):
        pass

    def get_all_filter_song_links(self):
        """Return the song ``<a>`` tags found on ``PAGE_URL``.

        Returns:
            The BeautifulSoup result of searching the rendered page for
            ``<a>`` tags with ``class`` equal to ``SONG_LINK_CLASS``.

        Raises:
            Any exception raised by the driver while loading the page or
            clicking the button; the browser is shut down before it propagates.
        """
        driver = self.create_chrome_driver()
        try:
            driver.get(self.PAGE_URL)
            self.click_on_accept_cookies(driver)
            soup = BeautifulSoup(driver.page_source, 'lxml')
        finally:
            # Always release the browser process; previously every call left
            # a headless Chrome running.
            driver.quit()

        return soup.find_all('a', {"class": self.SONG_LINK_CLASS})

    def click_on_accept_cookies(self, driver):
        """Click the ``<button>`` whose text contains "thanks".

        Presumably this dismisses the cookie-consent banner (per the method
        name). The lookup raises if no such button is present.
        """
        # The generic ``find_element(by, value)`` form is used instead of the
        # ``find_element_by_xpath`` shortcut, which newer Selenium releases
        # no longer provide. "xpath" is the value of ``By.XPATH``.
        button = driver.find_element("xpath", '//button[contains(text(), "thanks")]')
        button.click()

    def create_chrome_driver(self):
        """Build a headless, incognito Chrome driver.

        The driver is created from the ``./chromedriver`` path, relative to
        the working directory. The caller is responsible for calling
        ``quit()`` on the returned driver.
        """
        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--incognito')
        options.add_argument('--headless')

        return webdriver.Chrome("./chromedriver", options=options)
        

In [16]:
# Instantiate the fixed-URL extractor defined in the previous cell.
extractor = PageLinksExtractor()

In [18]:
# Load PageLinksExtractor.PAGE_URL in headless Chrome and collect the song anchors.
links = extractor.get_all_filter_song_links()

In [19]:
# Number of song anchors found on the page.
len(links)

50

We can see that 50 songs are shown per page. Let's try to extract each song's URL and name.

In [20]:
# Inspect the first anchor tag as parsed by BeautifulSoup.
links[0]

<a class="_2KJtL _1mes3 kWOod" href="https://tabs.ultimate-guitar.com/tab/elvis-presley/cant-help-falling-in-love-chords-1086983" state="primary">Cant Help Falling In Love</a>

In [24]:
# The anchor's href attribute holds the song URL.
links[0]['href']

'https://tabs.ultimate-guitar.com/tab/elvis-presley/cant-help-falling-in-love-chords-1086983'

In [31]:
# The anchor's first child is the song title text.
links[0].contents[0]

'Cant Help Falling In Love'

In [33]:
# (url, title) pair for every anchor collected above.
[(anchor['href'], anchor.contents[0]) for anchor in links]

[('https://tabs.ultimate-guitar.com/tab/elvis-presley/cant-help-falling-in-love-chords-1086983',
  'Cant Help Falling In Love'),
 ('https://tabs.ultimate-guitar.com/tab/the_beatles/yesterday_chords_17450',
  'Yesterday'),
 ('https://tabs.ultimate-guitar.com/tab/the_beatles/let_it_be_chords_60690',
  'Let It Be (ver\xa02)'),
 ('https://tabs.ultimate-guitar.com/tab/jason-mraz/im-yours-chords-373896',
  'Im Yours (ver\xa08)'),
 ('https://tabs.ultimate-guitar.com/tab/john-lennon/imagine-chords-9306',
  'Imagine'),
 ('https://tabs.ultimate-guitar.com/tab/the_cranberries/zombie_chords_844902',
  'Zombie'),
 ('https://tabs.ultimate-guitar.com/tab/elton-john/your-song-chords-29113',
  'Your Song'),
 ('https://tabs.ultimate-guitar.com/tab/the_beatles/hey_jude_chords_1061739',
  'Hey Jude (ver\xa06)'),
 ('https://tabs.ultimate-guitar.com/tab/kodaline/all-i-want-chords-1180259',
  'All I Want'),
 ('https://tabs.ultimate-guitar.com/tab/coldplay/viva-la-vida-chords-675427',
  'Viva La Vida'),
 ('ht

## Let's parametrise the URL

In [1]:
import jl_io as io

In [3]:
# Load the three filter lists with the project's jl_io helper.
genres, styles, decades = (
    io.from_file('../data/genres'),
    io.from_file('../data/styles'),
    io.from_file('../data/decades'),
)

In [4]:
# Display the loaded genre filters (each entry has 'code', 'name' and 'pattern').
genres

[{'code': '4', 'name': 'Rock', 'pattern': '&&genres[]=4'},
 {'code': '666', 'name': 'Folk', 'pattern': '&&genres[]=666'},
 {'code': '14', 'name': 'Pop', 'pattern': '&&genres[]=14'},
 {'code': '49', 'name': 'Country', 'pattern': '&&genres[]=49'},
 {'code': '16', 'name': 'Electronic', 'pattern': '&&genres[]=16'},
 {'code': '70', 'name': 'Rhythm And Blues', 'pattern': '&&genres[]=70'},
 {'code': '8', 'name': 'Metal', 'pattern': '&&genres[]=8'},
 {'code': '434', 'name': 'Contemporary R&b', 'pattern': '&&genres[]=434'},
 {'code': '1016', 'name': 'Religious Music', 'pattern': '&&genres[]=1016'},
 {'code': '45', 'name': 'Hip Hop', 'pattern': '&&genres[]=45'},
 {'code': '19', 'name': 'Reggae', 'pattern': '&&genres[]=19'},
 {'code': '84', 'name': 'Jazz', 'pattern': '&&genres[]=84'},
 {'code': '99', 'name': 'Blues', 'pattern': '&&genres[]=99'},
 {'code': '195', 'name': 'World Music', 'pattern': '&&genres[]=195'},
 {'code': '85', 'name': 'Disco', 'pattern': '&&genres[]=85'},
 {'code': '79', 'name

In [5]:
# Display the loaded style (subgenre) filters.
styles

[{'code': '24', 'name': 'Pop Rock', 'pattern': '&&subgenres[]=24'},
 {'code': '665', 'name': 'Singer-songwriter', 'pattern': '&&subgenres[]=665'},
 {'code': '3', 'name': 'Alternative Rock', 'pattern': '&&subgenres[]=3'},
 {'code': '9', 'name': 'Indie Rock', 'pattern': '&&subgenres[]=9'},
 {'code': '46', 'name': 'Indie Pop', 'pattern': '&&subgenres[]=46'},
 {'code': '1', 'name': 'Pop Punk', 'pattern': '&&subgenres[]=1'},
 {'code': '7', 'name': 'Hard Rock', 'pattern': '&&subgenres[]=7'},
 {'code': '813', 'name': 'Contemporary Folk', 'pattern': '&&subgenres[]=813'},
 {'code': '94', 'name': 'Folk Rock', 'pattern': '&&subgenres[]=94'},
 {'code': '1093', 'name': 'Folk Pop', 'pattern': '&&subgenres[]=1093'},
 {'code': '1087',
  'name': 'Contemporary Country',
  'pattern': '&&subgenres[]=1087'},
 {'code': '197', 'name': 'Country Pop', 'pattern': '&&subgenres[]=197'},
 {'code': '104', 'name': 'Electropop', 'pattern': '&&subgenres[]=104'},
 {'code': '82', 'name': 'Indie Folk', 'pattern': '&&subg

In [6]:
# Display the loaded decade filters.
decades

[{'code': '2010', 'name': '2010s', 'pattern': '&&decade[]=2010'},
 {'code': '2000', 'name': '2000s', 'pattern': '&&decade[]=2000'},
 {'code': '1990', 'name': '1990s', 'pattern': '&&decade[]=1990'},
 {'code': '1980', 'name': '1980s', 'pattern': '&&decade[]=1980'},
 {'code': '1970', 'name': '1970s', 'pattern': '&&decade[]=1970'},
 {'code': '1960', 'name': '1960s', 'pattern': '&&decade[]=1960'},
 {'code': '1950', 'name': '1950s', 'pattern': '&&decade[]=1950'}]

In [7]:
from selenium import webdriver
from bs4 import BeautifulSoup
import time

class ParametrisedPageLinkExtractor:
    """Collect the song anchors from a filtered Ultimate Guitar "explore" page.

    The genre, style and decade filters are query-string fragments that are
    appended verbatim to ``BASE_URL``. The page is loaded in a headless
    Chrome driven by Selenium and parsed with BeautifulSoup.
    """

    # Explore page restricted to ``type[]=Chords``; filters are appended to it.
    BASE_URL = "https://www.ultimate-guitar.com/explore?&type[]=Chords"
    # ``class`` attribute value used to select the song anchors. It looks
    # auto-generated, so it presumably changes when the site is redeployed.
    SONG_LINK_CLASS = "_2KJtL _1mes3 kWOod"

    def __init__(self):
        pass

    def get_all_filter_song_links(self, genreFilter, styleFilter, decadeFilter):
        """Return the song ``<a>`` tags for one (genre, style, decade) filter.

        Args:
            genreFilter: Query fragment for the genre, e.g. ``'&&genres[]=4'``.
            styleFilter: Query fragment for the style, e.g. ``'&&subgenres[]=24'``.
            decadeFilter: Query fragment for the decade, e.g. ``'&&decade[]=2010'``.

        Returns:
            The BeautifulSoup result of searching the rendered page for
            ``<a>`` tags with ``class`` equal to ``SONG_LINK_CLASS``.

        Raises:
            Any exception raised by the driver while loading the page or
            clicking the button; the browser is shut down before it propagates.
        """
        page_url = f'{self.BASE_URL}{genreFilter}{styleFilter}{decadeFilter}'

        driver = self.create_chrome_driver()
        try:
            driver.get(page_url)
            self.click_on_accept_cookies(driver)
            soup = BeautifulSoup(driver.page_source, 'lxml')
        finally:
            # Always release the browser process; previously every call left
            # a headless Chrome running.
            driver.quit()

        return soup.find_all('a', {"class": self.SONG_LINK_CLASS})

    def click_on_accept_cookies(self, driver):
        """Click the ``<button>`` whose text contains "thanks".

        Presumably this dismisses the cookie-consent banner (per the method
        name). The lookup raises if no such button is present.
        """
        # The generic ``find_element(by, value)`` form is used instead of the
        # ``find_element_by_xpath`` shortcut, which newer Selenium releases
        # no longer provide. "xpath" is the value of ``By.XPATH``.
        button = driver.find_element("xpath", '//button[contains(text(), "thanks")]')
        button.click()

    def create_chrome_driver(self):
        """Build a headless, incognito Chrome driver.

        The driver is created from the ``./chromedriver`` path, relative to
        the working directory. The caller is responsible for calling
        ``quit()`` on the returned driver.
        """
        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--incognito')
        options.add_argument('--headless')

        return webdriver.Chrome("./chromedriver", options=options)
        

In [9]:
# Try the parametrised extractor with the first genre, style and decade.
genre, style, decade = genres[0], styles[0], decades[0]

extractor = ParametrisedPageLinkExtractor()
links = extractor.get_all_filter_song_links(
    genre['pattern'],
    style['pattern'],
    decade['pattern'],
)

In [10]:
# (url, title) pair for every anchor returned for the selected filters.
[(anchor['href'], anchor.contents[0]) for anchor in links]

[('https://tabs.ultimate-guitar.com/tab/kodaline/all-i-want-chords-1180259',
  'All I Want'),
 ('https://tabs.ultimate-guitar.com/tab/imagine-dragons/radioactive-chords-1171909',
  'Radioactive'),
 ('https://tabs.ultimate-guitar.com/tab/a_great_big_world/say_something_chords_1443639',
  'Say Something (ver\xa03)'),
 ('https://tabs.ultimate-guitar.com/tab/onerepublic/counting-stars-chords-1233464',
  'Counting Stars'),
 ('https://tabs.ultimate-guitar.com/tab/imagine-dragons/believer-chords-1941491',
  'Believer (ver\xa02)'),
 ('https://tabs.ultimate-guitar.com/tab/imagine-dragons/demons-chords-1148110',
  'Demons'),
 ('https://tabs.ultimate-guitar.com/tab/lady-gaga/million-reasons-chords-1884102',
  'Million Reasons'),
 ('https://tabs.ultimate-guitar.com/tab/harry-styles/sign-of-the-times-chords-1977189',
  'Sign Of The Times'),
 ('https://tabs.ultimate-guitar.com/tab/kodaline/high-hopes-chords-1213220',
  'High Hopes (ver\xa02)'),
 ('https://tabs.ultimate-guitar.com/tab/imagine-dragons

## Consider pagination 

We have 50 results per page. Only 20 pages are shown when filtering, so we can get at most 1000 songs per `(genre, style, decade)` tuple.

In [13]:
# Page numbers to request: 1 through 20 inclusive.
list(range(1, 21))

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

In [30]:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
from functools import reduce

class PaginatedLinkExtractor:
    """Collect song anchors across every result page of a filtered search.

    For one (genre, style, decade) filter, pages ``1..MAX_NUM_PAGES`` of the
    Ultimate Guitar "explore" listing are loaded one by one in a headless
    Chrome and parsed with BeautifulSoup.
    """

    # Explore page restricted to ``type[]=Chords``; filters are appended to it.
    BASE_URL = "https://www.ultimate-guitar.com/explore?&type[]=Chords"
    # Number of result pages requested per filter combination.
    MAX_NUM_PAGES = 20
    # ``class`` attribute value used to select the song anchors. It looks
    # auto-generated, so it presumably changes when the site is redeployed.
    SONG_LINK_CLASS = "_2KJtL _1mes3 kWOod"

    def __init__(self):
        pass

    def get_all_filter_song_links(self, genreFilter, styleFilter, decadeFilter):
        """Return the song anchors of all pages as one flat list.

        Args:
            genreFilter: Query fragment for the genre, e.g. ``'&&genres[]=4'``.
            styleFilter: Query fragment for the style, e.g. ``'&&subgenres[]=24'``.
            decadeFilter: Query fragment for the decade, e.g. ``'&&decade[]=2010'``.

        Returns:
            A list with the anchors of page 1, then page 2, and so on up to
            ``MAX_NUM_PAGES``. The list is empty when ``MAX_NUM_PAGES`` is 0.
        """
        # Flatten with a nested comprehension: linear in the number of links
        # and well-defined for zero pages (``reduce`` without an initial value
        # raises on an empty sequence).
        return [
            link
            for page in range(1, self.MAX_NUM_PAGES + 1)
            for link in self.get_links_single_page(
                genreFilter, styleFilter, decadeFilter, f'&page={page}'
            )
        ]

    def get_links_single_page(self, genreFilter, styleFilter, decadeFilter, pageFilter):
        """Return the song ``<a>`` tags of a single result page.

        Args:
            genreFilter: Query fragment for the genre.
            styleFilter: Query fragment for the style.
            decadeFilter: Query fragment for the decade.
            pageFilter: Query fragment selecting the page, e.g. ``'&page=3'``.

        Returns:
            The BeautifulSoup result of searching the rendered page for
            ``<a>`` tags with ``class`` equal to ``SONG_LINK_CLASS``.

        Raises:
            Any exception raised by the driver while loading the page or
            clicking the button; the browser is shut down before it propagates.
        """
        # ``pageFilter`` must be part of the URL. It used to be dropped, so
        # every iteration fetched the first page again.
        page_url = f'{self.BASE_URL}{genreFilter}{styleFilter}{decadeFilter}{pageFilter}'

        driver = self.create_chrome_driver()
        try:
            driver.get(page_url)
            self.click_on_accept_cookies(driver)
            soup = BeautifulSoup(driver.page_source, 'lxml')
        finally:
            # One browser is started per page, so it must be closed here or
            # each run leaves MAX_NUM_PAGES Chrome processes behind.
            driver.quit()

        return soup.find_all('a', {"class": self.SONG_LINK_CLASS})

    def click_on_accept_cookies(self, driver):
        """Click the ``<button>`` whose text contains "thanks".

        Presumably this dismisses the cookie-consent banner (per the method
        name). The lookup raises if no such button is present.
        """
        # The generic ``find_element(by, value)`` form is used instead of the
        # ``find_element_by_xpath`` shortcut, which newer Selenium releases
        # no longer provide. "xpath" is the value of ``By.XPATH``.
        button = driver.find_element("xpath", '//button[contains(text(), "thanks")]')
        button.click()

    def create_chrome_driver(self):
        """Build a headless, incognito Chrome driver.

        The driver is created from the ``./chromedriver`` path, relative to
        the working directory. The caller is responsible for calling
        ``quit()`` on the returned driver.
        """
        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--incognito')
        options.add_argument('--headless')

        return webdriver.Chrome("./chromedriver", options=options)
        

In [31]:
# Run the paginated extractor for the first genre, style and decade.
genre, style, decade = genres[0], styles[0], decades[0]

extractor = PaginatedLinkExtractor()
links = extractor.get_all_filter_song_links(
    genre['pattern'],
    style['pattern'],
    decade['pattern'],
)

# (url, title) pair for every collected anchor.
all_songs = [(anchor['href'], anchor.contents[0]) for anchor in links]

In [41]:
def link_to_song_dict(link, genre, style, decade):
    """Build the song record for one anchor and the filters that produced it.

    Args:
        link: Anchor-like object; its first child is used as the song name
            and its ``'href'`` item as the URL.
        genre: Mapping whose ``"name"`` entry labels the genre.
        style: Mapping whose ``"name"`` entry labels the style.
        decade: Mapping whose ``"name"`` entry labels the decade.

    Returns:
        A dict with keys ``name``, ``url``, ``genre``, ``style`` and
        ``decade``, in that order.
    """
    song = {"name": link.contents[0], "url": link['href']}
    # Attach the human-readable label of each filter.
    for field, selected in (("genre", genre), ("style", style), ("decade", decade)):
        song[field] = selected["name"]
    return song
    

In [42]:
# Turn every anchor into a song record tagged with the selected filters.
all_songs = [link_to_song_dict(anchor, genre, style, decade) for anchor in links]

In [43]:
# Number of song records built from the collected anchors.
len(all_songs)

1000

In [44]:
# Inspect the first song record.
all_songs[0]

{'name': 'All I Want',
 'url': 'https://tabs.ultimate-guitar.com/tab/kodaline/all-i-want-chords-1180259',
 'genre': 'Rock',
 'style': 'Pop Rock',
 'decade': '2010s'}