# Extracting more data 

In [11]:
import jl_io as io
from selenium import webdriver
from bs4 import BeautifulSoup
from functools import reduce
import time
import random
import os
import uuid
import pandas as pd 

class SongData:
    """Table of scraped songs backed by a pandas DataFrame.

    A freshly created table has the columns ``url``, ``name``, ``genre``,
    ``decade``, ``chords`` and ``uuid``.  ``chords`` holds the ``str()`` of a
    list of chord names; it stays empty (NaN) until ``add_details`` is called
    for the song's URL.
    """

    # Field separator used for the CSV files read and written by this class.
    SEPARATOR = '^'

    def __init__(self, initial_data_path=None, df=None):
        """Create the table.

        Args:
            initial_data_path: Optional CSV path.  It is loaded only when the
                file exists; otherwise the table starts empty.
            df: Optional DataFrame used as-is (not copied).  When given, it
                takes precedence over ``initial_data_path``.
        """
        self.df = pd.DataFrame(data=[], columns=['url', 'name', 'genre', 'decade', 'chords', 'uuid'])

        if initial_data_path is not None and os.path.isfile(initial_data_path):
            self.df = pd.read_csv(initial_data_path, sep=self.SEPARATOR)

        if df is not None:
            self.df = df

    def add_basic_data(self, basic_data):
        """Append one or more songs to the table.

        Args:
            basic_data: A dict describing one song (column name -> value), a
                DataFrame of songs, or an iterable of such dicts.
        """
        if isinstance(basic_data, pd.DataFrame):
            new_rows = basic_data
        elif isinstance(basic_data, dict):
            new_rows = pd.DataFrame([basic_data])
        else:
            new_rows = pd.DataFrame(list(basic_data))

        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        self.df = pd.concat([self.df, new_rows], ignore_index=True)

    def add_details(self, details):
        """Store the chords and uuid for every row whose url matches.

        Args:
            details: Dict with the keys ``url``, ``chords`` and ``uuid``.  The
                chords value is stored as its ``str()`` representation.
        """
        matches = self.df['url'] == details["url"]
        new_values = (("chords", str(details["chords"])), ("uuid", details["uuid"]))

        for column, value in new_values:
            # A column holding only NaN may have a float dtype (e.g. after
            # read_csv); writing a string into it is deprecated in recent
            # pandas, so make the column object-typed first.
            if not pd.api.types.is_object_dtype(self.df[column]):
                self.df[column] = self.df[column].astype(object)
            self.df.loc[matches, column] = value

    def has_basic_data(self, url):
        """Return whether any row has the given url."""
        return (self.df['url'] == url).any()

    def has_chords(self, url):
        """Return whether any row with the given url has a non-null chords value."""
        return ((self.df['url'] == url) & (self.df['chords'].notnull())).any()

    def get_chords(self, url):
        """Return the chord list stored for the first row matching ``url``.

        Raises:
            KeyError: If no row has the given url.
            ValueError: If the stored value is not a Python literal (for
                example when the chords have not been extracted yet).
        """
        import ast  # local import: the file's import cell does not provide ast

        stored = self.df.loc[self.df['url'] == url, 'chords']
        if stored.empty:
            raise KeyError(url)

        # Select by position: the matching row's index label is rarely 0.
        # SECURITY: the text comes from a CSV on disk, so it is parsed with
        # ast.literal_eval (literals only) instead of eval.
        return ast.literal_eval(stored.iloc[0])

    def has_genre_and_decade(self, genre, decade):
        """Return whether any row has both the given genre and decade."""
        return ((self.df['genre'] == genre) & (self.df['decade'] == decade)).any()

    def save(self, path):
        """Write the table to ``path`` as CSV, without the index column."""
        self.df.to_csv(path, index=False, sep=self.SEPARATOR)

class ChordExtractor:
    """Downloads a song page with headless Chrome and extracts its chords.

    The full HTML of every visited page is also saved under
    ``raw_html_output_directory`` as ``<uuid>.html``.
    """

    # Page source seen when the request was refused; treated as an error.
    BLOCKED_PAGE_SOURCE = '<html><head></head><body></body></html>'

    # Zero-based position of the <article> element the chords are read from.
    # NOTE(review): presumably the tab body in the site's layout - confirm.
    CHORDS_ARTICLE_INDEX = 3

    def __init__(self, raw_html_output_directory):
        """Start the browser and make sure the output directory exists.

        Args:
            raw_html_output_directory: Directory that receives the raw HTML
                dumps; created (with parents) when missing.
        """
        self.raw_html_output_directory = raw_html_output_directory
        self.driver = self.create_chrome_driver()
        # The cookie banner is only dismissed on the first page load.
        self.first_time = True

        os.makedirs(self.raw_html_output_directory, exist_ok=True)

    def extract_song_data(self, url):
        """Load ``url``, save its HTML and return the extracted chords.

        Returns:
            Dict with ``url``, ``chords`` (inner HTML of each chord span, in
            document order) and ``uuid`` (name of the saved HTML file without
            its extension).

        Raises:
            Exception: If the page comes back empty ('Denegation error').
            IndexError: If the page has too few <article> elements.
        """
        chords_spans = self.get_chord_spans(url)
        chords = [span.decode_contents() for span in chords_spans]

        song_uuid = str(uuid.uuid4())
        html_path = os.path.join(self.raw_html_output_directory, f"{song_uuid}.html")
        # With data, as with the pig, nothing goes to waste: keep the raw page.
        # Explicit UTF-8 so non-ASCII pages do not fail on platforms whose
        # default encoding cannot represent them.
        with open(html_path, "w", encoding="utf-8") as file:
            file.write(self.driver.page_source)

        return {
            "url": url,
            "chords": chords,
            "uuid": song_uuid
        }

    def get_chord_spans(self, url):
        """Navigate to ``url`` and return the spans selected as chords.

        The spans are those with the inline style ``color: rgb(0, 0, 0);``
        inside the article at ``CHORDS_ARTICLE_INDEX``.
        """
        self.driver.get(url)

        if self.driver.page_source == self.BLOCKED_PAGE_SOURCE:
            raise Exception('Denegation error')

        if self.first_time:
            self.click_on_accept_cookies()
            self.first_time = False

        soup = BeautifulSoup(self.driver.page_source, 'lxml')

        articles = soup.findAll('article')
        if len(articles) <= self.CHORDS_ARTICLE_INDEX:
            # Same exception type as plain indexing, but says which page failed.
            raise IndexError(
                f'list index out of range: expected at least '
                f'{self.CHORDS_ARTICLE_INDEX + 1} <article> elements, '
                f'found {len(articles)} at {url}')

        article = articles[self.CHORDS_ARTICLE_INDEX]
        return article.findAll('span', {"style": "color: rgb(0, 0, 0);"})

    def click_on_accept_cookies(self):
        """Best-effort click on the cookie banner button; never raises."""
        try:
            # find_element(by, value) exists in Selenium 3 and 4, whereas
            # find_element_by_xpath was removed in Selenium 4.3.
            button = self.driver.find_element('xpath', '//button[contains(text(), "thanks")]')
            button.click()
        except Exception:
            # Exception (not a bare except) so Ctrl-C still interrupts the run.
            print('cookies banner not found. Ignored')

    def create_chrome_driver(self):
        """Return a headless, incognito Chrome driver using ./chromedriver."""
        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--incognito')
        options.add_argument('--headless')

        driver = webdriver.Chrome("./chromedriver", options=options)
        return driver

class LinkExtractor:
    """Collects song links from the site's "explore" listing pages."""

    BASE_URL = "https://www.ultimate-guitar.com/explore?&type[]=Chords"

    # Page source seen when the request was refused; treated as an error.
    BLOCKED_PAGE_SOURCE = '<html><head></head><body></body></html>'

    def __init__(self):
        self.driver = self.create_chrome_driver()
        # The cookie banner is only dismissed on the first page load.
        self.first_time = True

    def get_all_songs(self, genre, decade, first_page, last_page):
        """Return one song dict per link found for a genre/decade pair.

        Args:
            genre: Dict with ``name`` and ``pattern`` (URL query fragment).
            decade: Dict with ``name`` and ``pattern`` (URL query fragment).
            first_page: First listing page to visit (inclusive).
            last_page: Last listing page to visit (inclusive).
        """
        links = self.get_all_filter_song_links(genre['pattern'], decade['pattern'], first_page, last_page)
        return [self.link_to_song_dict(link, genre, decade) for link in links]

    def link_to_song_dict(self, link, genre, decade):
        """Build the basic song record from a link element."""
        return {
            "name": link.contents[0],
            "url": link['href'],
            "genre": genre["name"],
            "decade": decade["name"]
        }

    def get_all_filter_song_links(self, genreFilter, decadeFilter, first_page, last_page):
        """Return the links of pages ``first_page``..``last_page`` as one flat list.

        An empty page range yields an empty list.
        """
        # A nested comprehension instead of reduce(): reduce() without an
        # initial value raises TypeError when there are no pages to visit.
        return [
            link
            for page in range(first_page, last_page + 1)
            for link in self.get_links_single_page(genreFilter, decadeFilter, f'&page={page}')
        ]

    def get_links_single_page(self, genreFilter, decadeFilter, pageFilter):
        """Load one listing page and return its song link elements.

        Raises:
            Exception: If the page comes back empty ('Denegation error').
        """
        self.driver.get(f'{self.BASE_URL}{genreFilter}{decadeFilter}{pageFilter}')

        if self.driver.page_source == self.BLOCKED_PAGE_SOURCE:
            raise Exception('Denegation error')

        if self.first_time:
            self.click_on_accept_cookies()
            self.first_time = False

        soup = BeautifulSoup(self.driver.page_source, 'lxml')

        # Song links are selected by the CSS classes found in the page markup.
        return soup.findAll('a', {"class": "_2KJtL _1mes3 kWOod"})

    def click_on_accept_cookies(self):
        """Best-effort click on the cookie banner button; never raises."""
        try:
            # find_element(by, value) exists in Selenium 3 and 4, whereas
            # find_element_by_xpath was removed in Selenium 4.3.
            button = self.driver.find_element('xpath', '//button[contains(text(), "thanks")]')
            button.click()
        except Exception:
            # Exception (not a bare except) so Ctrl-C still interrupts the run.
            print('cookies banner not found. Ignored')

    def create_chrome_driver(self):
        """Return a headless, incognito Chrome driver using ./chromedriver."""
        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--incognito')
        options.add_argument('--headless')

        driver = webdriver.Chrome("./chromedriver", options=options)
        return driver

class ChordScraper:
    """Drives the scraping of every genre/decade combination.

    For each combination the song links are collected with the link
    extractor, stored in ``song_data`` and then completed with the chords
    returned by the chord extractor.
    """

    def __init__(self, linkExtractor, chordExtractor, genres, decades, song_data):
        """Store the collaborators and build the list of combinations.

        Args:
            linkExtractor: Object providing ``get_all_songs``.
            chordExtractor: Object providing ``extract_song_data``.
            genres: Iterable of dicts with at least a ``name`` key.
            decades: Iterable of dicts with at least a ``name`` key.
            song_data: Dataset that receives the scraped songs.
        """
        self.linkExtractor = linkExtractor
        self.chordExtractor = chordExtractor
        self.genres = genres
        self.decades = decades
        self.song_data = song_data

        # Decade-major order: every genre of the first decade, then the next.
        self.combinations = [
            {"decade": decade, "genre": genre}
            for decade in self.decades
            for genre in self.genres
        ]

    def extract(self, first_page, last_page):
        """Scrape all combinations, starting from the first one."""
        return self.extract_from(1, first_page, last_page)

    def extract_from(self, startIndexBase1, first_page, last_page):
        """Scrape the combinations from the 1-based position ``startIndexBase1``.

        Combinations that already have songs in the dataset are skipped.  Any
        error is reported on stdout and re-raised, which stops the run.

        Args:
            startIndexBase1: 1-based position of the first combination to process.
            first_page: First listing page to visit (inclusive).
            last_page: Last listing page to visit (inclusive).
        """
        pending = self.combinations[startIndexBase1 - 1:]

        # ``position`` is the 1-based place in the full combination list; it
        # has its own name so the song loop below cannot overwrite it.
        for position, combination in enumerate(pending, start=startIndexBase1):
            genre = combination["genre"]
            decade = combination["decade"]

            # Use the injected dataset, not a module-level global.
            if self.song_data.has_genre_and_decade(genre["name"], decade["name"]):
                print(f'{genre["name"]},{decade["name"]} already extracted')
                continue

            try:
                song_basic_data_array = self.linkExtractor.get_all_songs(genre, decade, first_page, last_page)

                for basic_data in song_basic_data_array:
                    self.song_data.add_basic_data(basic_data)
            except Exception:
                print(f'Error in ({genre["name"]},{decade["name"]})')
                raise

            for song_index, song in enumerate(song_basic_data_array):
                try:
                    self.extract_song_data(song_index, song)
                except Exception:
                    print(f'Error in "{song["name"]}"')
                    raise

            number_of_songs = len(self.song_data.df)
            print(f'Extracted {position} of {len(self.combinations)} ({genre["name"]},{decade["name"]}). {number_of_songs} in total')

    def extract_song_data(self, index, song):
        """Fetch and store the chords of one song unless already present.

        Args:
            index: Position of the song in its listing; used only in messages.
            song: Dict with at least the ``url`` and ``name`` keys.
        """
        if self.song_data.has_chords(song["url"]):
            print(f'Song {index}. "{song["name"]}" already extracted')
            return

        song_details = self.chordExtractor.extract_song_data(song["url"])
        self.song_data.add_details(song_details)
        print(f'Extracted data from song {index}. "{song["name"]}"')



##  Extract

In [12]:
import jl_io as io

# Load the genre and decade definitions that drive the scraper. Each entry is
# used as a dict with a "name" (stored in the dataset) and a "pattern" (URL
# query fragment) by LinkExtractor and ChordScraper.
# NOTE(review): the on-disk format is whatever jl_io.from_file reads - confirm.
genres = io.from_file('../data/genres')
decades = io.from_file('../data/decades')

In [16]:
# Directory where ChordExtractor stores the raw HTML of every visited song page.
RAW_HTML_OUTPUT = '../data/raw_html'
# NOTE(review): not referenced by the visible cells; the save cell further down
# writes '../data/chords_2.csv' (with an underscore) instead - confirm which
# path is intended.
CHORDS_DATASET_OUTPUT = "../data/chords2.csv"

In [17]:
# Start with an empty dataset; pass initial_data_path=... to SongData to
# resume from a previously saved CSV instead.
song_data = SongData()

In [25]:
# Wire the pipeline together. Each extractor starts its own headless Chrome
# driver when constructed.
chord_extractor = ChordExtractor(RAW_HTML_OUTPUT)
link_extractor = LinkExtractor()
chord_scraper = ChordScraper(link_extractor,chord_extractor,genres,decades,song_data)

In [28]:
# Scrape listing page 2 only (first_page == last_page == 2) for every
# genre/decade combination not already present in song_data.
chord_scraper.extract(2,2)

Rock,2010s already extracted
Folk,2010s already extracted
Pop,2010s already extracted
Country,2010s already extracted
Electronic,2010s already extracted
Rhythm And Blues,2010s already extracted
Metal,2010s already extracted
Contemporary R&b,2010s already extracted
Religious Music,2010s already extracted
Hip Hop,2010s already extracted
Reggae,2010s already extracted
Jazz,2010s already extracted
Blues,2010s already extracted
Extracted 14 of 147 (World Music,2010s). 3174 in total
Disco,2010s already extracted
Extracted 16 of 147 (Comedy,2010s). 3174 in total
Extracted 17 of 147 (New Age,2010s). 3174 in total
Extracted 18 of 147 (Classical,2010s). 3174 in total
Extracted 19 of 147 (Experimental,2010s). 3174 in total
Extracted 20 of 147 (Darkwave,2010s). 3174 in total
Extracted 21 of 147 (Soundtrack,2010s). 3174 in total
Rock,2000s already extracted
Folk,2000s already extracted
Pop,2000s already extracted
Country,2000s already extracted
Electronic,2000s already extracted
Rhythm And Blues,20

Extracted data from song 5. "Ramblin Man (ver 2)"
Song 6. "Folsom Prison Blues (ver 5)" already extracted
Extracted data from song 7. "I Got Stripes (ver 2)"
Extracted data from song 8. "Sixteen Tons (ver 4)"
Extracted data from song 9. "Baby Were Really In Love"
Song 10. "Crazy Arms" already extracted
Extracted data from song 11. "Howlin At The Moon"
Extracted data from song 12. "Bird Dog (ver 3)"
Extracted data from song 13. "Walking After Midnight"
Extracted data from song 14. "Walking After Midnight (ver 3)"
Extracted data from song 15. "Lonesome Whistle"
Extracted data from song 16. "Cool Water"
Extracted data from song 17. "Take These Chains From My Heart"
Extracted data from song 18. "Big Iron (ver 4)"
Song 19. "Folsom Prison Blues (ver 11)" already extracted
Song 20. "Folsom Prison Blues (ver 12)" already extracted
Extracted data from song 21. "Sixteen Tons"
Extracted data from song 22. "Sixteen Tons (ver 5)"
Extracted data from song 23. "The Battle Of New Orleans (ver 3)"
Extr

Extracted 140 of 147 (World Music,1950s). 3374 in total
Extracted 141 of 147 (Disco,1950s). 3374 in total
Extracted 142 of 147 (Comedy,1950s). 3374 in total
Extracted 143 of 147 (New Age,1950s). 3374 in total
Extracted 144 of 147 (Classical,1950s). 3374 in total
Extracted 145 of 147 (Experimental,1950s). 3374 in total
Extracted 146 of 147 (Darkwave,1950s). 3374 in total
Extracted 147 of 147 (Soundtrack,1950s). 3374 in total


In [29]:
song_data.df

Unnamed: 0,url,name,genre,decade,chords,uuid
0,https://tabs.ultimate-guitar.com/tab/5_seconds...,Lie To Me,Rock,2010s,"['C', 'G', 'Am', 'F', 'C', 'G', 'Am', 'F', 'C'...",dcee9fab-15b8-414e-aa07-a87b27f007e3
1,https://tabs.ultimate-guitar.com/tab/keith-urb...,Blue Aint Your Color,Rock,2010s,"['G', 'Am', 'C', 'D', 'G', 'G', 'Am', 'C', 'D'...",f86ca365-1710-4943-be4a-366708124d58
2,https://tabs.ultimate-guitar.com/tab/hozier/wo...,Work Song,Rock,2010s,"['Bb', 'Cm', 'Bb', 'Cm', 'Bb', 'Cm', 'Bb', 'Cm...",a4187f1b-3fb0-4277-8b33-4f9dc26116af
3,https://tabs.ultimate-guitar.com/tab/arctic_mo...,Do I Wanna Know,Rock,2010s,"['Gm', 'Eb', 'Cm', 'Gm', 'Gm', 'Eb', 'Cm', 'Gm...",d75330a7-8e9a-409b-8b88-68cb5f357f7e
4,https://tabs.ultimate-guitar.com/tab/blackberr...,One Horse Town (ver 3),Rock,2010s,"['Am', 'C', 'Dm', 'F', 'C', 'G', 'Dm', 'F', 'C...",84075e00-a22c-47ee-9e21-2b2b09841e6e
...,...,...,...,...,...,...
3369,https://tabs.ultimate-guitar.com/tab/ella-fitz...,Dream A Little Dream Of Me (ver 5),Jazz,1950s,"['Ebdim7', 'Fm7', 'Bb7', 'Eb', 'Ebdim7', 'C7',...",9cefb6b0-f6c5-46e6-a4c5-78a0e71046ba
3370,https://tabs.ultimate-guitar.com/tab/the_ink_s...,I Dont Want To Set The World On Fire (ver 3),Jazz,1950s,"['B7', 'E', 'E/Ab', 'Gdim', 'Fm', 'E', 'E/Ab',...",37ca5e4a-92a6-4226-b829-e99f32217866
3371,https://tabs.ultimate-guitar.com/tab/ella-fitz...,Summertime,Jazz,1950s,"['Amaj7', 'F#m', 'G#maj7', 'C#m', 'C#m', 'F#m'...",8783061f-2296-45f5-a9ea-71a788f100c7
3372,https://tabs.ultimate-guitar.com/tab/tony_benn...,The Way You Look Tonight,Jazz,1950s,"['Bbmaj9', 'G13', 'Cm7', 'F7', 'Bb', 'Gm11', '...",f1806027-37c5-4921-a2f0-cea6e86bb756


In [43]:
# Persist the scraped dataset as a '^'-separated CSV (SongData.SEPARATOR).
song_data.save('../data/chords_2.csv')

## Cleaning 

In [5]:
from jl_chord_cleaning import ChordDatasetCleaner
from jl_song_data import SongData

In [6]:
# Directory handed to the cleaner below together with the dataset.
RAW_HTML_OUTPUT = '../data/raw_html'
# Reload the dataset saved by the extraction step.
# NOTE(review): SongData here comes from the jl_song_data module imported
# above, not from the class defined earlier in this notebook - confirm that
# both versions behave the same.
song_data = SongData(initial_data_path='../data/chords_2.csv')

In [7]:
song_data.df

Unnamed: 0,url,name,genre,decade,chords,uuid
0,https://tabs.ultimate-guitar.com/tab/5_seconds...,Lie To Me,Rock,2010s,"['C', 'G', 'Am', 'F', 'C', 'G', 'Am', 'F', 'C'...",dcee9fab-15b8-414e-aa07-a87b27f007e3
1,https://tabs.ultimate-guitar.com/tab/keith-urb...,Blue Aint Your Color,Rock,2010s,"['G', 'Am', 'C', 'D', 'G', 'G', 'Am', 'C', 'D'...",f86ca365-1710-4943-be4a-366708124d58
2,https://tabs.ultimate-guitar.com/tab/hozier/wo...,Work Song,Rock,2010s,"['Bb', 'Cm', 'Bb', 'Cm', 'Bb', 'Cm', 'Bb', 'Cm...",a4187f1b-3fb0-4277-8b33-4f9dc26116af
3,https://tabs.ultimate-guitar.com/tab/arctic_mo...,Do I Wanna Know,Rock,2010s,"['Gm', 'Eb', 'Cm', 'Gm', 'Gm', 'Eb', 'Cm', 'Gm...",d75330a7-8e9a-409b-8b88-68cb5f357f7e
4,https://tabs.ultimate-guitar.com/tab/blackberr...,One Horse Town (ver 3),Rock,2010s,"['Am', 'C', 'Dm', 'F', 'C', 'G', 'Dm', 'F', 'C...",84075e00-a22c-47ee-9e21-2b2b09841e6e
...,...,...,...,...,...,...
3369,https://tabs.ultimate-guitar.com/tab/ella-fitz...,Dream A Little Dream Of Me (ver 5),Jazz,1950s,"['Ebdim7', 'Fm7', 'Bb7', 'Eb', 'Ebdim7', 'C7',...",9cefb6b0-f6c5-46e6-a4c5-78a0e71046ba
3370,https://tabs.ultimate-guitar.com/tab/the_ink_s...,I Dont Want To Set The World On Fire (ver 3),Jazz,1950s,"['B7', 'E', 'E/Ab', 'Gdim', 'Fm', 'E', 'E/Ab',...",37ca5e4a-92a6-4226-b829-e99f32217866
3371,https://tabs.ultimate-guitar.com/tab/ella-fitz...,Summertime,Jazz,1950s,"['Amaj7', 'F#m', 'G#maj7', 'C#m', 'C#m', 'F#m'...",8783061f-2296-45f5-a9ea-71a788f100c7
3372,https://tabs.ultimate-guitar.com/tab/tony_benn...,The Way You Look Tonight,Jazz,1950s,"['Bbmaj9', 'G13', 'Cm7', 'F7', 'Bb', 'Gm11', '...",f1806027-37c5-4921-a2f0-cea6e86bb756


In [8]:
# Clean the scraped dataset.
# NOTE(review): ChordDatasetCleaner is defined in jl_chord_cleaning, which is
# not visible here; judging by the captured output it re-extracts chords for
# some songs and reports per-URL errors - confirm against that module.
cleaner = ChordDatasetCleaner()
clean_dataset = cleaner.clean_dataset(song_data.df, RAW_HTML_OUTPUT)

Re extracting chords for 121 songs
https://tabs.ultimate-guitar.com/tab/1060259


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df.loc[self.df['url'] == details["url"], ["chords"]] = str(details["chords"])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df.loc[self.df['url'] == details["url"], ["uuid"]] = details["uuid"]


https://tabs.ultimate-guitar.com/tab/2882276
https://tabs.ultimate-guitar.com/tab/711170
https://tabs.ultimate-guitar.com/tab/abba/the_winner_takes_it_all_chords_211929
https://tabs.ultimate-guitar.com/tab/air_supply/having_you_near_me_chords_1642252
https://tabs.ultimate-guitar.com/tab/arcade_fire/wake_up_chords_532785
https://tabs.ultimate-guitar.com/tab/backstreet_boys/shape_of_my_heart_chords_17642
https://tabs.ultimate-guitar.com/tab/belle_and_sebastian/piazza_new_york_catcher_chords_476509
https://tabs.ultimate-guitar.com/tab/billy_joel/we_didnt_start_the_fire_chords_1088942
https://tabs.ultimate-guitar.com/tab/billy_joel/you_may_be_right_chords_1672479
https://tabs.ultimate-guitar.com/tab/black_uk/wonderful_life_chords_830823
https://tabs.ultimate-guitar.com/tab/burl_ives/aunt_rhody_chords_1990453
https://tabs.ultimate-guitar.com/tab/carla-bruni/quelquun-ma-dit-chords-1726373
https://tabs.ultimate-guitar.com/tab/cher/if-i-could-turn-back-time-chords-1090135
https://tabs.ultimate

https://tabs.ultimate-guitar.com/tab/the_rolling_stones/all_of_your_love_chords_1931537
https://tabs.ultimate-guitar.com/tab/the_rolling_stones/blue_and_lonesome_chords_1926561
Error in https://tabs.ultimate-guitar.com/tab/the_rolling_stones/blue_and_lonesome_chords_1926561. list index out of range
https://tabs.ultimate-guitar.com/tab/the_rolling_stones/everybody_knows_about_my_good_thing_chords_1910053
https://tabs.ultimate-guitar.com/tab/the_rolling_stones/hoo_doo_blues_chords_1931545
https://tabs.ultimate-guitar.com/tab/the_rolling_stones/i_gotta_go_chords_1931541
https://tabs.ultimate-guitar.com/tab/the_rolling_stones/just_like_i_treat_you_chords_1931551
https://tabs.ultimate-guitar.com/tab/the_rolling_stones/little_rain_chords_1931549
https://tabs.ultimate-guitar.com/tab/tina_turner/whats_love_got_to_do_with_it_chords_1397769
https://tabs.ultimate-guitar.com/tab/vanessa_carlton/a_thousand_miles_chords_1093797
https://tabs.ultimate-guitar.com/tab/whitney_houston/how_will_i_know_cho

In [9]:
clean_dataset

Unnamed: 0,url,name,decade,genre,chords,uuid
0,https://tabs.ultimate-guitar.com/tab/1055161,Time To Say Goodbye Con Te Partirò,1990s,Pop,"['G', 'D', 'Em', 'C', 'G', 'D', 'Em', 'C', 'G'...",3983e77a-d9b2-461e-8029-9d249dfd0947
2,https://tabs.ultimate-guitar.com/tab/10cc/im_n...,Im Not In Love,1970s,Pop,"['F#m7/B', 'B6', 'F#m7/B', 'B6', 'F#m7/B', 'B6...",e69883a2-97c0-4b04-98fa-ae8a7eb71e00
3,https://tabs.ultimate-guitar.com/tab/1136507,Clown,2010s,Rhythm And Blues%%Contemporary R&b,"['Am', 'D/F#', 'G', 'Am', 'D/F#', 'G', 'Am', '...",847a9a07-9773-4eab-a5c6-6e0cbe7f7357
4,https://tabs.ultimate-guitar.com/tab/1136522,Clown (ver 2),2010s,Contemporary R&b,"['Am', 'D', 'G', 'Am', 'D', 'G', 'Am', 'D', 'G...",e68c8fa3-2cfd-4889-abba-1ffbf0869a37
5,https://tabs.ultimate-guitar.com/tab/128929,Balladen Om Herr Fredrik Åkare Och Den Söta Fr...,1960s,Jazz,"['Dm', 'E', 'Am', 'Dm', 'G7', 'C', 'E', 'Am', ...",e318df19-031d-4a48-9c10-d502a87b5bae
...,...,...,...,...,...,...
3205,https://tabs.ultimate-guitar.com/tab/woody_gut...,Hard Aint It Hard (ver 2),1950s,Folk,"['C', 'F', 'C', 'G7', 'C', 'F', 'C', 'G7', 'C'...",4ab14537-d44d-4701-a95e-9088eeaa0330
3206,https://tabs.ultimate-guitar.com/tab/woody_gut...,Pretty Boy Floyd (ver 2),1950s,Folk,"['E', 'E7', 'A', 'E', 'A', 'E', 'B7', 'E', 'E'...",bac64960-0676-47b3-bff7-dffdb42ebbcf
3207,https://tabs.ultimate-guitar.com/tab/woody_gut...,Talking Dust Bowl Blues,1950s,Folk,"['A', 'D', 'E', 'A', 'D', 'E', 'A', 'D', 'E', ...",79bb91ac-e469-4053-ae5b-22e7cd413ff1
3208,https://tabs.ultimate-guitar.com/tab/woody_gut...,Vigilante Man,1950s,Folk,"['C', 'C', 'C', 'C', 'F', 'F', 'F', 'C', 'F', ...",f0eec3df-c170-4884-a884-624d0ffc9f48


In [10]:
# Wrap the cleaned DataFrame in a SongData only to reuse its save() method,
# then write it to the cleaned-dataset CSV.
SongData(df=clean_dataset).save('../data/chords_clean_2.csv')