In [5]:
import requests
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep


class justETFScrapper:
    """Scrape the justETF "find ETF" table with Selenium and save it as a CSV.

    ``run`` opens Chrome, configures the table (100 rows per page plus the
    extra columns listed in ``COLUMNS``), walks ``pages_to_scrape`` pages and
    writes the collected rows to ``justETF.csv``.
    """

    # Output columns, in the order they are written to the CSV.
    COLUMNS = ('name', 'ISIN', 'Ticker', 'Fund CCY', 'Fund size [M]', 'TER in % p.a.',
               '1Y in %', '3Y in %', '5Y in %', '2020 in %', '2019 in %', '2018 in %',
               '2017 in %')

    # Index of the <td> cell (within a table row) that holds each output column.
    _CELL_INDEX = {
        'name': 1,
        'ISIN': 12,
        'Ticker': 13,
        'Fund CCY': 2,
        'Fund size [M]': 3,
        'TER in % p.a.': 4,
        '1Y in %': 5,
        '3Y in %': 6,
        '5Y in %': 7,
        '2020 in %': 8,
        '2019 in %': 9,
        '2018 in %': 10,
        '2017 in %': 11,
    }

    # XPath prefix shared by every control in the table's button bar.
    _BUTTONS_XPATH = '//*[@id="buttonsContainer"]/div'

    # <li> positions (in click order) of the entries toggled in the column menu:
    # 1 = untick "Chart 4 Weeks", 12 = 3Y %, 13 = 5Y %, 32 = ISIN, 33 = Ticker,
    # 15..18 = 2020 %, 2019 %, 2018 %, 2017 %.
    _COLUMN_MENU_ITEMS = (1, 12, 13, 32, 33, 15, 16, 17, 18)

    def __init__(self, pages_to_scrape=17, page_load_delay=1):
        """
        Args:
            pages_to_scrape: number of result pages to read.
            page_load_delay: seconds to wait after clicking "next" so the
                table has time to reload before its HTML is read again.
        """
        self.pages_to_scrape = pages_to_scrape
        self.page_load_delay = page_load_delay

    def scrape_urls(self, driver):
        """Parse ``pages_to_scrape`` consecutive pages and hand the combined table to ``output_results``.

        Args:
            driver: Selenium WebDriver already showing the first page of the table.
        """
        page_frames = []
        for page in range(1, self.pages_to_scrape + 1):
            # Parse the HTML currently shown by the browser.
            page_frames.append(self.page_parser(driver.page_source))
            # Only advance while there is another page left to read; nothing is
            # read after the last page, so clicking "next" there is wasted work.
            if page < self.pages_to_scrape:
                self.get_next_page(driver)

        # Concatenate once instead of growing a DataFrame inside the loop.
        if page_frames:
            df = pd.concat(page_frames, ignore_index=True)
        else:
            df = pd.DataFrame(columns=list(self.COLUMNS))

        self.output_results(df)

    def get_next_page(self, driver):
        """Click the table's "next" button and wait for the new page to load."""
        # find_element(by, value) works on both Selenium 3 and 4; the
        # find_element_by_* helpers were removed in Selenium 4.3.
        driver.find_element('xpath', '//*[@id="etfsTable_next"]').click()
        # Without this pause the same page source can be captured twice in a row.
        sleep(self.page_load_delay)

    def output_results(self, df):
        """Write the scraped table to ``justETF.csv`` (comma separated, no index column)."""
        df.to_csv('justETF.csv', sep=',', index=False)

    def page_parser(self, html):
        """Extract the ETF rows from one page of HTML.

        Args:
            html: page source containing the ``dataTables_scrollBody`` div.

        Returns:
            pandas.DataFrame with one row per table row and the columns in ``COLUMNS``.
        """
        soup = BeautifulSoup(html, 'html.parser')
        # Locate the table body container.
        table = soup.find(name='div', attrs={'class': 'dataTables_scrollBody'})
        # Every row tagged role="row" except the first one.
        table_rows = table.find_all(name='tr', attrs={'role': 'row'})[1:]
        # Look the cells of each row up once instead of once per column.
        cells_per_row = [row.find_all(name='td') for row in table_rows]

        data = {
            column: [cells[index].text for cells in cells_per_row]
            for column, index in self._CELL_INDEX.items()
        }
        # NOTE(review): only this column has its '%' sign stripped; kept as-is
        # so the CSV output does not change.
        data['2017 in %'] = [text.replace('%', '') for text in data['2017 in %']]

        return pd.DataFrame(data, columns=list(self.COLUMNS))

    def run(self):
        """Open Chrome, configure the justETF table, scrape it, and always close the browser."""
        driver = webdriver.Chrome()
        try:
            driver.implicitly_wait(10)  # wait up to 10 seconds when locating elements
            driver.get('https://www.justetf.com/en/find-etf.html?groupField=none&sortField=name&sortOrder=asc')

            sleep(1)  # short pause; the page sometimes misbehaves without it

            # Accept the cookie banner.
            driver.find_element('id', 'CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll').click()

            buttons = self._BUTTONS_XPATH

            # Select 100 rows per page.
            driver.find_element('xpath', buttons + '/a[3]/span').click()
            driver.find_element('xpath', buttons + '/div[2]/ul/li[3]').click()

            # Open the column options menu and toggle the entries of interest.
            driver.find_element('xpath', buttons + '/a[4]').click()
            for item in self._COLUMN_MENU_ITEMS:
                driver.find_element('xpath', buttons + '/div[2]/ul/li[%d]/a' % item).click()

            # Move the pointer next to the 2017 entry and click outside the menu.
            boton_esquina = driver.find_element('xpath', buttons + '/div[2]/ul/li[18]/a')
            action = webdriver.ActionChains(driver)
            action.move_to_element_with_offset(boton_esquina, -25, 0)
            action.click()
            action.perform()

            # Start scraping the table pages.
            self.scrape_urls(driver)
        finally:
            # Close the browser even if any step above fails.
            driver.quit()
        

In [6]:
# Instantiate justETFScrapper for 17 result pages and run the scrape
justETFScrapper(pages_to_scrape=17).run()