In [None]:
import navigation as nav
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from io import StringIO
import time
from datetime import datetime

In [None]:
# Start a Chrome session driven by Selenium. The driver executable path is
# whatever webdriver_manager's ChromeDriverManager().install() returns.
# NOTE(review): no driver.quit() is visible in this part of the notebook —
# confirm the browser is closed once scraping is finished.
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

In [None]:
# Read the page address from the project's `navigation` module
# (entry 'selic_historico' under 'webscraping') and echo it for inspection.
url = nav.all_datasets['webscraping']['selic_historico']
url

In [None]:
# Open the page in the Selenium-controlled browser, pause for a fixed time,
# then echo the page title as a quick check that the navigation worked.
# NOTE(review): the pause presumably lets client-side rendering finish — confirm.
page_load_wait_seconds = 10

driver.get(url)
time.sleep(page_load_wait_seconds)
driver.title

In [None]:
# Parse the HTML currently held by the browser so it can be searched with
# BeautifulSoup.
soup = BeautifulSoup(driver.page_source, 'html.parser')
# Echo only the <title> element: displaying `soup` itself would write the
# entire page source into the cell output.
soup.title

In [None]:
# Locate the <table> element by its id. `find` returns None when nothing
# matches, so fail here with a clear message instead of letting a later cell
# try to parse the string 'None'.
table = soup.find('table', {'id': "historicotaxasjuros"})
if table is None:
    raise ValueError("Table with id 'historicotaxasjuros' was not found in the page source")
# Show the number of <tr> rows rather than dumping the whole table markup.
len(table.find_all('tr'))

In [None]:
# Serialise the table element back to HTML and let pandas parse it.
# header=1 uses the second row of the table as the column header.
# decimal=',' / thousands='.' make pandas read comma-decimal numbers directly,
# instead of rewriting every comma in the markup (which also touches text
# cells and breaks values that contain both separators, e.g. '1.234,56').
html_string = str(table)
df = pd.read_html(StringIO(html_string), header=1, decimal=',', thousands='.')[0]
df.dtypes

In [None]:
# Inspect the last rows of the parsed table.
df.tail()

In [None]:
# New column labels, applied positionally: the list order must match the
# column order of the parsed table, and its length must equal the number of
# columns (pandas raises on a length mismatch).
col_rename = [
    'numero',
    'data',
    'vies',
    'periodo_vigencia',
    'meta_selic',
    'TBAN',
    'taxa_mensal',
    'taxa_acumulado_ano'
]
df.columns = col_rename
# Summary of dtypes and non-null counts after renaming.
df.info()

In [None]:
# Text fragments stripped from 'numero' before numeric conversion.
numero_noise = ['Pres. (9)', ' (7)', ' ex. (8)', 'ª']

# Work on a string view of the column so the cell can be re-run after the
# column has already been converted to Int64 (the .str accessor would fail).
numero_text = df['numero'].astype(str)
for fragment in numero_noise:
    # regex=False: the fragments contain '.', '(' and ')' and must be matched
    # literally regardless of the installed pandas version's default.
    numero_text = numero_text.str.replace(fragment, '', regex=False)

# Anything that still is not a number becomes -1 (sentinel for "no number").
df['numero'] = pd.to_numeric(numero_text, errors='coerce').fillna(-1).astype('Int64')
df['numero'].head()

In [None]:
# Inspect the last 20 values of 'numero' after the numeric conversion.
df['numero'].tail(20)

In [None]:
def dt_treatment(row):
    """Convert a single value to a pandas datetime, reading dates day-first.

    Parameters
    ----------
    row : object
        The value to convert (despite the name, one cell value, not a row).

    Returns
    -------
    object
        The result of ``pd.to_datetime(row, dayfirst=True)``. If that raises
        ``ValueError``, a message is printed and ``row`` is returned unchanged.
    """
    try:
        result = pd.to_datetime(row, dayfirst=True)
    except ValueError as e:
        # Best effort: report the offending value and keep it as-is.
        print(f'Error on {row}: {e}')
        result = row
    return result

# Convert 'data' value by value with dt_treatment; values that fail to parse
# are kept unchanged (and reported by dt_treatment's print).
df['data'] = df['data'].map(dt_treatment)
df['data']

In [None]:
def split_time_interval(row):
    """Split a 'start - end' period string into its two date strings.

    Parameters
    ----------
    row : object
        One period value, expected to look like ``'dd/mm/yyyy - dd/mm/yyyy'``.
        The end date may be missing (``'dd/mm/yyyy - '``).

    Returns
    -------
    pandas.Series
        Two elements: ``[start, end]``, both stripped of surrounding
        whitespace. When the end date is empty, today's date formatted as
        ``dd/mm/yyyy`` is used. If ``row`` is not a string, both elements are
        ``None``; if it contains no ``'-'``, the end element is ``None``.
    """
    if not isinstance(row, str):
        # Missing cell (e.g. NaN): nothing to split.
        return pd.Series([None, None])

    # Strip first, so that an end part holding only whitespace counts as empty.
    parts = [part.strip() for part in row.split('-')]
    data_init = parts[0]
    if len(parts) < 2:
        # No separator at all: the end date cannot be inferred.
        return pd.Series([data_init, None])

    # An open-ended period runs until today. The fallback uses the same
    # day-first layout as the scraped dates so day-first parsing stays unambiguous.
    data_fim = parts[1] or datetime.today().strftime('%d/%m/%Y')
    return pd.Series([data_init, data_fim])

# Split each period string into its start and end date strings, then convert
# both new columns value by value with dt_treatment.
period_cols = ['data_init', 'data_fim']
df[period_cols] = df['periodo_vigencia'].apply(split_time_interval)
df[period_cols] = df[period_cols].map(dt_treatment)

# Show the original period text next to the derived columns for a visual check.
cols1 = ['periodo_vigencia', *period_cols]
df[cols1]

In [None]:
# Columns carried forward, in output order; every other column is dropped.
keep = [
    'numero',
    'data',
    'data_init',
    'data_fim',
    'taxa_mensal',
    'taxa_acumulado_ano',
    'meta_selic'
]
# .copy() makes the selection an independent frame, so assigning to its
# columns in later cells does not raise SettingWithCopyWarning.
df = df[keep].copy()
df.head()