In [1]:

import pandas as pd
import pandas_gbq

from google.oauth2 import service_account

# Relative path to the Google service-account JSON key file.
key_path = "./app/gkeys/epidemicapp-62d0d471b86f.json"
# Build the credentials from the key file and register them on pandas_gbq's
# module-level context, which is what the pandas_gbq calls below rely on.
CREDENTIALS  = service_account.Credentials.from_service_account_file(key_path)
pandas_gbq.context.credentials = CREDENTIALS

In [2]:

# SQL that selects every column of the br_general.cities_coordinates table.
query_data = """
    select *
    from br_general.cities_coordinates
"""

# Run the query on the "epidemicapp-280600" project and keep the result as a DataFrame.
cities_df = pandas_gbq.read_gbq(query_data, project_id="epidemicapp-280600")
# Preview the first rows of the result.
cities_df.head()

Downloading: 100%|██████████| 2540/2540 [00:00<00:00, 2980.56rows/s]


Unnamed: 0,city,state,country,lat,long
0,Rio Branco/AC,AC,Brasil,-9.976536,-67.822078
1,Plácido de Castro/AC,AC,Brasil,-10.323915,-67.18242
2,Cruzeiro do Sul/AC,AC,Brasil,-7.630796,-72.670387
3,Acrelândia/AC,AC,Brasil,-10.075917,-67.05269
4,Senador Guiomard/AC,AC,Brasil,-9.9765,-67.31914


In [32]:

import unidecode
import random
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains

# Chrome options for the Selenium session (no arguments are currently enabled).
options = webdriver.ChromeOptions()
# Headless / fixed-window-size settings, currently disabled:
# options.add_argument('headless')
# options.add_argument('window-size=1920x1080')

# Start Chrome through the chromedriver binary stored under ./app/gdriver.
# NOTE(review): the driver path is passed positionally; newer Selenium releases
# may expect a different signature here -- confirm against the installed version.
driver = webdriver.Chrome("./app/gdriver/chromedriver", options=options)

In [33]:

# Unrelated pages; the scraping loop below loads one of them at random
# before every request to the IBGE page.
dummie_url = ["https://www.linkedin.com/feed/", 
              "https://github.com/",
              "https://stackoverflow.com/",
              "https://twitter.com/home?lang=pt",
              "https://pt-br.facebook.com/"]
# Open the first page of the list in the browser started above.
driver.get(dummie_url[0])


In [None]:
# Verbosity switches for the scraping loop.
verbose_progress = True     # one line per city plus the attempt counter
verbose_pipeline = True     # step-by-step messages for every attempt
verbose_content = False     # every section header and parsed indicator
verbose_exceptions = False  # values that could not be parsed as numbers

# Maximum number of page-load attempts per city before giving up on it.
counter_flag_control = 20
cities_list = cities_df["city"].unique().tolist()
social_scrap_url = "https://cidades.ibge.gov.br/brasil/{}/{}/panorama/"

# Maps the text of each section header ("lista__cabecalho" element) to a
# DataFrame that receives one row per scraped city.
df_tables = dict()

print("Running the scrapper...")
# enumerate() supplies the progress index directly instead of searching the
# list with cities_list.index() on every iteration.
for city_index, city_name in enumerate(cities_list):
    # Reinitializing the process flags
    concluded_flag = False
    test_flag = False
    test_counter = 0

    # The id has the form "<city>/<state>": build the lower-case, dash-separated,
    # ASCII-only version of both parts used in the url.
    city, state = city_name.replace(" ", "-").lower().split("/")
    city, state = unidecode.unidecode(city), unidecode.unidecode(state)
    # Create the scrapping url to fetch the data
    city_scrap_url = social_scrap_url.format(state, city)
    # Print the total progress
    if verbose_progress:
        print("  At => {} from {} -> {} in url => {}".format(
            city_index, len(cities_list), city_name, city_scrap_url))

    # Reload the page until at least one value is parsed or the attempt
    # limit is reached.
    while not concluded_flag and not test_flag:
        if verbose_pipeline:
            print("    . Getting the url content...")
        # Load a random unrelated page first, then the city page.
        # random.choice follows the real length of dummie_url instead of a
        # hard-coded index range.
        driver.get(random.choice(dummie_url))
        driver.get(city_scrap_url)
        if verbose_pipeline:
            print("    . Including the actions module...")
        # NOTE(review): the same ActionChains instance is reused for every
        # header of this attempt -- confirm whether perform() replays the
        # actions queued for previous headers.
        actions = ActionChains(driver)

        # Section headers of the page
        p_elements = driver.find_elements_by_class_name("lista__cabecalho")
        if verbose_pipeline:
            print("    . Getting the lista elements --> {} elements!".format(len(p_elements)))

        for element_index, element in enumerate(p_elements):
            is_first_element = element_index == 0
            # Read the header text once: every .text access is a WebDriver
            # call, and the same value must be used to create and to update
            # the df_tables entry.
            section_name = element.text
            if verbose_content:
                print("\t\t -> At element: {}".format(section_name))
            if section_name not in df_tables:
                df_tables[section_name] = pd.DataFrame()
            # Click every header except the first one.
            # NOTE(review): clicks only happen on the first attempt
            # (test_counter == 0); retries read the page without clicking --
            # confirm this is intended.
            if not is_first_element and test_counter == 0:
                actions.move_to_element(element)
                try:
                    actions.click(element)
                    actions.perform()
                except Exception as e:
                    # Best effort: report the failed click and keep scraping.
                    print("\t\t => {}".format(e))
            # Indicator names and values are searched on the whole page, not
            # inside the current header element.
            name_elements = driver.find_elements_by_class_name("lista__nome")
            value_elements = driver.find_elements_by_class_name("lista__valor")
            if verbose_pipeline and is_first_element:
                print("    . Getting close to {} names and {} values".format(
                    len(name_elements), len(value_elements)))

            # Column name -> single-item list, i.e. one DataFrame row.
            new_city_content = dict()
            for n, v in zip(name_elements, value_elements):
                # Pairs with an empty name or value are skipped.
                name_text = n.text
                if name_text == "":
                    continue
                value_text = v.text
                if value_text == "":
                    continue

                # First token of the value, with "." removed and "," turned
                # into the decimal point ("1.234,5" -> 1234.5).
                split_value = value_text.split(" ")
                try:
                    numeric_val = float(split_value[0].replace(".", "").replace(",", "."))
                except ValueError as e:
                    # Non-numeric value: ignore this indicator.
                    if verbose_exceptions:
                        print(e)
                    continue

                # Field name: the label without its last word, joined by "_",
                # lower-cased and converted to ASCII.
                name_val = "_".join(name_text.split(" ")[:-1])
                name_val = unidecode.unidecode(name_val.lower())
                if verbose_content:
                    print("\t\t Item: {} => {}".format(name_text, value_text))
                    print("\t\t   * {} => {}".format(name_val, numeric_val))
                # Values carrying a standalone "%" token are stored as
                # fractions under a "_perc" suffixed name.
                if "%" in split_value:
                    numeric_val *= 1/100.0
                    name_val = name_val + "_perc"
                new_city_content[name_val] = [numeric_val]

            new_df = pd.DataFrame(new_city_content)
            # Any parsed value ends the retry loop once all headers of this
            # attempt have been processed.
            if len(new_df) > 0:
                concluded_flag = True

            new_df["city"] = city_name
            df_tables[section_name] = pd.concat((df_tables[section_name], new_df))

        test_counter += 1
        if verbose_progress:
            print("\t\t\t => test {}".format(test_counter))
        if test_counter >= counter_flag_control:
            test_flag = True
            print("\t\t  => Flag error... tryed {} times...".format(counter_flag_control))


Running the scrapper...
  At => 0 from 2540 -> Rio Branco/AC in url => https://cidades.ibge.gov.br/brasil/ac/rio-branco/panorama/
    . Getting the url content...
    . Including the actions module...
    . Getting the lista elements --> 0 elements!
			 => test 1
    . Getting the url content...
    . Including the actions module...
    . Getting the lista elements --> 0 elements!
			 => test 2
    . Getting the url content...
    . Including the actions module...
    . Getting the lista elements --> 0 elements!
			 => test 3
    . Getting the url content...
    . Including the actions module...
    . Getting the lista elements --> 0 elements!
			 => test 4
    . Getting the url content...
    . Including the actions module...
    . Getting the lista elements --> 6 elements!
    . Getting close to 30 names and 30 values
			 => test 5
  At => 1 from 2540 -> Plácido de Castro/AC in url => https://cidades.ibge.gov.br/brasil/ac/placido-de-castro/panorama/
    . Getting the url content...
 

In [28]:


# Display the table collected under the 'POPULAÇÃO' section header.
df_tables['POPULAÇÃO']


Unnamed: 0,populacao_estimada,populacao_no_ultimo_censo,densidade_demografica,city,taxa_de_fecundidade,taxa_de_mortalidade_infantil,domicilios_com_iluminacao_eletrica_perc,domicilios_com_lixo_coletado_diretamente_perc,domicilios_com_rede_geral_como_principal_forma_de_abastecimento_de_agua_perc,domicilios_com_esgotamento_sanitario_(rede_geral_ou_fossa_septica_ligada_a_rede)_perc,domicilios_com_microcomputador_ou_tablet_perc,domicilios_com_acesso_a_internet_perc,domicilios_com_telefone_movel_celular_perc,domicilios_com_televisao_perc,pessoas_de_15_anos_ou_mais_que_praticaram_atividade_fisica_perc
0,407319.0,336038.0,38.03,Rio Branco/AC,,,,,,,,,,,
0,19761.0,17209.0,8.86,Plácido de Castro/AC,,,,,,,,,,,
0,88376.0,78507.0,8.94,Cruzeiro do Sul/AC,,,,,,,,,,,
0,15256.0,12538.0,6.94,Acrelândia/AC,,,,,,,,,,,
0,23024.0,20179.0,8.69,Senador Guiomard/AC,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,17852.0,18250.0,206.33,Lagoa da Canoa/AL,,,,,,,,,,,
0,12690.0,12060.0,51.67,Novo Lino/AL,,,,,,,,,,,
0,6664.0,6656.0,51.48,Roteiro/AL,,,,,,,,,,,
0,28635.0,26992.0,85.48,Limoeiro de Anadia/AL,,,,,,,,,,,


In [22]:
# List the section headers for which a table was created.
df_tables.keys()

dict_keys(['POPULAÇÃO', 'TRABALHO E RENDIMENTO', 'EDUCAÇÃO', 'ECONOMIA', 'SAÚDE', 'TERRITÓRIO E AMBIENTE', 'INDÚSTRIA'])

In [None]:
# Print the non-empty text of every "lista__titulo" element of the page
# currently loaded in the browser.
p_elements = driver.find_elements_by_class_name("lista__titulo")

# Lazy generator: each text is fetched right before it is printed.
titulo_texts = (titulo.text for titulo in p_elements)
for titulo_text in titulo_texts:
    if titulo_text:
        print(titulo_text)

In [None]:
# Print the non-empty text of every "lista__indicador" element of the page
# currently loaded in the browser.
p_elements = driver.find_elements_by_class_name("lista__indicador")

# Lazy generator: each text is fetched right before it is printed.
indicador_texts = (indicador.text for indicador in p_elements)
for indicador_text in indicador_texts:
    if indicador_text:
        print(indicador_text)


In [None]:
# Display the last url built by the scraping loop.
city_scrap_url