# **SIVEP-Gripe** pipeline

Pilot project for reporting automation.

**Case study:** Collect notifications of severe acute respiratory syndrome (SRAG, in the Portuguese acronym) from the official national registry, SIVEP-Gripe, using Selenium.
Only notifications from the city of Fortaleza, Ceará, are considered.

**Tools**:

- Selenium for automatic download of the DBF data;
- SQLite and SQLAlchemy to store the main data;
- Dash for dynamic reporting via dashboards.

**Author**: Higor S. Monteiro<br>
**Date**: 10 Jul 2023<br>

## Lib

In [1]:
import os
import csv
import time
import overpy
import osmnx as ox
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime
from collections import defaultdict

In [10]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import ElementNotInteractableException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select

In [4]:
from sqlalchemy import create_engine
from sqlalchemy import select, insert, update, text
from sqlalchemy import Table, MetaData
from sqlalchemy import Column, DateTime, Integer, Numeric, String, Sequence, ForeignKey

In [5]:
#import sys
#sys.path.append(os.path.join(".."))

In [65]:
#from spellgeo_cevepi.collectors.zipcode_collector import ZipCollector

## **Object definitions**

In [5]:
# Scratch Chrome session configured with a custom download directory.
# '/path/to/dir' is a placeholder and must be replaced by a real folder.
chrome_options = webdriver.ChromeOptions()
prefs = {'download.default_directory': '/path/to/dir'}
chrome_options.add_experimental_option('prefs', prefs)
# Selenium 4 receives the options object through `options=`; the old
# `chrome_options=` keyword is deprecated/removed and the rest of this
# notebook already uses `options=`.
driver = webdriver.Chrome(options=chrome_options)

class SivepPipe:
    '''
        Selenium-driven client for the SIVEP-Gripe web portal.

        The object logs into the portal, requests a DBF export of the
        notifications, lists the export requests and triggers the download
        of a finished export through a Chrome browser session.
    '''

    # Seconds to wait for a page element before giving up.
    _TIMEOUT = 20.0

    def __init__(self, certification, download_folder=None, headless=False):
        '''
            Args:
                certification: mapping with the keys 'username' and 'password'.
                download_folder: folder configured as Chrome's default download
                    directory. When None, Chrome's own default is kept.
                headless: when True, Chrome is started with "--headless=new".
        '''
        self._usernm = certification['username']
        self._passwd = certification['password']

        self.driver = None
        self.url = "https://sivepgripe.saude.gov.br/sivepgripe/login.html?0"
        self.download_folder = download_folder
        self.browser_options = webdriver.ChromeOptions()
        if headless:
            self.browser_options.add_argument("--headless=new")
        prefs = {"download.prompt_for_download": False}
        # Only override the download directory when one was actually given;
        # sending None as a preference value is meaningless to Chrome.
        if download_folder is not None:
            prefs["download.default_directory"] = download_folder
        self.browser_options.add_experimental_option("prefs", prefs)

    @property
    def username(self):
        return self._usernm

    @username.setter
    def username(self, x):
        # AttributeError is the conventional error for a read-only attribute
        # and is still caught by callers handling `Exception`.
        raise AttributeError('Cannot change username')

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _wait_for(self, by, value, timeout=None):
        '''Wait until one element matching the locator is present and return it.'''
        timeout = self._TIMEOUT if timeout is None else timeout
        return WebDriverWait(self.driver, timeout).until(EC.presence_of_element_located((by, value)))

    def _wait_for_all(self, by, value, timeout=None):
        '''Wait until at least one element matches the locator and return all matches.'''
        timeout = self._TIMEOUT if timeout is None else timeout
        return WebDriverWait(self.driver, timeout).until(EC.presence_of_all_elements_located((by, value)))

    def _js_click(self, element):
        '''Click an element through JavaScript instead of a native click.'''
        self.driver.execute_script('arguments[0].click()', element)

    def _dismiss_dialog(self):
        '''
            Close the dialog box shown by the portal, if there is one.

            Returns True when a dialog button was clicked, False otherwise.
        '''
        from selenium.common.exceptions import (
            NoSuchElementException,
            StaleElementReferenceException,
            TimeoutException,
        )
        try:
            dialog_div = self._wait_for(By.XPATH, './/div[@class = "ui-dialog-buttonset"]')
            dialog_button = dialog_div.find_element(By.TAG_NAME, 'button')
            self._js_click(dialog_button)
        except (TimeoutException, NoSuchElementException, StaleElementReferenceException):
            # No dialog (or it vanished meanwhile): nothing to close. Any other
            # error is a real failure and is allowed to propagate.
            return False
        return True

    @staticmethod
    def _parse_requisition_number(msgtext):
        '''Extract the text after the first ":" of the success message, without dots.'''
        return msgtext.split(":")[1].strip().replace(".", "")

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def close_browser(self):
        '''
            End the browser session, if one is open.
        '''
        if self.driver is not None:
            # quit() also terminates the chromedriver process; close() would
            # only close the current window and leak the driver process.
            self.driver.quit()
            self.driver = None

    def main_page(self):
        '''
            Open the login page, submit the credentials and close the dialog
            box shown after login, if any. A browser is started on first use.
        '''
        if self.driver is None:
            self.driver = webdriver.Chrome(options=self.browser_options)
        # -> locate and clear the login fields
        self.driver.get(self.url)
        usernm_form = self._wait_for(By.ID, 'BoxAreaLogin_campo_email').find_element(By.TAG_NAME, 'input')
        passwd_form = self._wait_for(By.ID, 'BoxAreaLogin_campo_senha').find_element(By.TAG_NAME, 'input')
        usernm_form.clear()
        passwd_form.clear()
        # -> send certification
        usernm_form.send_keys(self._usernm)
        passwd_form.send_keys(self._passwd)
        enter_field = self._wait_for(By.NAME, 'ENTRAR')
        self._js_click(enter_field)

        # -> Check whether there is a dialog box to close before interacting with the page.
        self._dismiss_dialog()

    def homepage(self):
        '''
            From any page in the website, return to the homepage.
        '''
        breadcrumb = self._wait_for(By.XPATH, './/div[@id = "BoxConteudoPrincipal_migalha"]')
        for elem in breadcrumb.find_elements(By.TAG_NAME, 'a'):
            if 'principal' in (elem.get_attribute('text') or '').lower():
                self._js_click(elem)
                break

        # -> Check whether there is a dialog box to close before interacting with the page.
        self._dismiss_dialog()

    def locate_page(self):
        '''
            Print and return the breadcrumb path of the current page, with the
            link texts joined by a backslash.
        '''
        breadcrumb = self._wait_for(By.XPATH, './/div[@id = "BoxConteudoPrincipal_migalha"]')
        page_path = '\\'.join([
            a.get_attribute('text').strip().replace('\n', '').replace('\t', '')
            for a in breadcrumb.find_elements(By.TAG_NAME, 'a')
        ])
        print(page_path)
        return page_path

    def request_dbf(self, epi_year, epi_week_first, epi_week_last, patient_data=True):
        '''
            Request the DBF file for the severe acute respiratory syndrome notifications.

            Args:
                epi_year: value written into the epidemiological year field.
                epi_week_first: value written into the initial week field.
                epi_week_last: value written into the final week field.
                patient_data: when True, click the patient-data export checkbox.

            Returns:
                The requisition number parsed from the success message (str).
        '''
        from selenium.common.exceptions import TimeoutException

        # --> From main page, access the page for requesting data
        a = self._wait_for_all(By.XPATH, './/a[@alt = "REGISTROS INDIVIDUAIS"]')
        self._js_click(a[1])  # The second is the one for the new database.

        selectElem = Select(self._wait_for(By.XPATH, './/select[@name = "tipoFicha"]'))
        selectElem.select_by_visible_text("SRAG Hospitalizado")

        # --> select radio box for epi year
        radio_epi = self._wait_for(By.XPATH, './/input[@name = "periodo:anoEpidemiologico"]')
        self._js_click(radio_epi)

        # --> Checkbox for patient's data
        if patient_data:
            dados_pac_check = self._wait_for(By.XPATH, './/input[@name = "chkExportarDadosPaciente"]')
            self._js_click(dados_pac_check)

        # --> Wait for staleness of forms (triggered by radio_epi)
        input_ano = self._wait_for(By.XPATH, './/input[@name = "periodo:anoAnoEpidemiologico"]')
        try:
            WebDriverWait(self.driver, self._TIMEOUT).until(EC.staleness_of(input_ano))
        except TimeoutException:
            # The form had already been re-rendered before the element was
            # located, so it never goes stale. A bounded wait avoids the
            # endless busy loop this case used to cause.
            pass

        # --> Fill up information on the freshly rendered fields
        input_ano = self._wait_for(By.XPATH, './/input[@name = "periodo:anoAnoEpidemiologico"]')
        input_inicial = self._wait_for(By.XPATH, './/input[@name = "periodo:semanaInicial"]')
        input_final = self._wait_for(By.XPATH, './/input[@name = "periodo:semanaFinal"]')
        for field, value in ((input_ano, epi_year), (input_inicial, epi_week_first), (input_final, epi_week_last)):
            self.driver.execute_script('arguments[0].value=arguments[1];', field, value)

        # --> Request data
        gerar_dbf = self._wait_for(By.XPATH, './/input[@name = "gerarDbf"]')
        self._js_click(gerar_dbf)

        msg_success = self._wait_for(By.XPATH, './/span[@class = "msgSUCCESS"]', timeout=60.0)
        msgtext = BeautifulSoup(msg_success.get_attribute('outerHTML'), 'html.parser').get_text()
        return self._parse_requisition_number(msgtext)

    def query_file(self, requisicao=None):
        '''
            Open the export-query page and return its result table.

            Args:
                requisicao: currently unused; kept for backward compatibility.

            Returns:
                pandas.DataFrame with one row per table row, columns named
                after the table header texts.
        '''
        # --> From main page, access the page to query the data
        a = self._wait_for(By.XPATH, './/a[@alt = "CONSULTAR EXPORTAÇÕES DBF"]')
        self._js_click(a)

        # --> Extract the table info
        tableElem = self._wait_for(By.XPATH, './/table[@class = "TabelaResultado"]')
        tableHtml = BeautifulSoup(tableElem.get_attribute('outerHTML'), 'html.parser')

        head, trows = tableHtml.find('thead'), tableHtml.find('tbody').find_all('tr')
        # Header cell id -> visible header text; body cells point to the
        # header id through their 'headers' attribute.
        rename_cols = {hcell.attrs['id']: hcell.text.strip() for hcell in head.find_all('th')}
        records = [
            {cell.attrs['headers'][0]: cell.text.replace("\n", "") for cell in row.find_all('td')}
            for row in trows
        ]
        return pd.DataFrame(records).rename(rename_cols, axis=1)

    def download_file(self, req_number, tb_consulta):
        '''
            Click the download link of a finished export request.

            Args:
                req_number: requisition number as returned by `request_dbf`.
                tb_consulta: table returned by `query_file`.

            Returns:
                True when a download link was clicked, False when the request
                is not listed exactly once or is not ready for download yet.
        '''
        req_col, link_col = "Número de Solicitação", "Link"
        if req_col not in tb_consulta.columns or link_col not in tb_consulta.columns:
            return False

        subsol = tb_consulta[tb_consulta[req_col] == req_number]
        if subsol.shape[0] != 1 or subsol[link_col].iat[0] != 'Download':
            return False

        # Pick the link that belongs to the requested row rather than always
        # the first one on the page.
        # NOTE(review): assumes the anchors with class "link" appear once per
        # downloadable row, in table order - confirm against the page markup.
        ready = tb_consulta[tb_consulta[link_col] == 'Download']
        position = ready[req_col].tolist().index(req_number)
        link_dbf = self._wait_for_all(By.XPATH, './/a[@class = "link"]')
        if position >= len(link_dbf):
            return False
        self._js_click(link_dbf[position])
        return True
        

## Config

In [6]:
# Base data folder under the current user's home directory.
# expanduser("~") works on every OS and includes the drive letter on Windows,
# whereas the HOMEPATH variable exists only on Windows and has no drive.
basepath = os.path.join(os.path.expanduser("~"), "Documents", "data")
# Folder that receives the DBF files downloaded from SIVEP-Gripe.
download_folder = os.path.join(basepath, "SIVEP-Gripe")

In [42]:
download_folder
download_folder = r'C:\Users\higor.monteiro\Documents\data\SIVEP-Gripe'

In [43]:
import warnings

# Portal credentials are read from the environment so they are never stored in
# the notebook: a literal password here leaks through every copy, export or
# commit of this file. Any password previously written here should be rotated.
certification = {
    "username": os.environ.get("SIVEP_USERNAME", ""),
    "password": os.environ.get("SIVEP_PASSWORD", ""),
}
if not all(certification.values()):
    warnings.warn("SIVEP_USERNAME and/or SIVEP_PASSWORD are not set; the login will fail.")

In [44]:
# Parameters of one download job: credentials, epidemiological year,
# first/last epidemiological week and the destination folder.
args = dict(
    certification=certification,
    ano_epi='2021',
    semana_pri='1',
    semana_ulti='5',
    download_folder=download_folder,
)
def download_sivep(args):
    '''
        Run one full export: log in, request the DBF, list the export
        requests and trigger the download of the requested file.

        Args:
            args: mapping with the keys 'certification', 'download_folder',
                'ano_epi', 'semana_pri' and 'semana_ulti'.
    '''
    certification = args['certification']
    download_folder = args['download_folder']
    epi_year, epi_week_first, epi_week_last = args['ano_epi'], args['semana_pri'], args['semana_ulti']

    sivep_browser = SivepPipe(certification=certification, download_folder=download_folder)
    sivep_browser.main_page()
    # Use the period given by the caller; the values used to be hard-coded to
    # '2021', '1', '5', which silently ignored `args`.
    req = sivep_browser.request_dbf(epi_year, epi_week_first, epi_week_last)
    print(f'No. Requisição: {req}')
    sivep_browser.homepage()
    consulta_tb = sivep_browser.query_file()
    print(consulta_tb)
    sivep_browser.download_file(req, consulta_tb)
    # Give the browser a moment to start writing the file.
    time.sleep(4)

In [45]:
download_sivep(args)

No. Requisição: 1969677
  Número de Solicitação Quantidade de Registros                   Status  \
0               1632382                       0         Em Processamento   
1               1632384                       0         Em Processamento   
2               1633119                       0         Em Processamento   
3               1888290                       0         Em Processamento   
4               1969677                    4121  Processamento Concluído   

       Link  
0            
1            
2            
3            
4  Download  


In [11]:
# Parameters of one download job, then unpacked into plain variables for the
# step-by-step cells below.
args = dict(
    certification=certification,
    ano_epi='2021',
    semana_pri='1',
    semana_ulti='5',
    download_folder=download_folder,
)

certification = args['certification']
download_folder = args['download_folder']
epi_year = args['ano_epi']
epi_week_first = args['semana_pri']
epi_week_last = args['semana_ulti']

In [23]:
sivep_browser = SivepPipe(certification=certification, download_folder=download_folder, headless=True)

In [24]:
sivep_browser.main_page()

In [25]:
req = sivep_browser.request_dbf('2021', '1', '5')

In [26]:
req

'1969677'

In [27]:
sivep_browser.locate_page()

Pagina Principal\Exportação\Registros Individuais


In [28]:
sivep_browser.homepage()

In [29]:
tb = sivep_browser.query_file()

In [31]:
tb

Unnamed: 0,Número de Solicitação,Quantidade de Registros,Status,Link
0,1632382,0,Em Processamento,
1,1632384,0,Em Processamento,
2,1633119,0,Em Processamento,
3,1888290,0,Em Processamento,
4,1969677,4121,Processamento Concluído,Download


In [32]:
sivep_browser.download_file(req, tb)

In [33]:
sivep_browser.close_browser()

In [34]:
driver = sivep_browser.driver

In [35]:
driver

## Selenium Scraping

In [56]:
cells[0].attrs

{'headers': ['numeroSolicitacao']}

In [8]:
url = "https://sivepgripe.saude.gov.br/sivepgripe/login.html?0"

In [9]:
# Chrome session that saves downloads to a fixed folder without prompting.
options = webdriver.ChromeOptions()
options.add_experimental_option(
    "prefs",
    dict([
        ("download.default_directory", r"C:\Users\higor.monteiro\Documents\data\SIVEP-Gripe"),
        ("download.prompt_for_download", False),
    ]),
)
driver = webdriver.Chrome(options=options)

In [10]:
driver.get(url)

In [11]:
# Locate the two login boxes, then the <input> inside each, and type the
# credentials into them.
email_box_present = EC.presence_of_element_located((By.ID, 'BoxAreaLogin_campo_email'))
senha_box_present = EC.presence_of_element_located((By.ID, 'BoxAreaLogin_campo_senha'))
usernm_field = WebDriverWait(driver, 20.0).until(email_box_present)
passwd_field = WebDriverWait(driver, 20.0).until(senha_box_present)

usernm_form = usernm_field.find_element(By.TAG_NAME, 'input')
passwd_form = passwd_field.find_element(By.TAG_NAME, 'input')

usernm_form.send_keys(certification["username"])
passwd_form.send_keys(certification["password"])

In [164]:
#usernm_form.clear()
#passwd_form.clear()

In [12]:
enter_field =  WebDriverWait(driver, 20.0).until(EC.presence_of_element_located((By.NAME, 'ENTRAR')))
driver.execute_script('arguments[0].click()', enter_field)
#enter_field.click()

In [15]:
okbt =  WebDriverWait(driver, 20.0).until(EC.presence_of_element_located((By.CLASS_NAME, 'ui-dialog-buttonset')))
okbt2 = okbt.find_element(By.TAG_NAME, 'button')
driver.execute_script('arguments[0].click()', okbt2)

In [16]:
a = WebDriverWait(driver, 20.0).until(EC.presence_of_all_elements_located((By.XPATH, './/a[@alt = "REGISTROS INDIVIDUAIS"]')))
driver.execute_script('arguments[0].click()', a[1])

In [14]:
a

[<selenium.webdriver.remote.webelement.WebElement (session="798a11b188cbac1c2324e644b73af466", element="1ABF318307A847D45B50209CEAE0E512_element_51")>,
 <selenium.webdriver.remote.webelement.WebElement (session="798a11b188cbac1c2324e644b73af466", element="1ABF318307A847D45B50209CEAE0E512_element_52")>]

In [17]:
# Locate the <select name="tipoFicha"> drop-down of the export form.
# NOTE(review): this rebinds `select`, hiding the `select` imported from
# sqlalchemy earlier in this notebook; a following cell reads this name, so a
# rename has to be done in both cells together.
select = WebDriverWait(driver, 20.0).until(EC.presence_of_element_located((By.XPATH, './/select[@name = "tipoFicha"]')))

In [18]:
selectElem = Select(select)

In [19]:
selectElem.select_by_visible_text("SRAG Hospitalizado")

In [20]:
radio_epi = WebDriverWait(driver, 20.0).until(EC.presence_of_element_located((By.XPATH, './/input[@name = "periodo:anoEpidemiologico"]')))
driver.execute_script('arguments[0].click()', radio_epi)

In [21]:
# Type the epidemiological year and the first/last week into the period form.
ano_present = EC.presence_of_element_located((By.XPATH, './/input[@name = "periodo:anoAnoEpidemiologico"]'))
input_ano = WebDriverWait(driver, 20.0).until(ano_present)
input_ano.send_keys('2023')

inicial_present = EC.presence_of_element_located((By.XPATH, './/input[@name = "periodo:semanaInicial"]'))
input_inicial = WebDriverWait(driver, 20.0).until(inicial_present)
input_inicial.send_keys('1')

final_present = EC.presence_of_element_located((By.XPATH, './/input[@name = "periodo:semanaFinal"]'))
input_final = WebDriverWait(driver, 20.0).until(final_present)
input_final.send_keys('28')

In [22]:
dados_pac_check = WebDriverWait(driver, 20.0).until(EC.presence_of_element_located((By.XPATH, './/input[@name = "chkExportarDadosPaciente"]')))
driver.execute_script('arguments[0].click()', dados_pac_check)

In [23]:
gerar_dbf = WebDriverWait(driver, 20.0).until(EC.presence_of_element_located((By.XPATH, './/input[@name = "gerarDbf"]')))
driver.execute_script('arguments[0].click()', gerar_dbf)

In [128]:
#a[1].get_attribute('outerHTML')

In [24]:
a = WebDriverWait(driver, 20.0).until(EC.presence_of_all_elements_located((By.XPATH, './/a[@alt = "CONSULTAR EXPORTAÇÕES DBF"]')))
driver.execute_script('arguments[0].click()', a[0])

In [55]:
link_dbf = WebDriverWait(driver, 10.0).until(EC.presence_of_all_elements_located((By.XPATH, './/a[@class = "link"]')))
#driver.execute_script('arguments[0].click()', link_dbf[0])

TimeoutException: Message: 


In [135]:
driver.execute_script('arguments[0].click()', link_dbf[0])

In [56]:
driver.refresh()

In [57]:
driver.close()

In [1]:
import os
import sys
sys.path.append("..")
sys.path.append(os.path.join("..", "..", "linkage-saude"))

In [2]:
from epimonitor.collector import SivepPipe

PyTables is not installed. No support for HDF output.


In [3]:
import warnings

download_folder = r'C:\Users\higor.monteiro\Documents\data\SIVEP-Gripe'
# Portal credentials are read from the environment so they are never stored in
# the notebook: a literal password here leaks through every copy, export or
# commit of this file. Any password previously written here should be rotated.
certification = {
    "username": os.environ.get("SIVEP_USERNAME", ""),
    "password": os.environ.get("SIVEP_PASSWORD", ""),
}
if not all(certification.values()):
    warnings.warn("SIVEP_USERNAME and/or SIVEP_PASSWORD are not set; the login will fail.")

# Parameters of one download job, unpacked for the cells below.
args = {
    'certification': certification,
    'ano_epi': '2021', 'semana_pri': '1', 'semana_ulti': '5',
    'download_folder': download_folder,
}
epi_year, epi_week_first, epi_week_last = args['ano_epi'], args['semana_pri'], args['semana_ulti']

In [8]:
collector = SivepPipe(certification, download_folder=download_folder)

In [9]:
collector.login()

<epimonitor.collector.sivep_collector.SivepPipe at 0x15a348ae2e0>

In [10]:
collector.request_dbf('2021', '1', '3', requisition_export='tempfile.csv').query_file().download_file(verbose=True)

waiting for file from requisition 1970487, refresh 1 ... downloaded.


<epimonitor.collector.sivep_collector.SivepPipe at 0x15a348ae2e0>

In [6]:
collector.homepage().request_dbf('2021', '3', '4', requisition_export='tempfile.csv').query_file().download_file()

<epimonitor.collector.sivep_collector.SivepPipe at 0x1d89a5bdf10>

In [11]:
collector.close_browser()

In [10]:
collector.requisition_number

'1970487'