## **Script:** 
**Download SIVEP-GRIPE data**

In [6]:
import os
import sys
sys.path.append( os.path.dirname(os.path.abspath('')) )
sys.path.append( os.path.abspath(os.path.join(os.path.dirname(os.path.abspath('')), '..', 'linkage-saude')) )

import glob
import shutil
import zipfile
import numpy as np
import pandas as pd
import datetime as dt
from simpledbf import Dbf5

from epimonitor import WarehouseSUS

In [10]:
# -- connect with the database
datapath = os.path.join(os.environ["HOMEPATH"], "Documents", "data")
suspath = os.path.join(datapath, "DATASUS_WAREHOUSE", "datasus_pessoas.db")
engine_url = f"sqlite:///{suspath}"

warehouse = WarehouseSUS(engine_url)
engine = warehouse.db_init()

In [11]:
# -- folder where downloaded data is stored
basefolder = os.path.join(os.environ["HOMEPATH"], 'Documents', 'data', 'SIVEP-Gripe', 'DOWNLOAD_AUTO')

# -- collection folders are named with dates
date_folders = [ dt.date(int(n.split("-")[2]), int(n.split("-")[1]), int(n.split("-")[0]) ) for n in os.listdir(basefolder)  ]

# -- select the most recent data
nargmax = date_folders.index(max(date_folders))
selected_folder = os.listdir(basefolder)[nargmax]

# -- create separated folder for uncompressed files
extracted_path = os.path.join(basefolder, selected_folder, "EXTRACTED")
if not os.path.isdir(extracted_path):
    os.mkdir(extracted_path)

list_of_zipfiles = [ os.path.basename(x) for x in glob.glob(os.path.join(basefolder, selected_folder, "*.zip")) ]

for current_file in list_of_zipfiles:
    # -- extract file
    print(f'extracting {current_file} ... ', end='')
    with zipfile.ZipFile( os.path.join(basefolder, selected_folder, current_file), 'r') as fzip:
        fzip.extractall(extracted_path)

    # -- load and preprocess DBF file
    dbf_file = glob.glob(os.path.join(extracted_path, "*.dbf"))[0] # it should be just one
    
    print(f'loading and preprocessing of {os.path.basename(dbf_file)} ... ', end='')
    cur_sivep = Dbf5(dbf_file, codec='latin').to_dataframe()
    cur_sivep["DT_NASC"] = pd.to_datetime(cur_sivep["DT_NASC"], format="%d/%m/%Y", errors="coerce")
    cur_sivep["DT_NOTIFIC"] = pd.to_datetime(cur_sivep["DT_NOTIFIC"], format="%d/%m/%Y", errors="coerce")

    min_year, max_year = cur_sivep["DT_NOTIFIC"].min().year, cur_sivep["DT_NOTIFIC"].max().year
    list_of_ids = []
    for current_year in np.arange(min_year, max_year+1, 1):
        list_of_ids += [ pd.DataFrame(warehouse.query_id('sivep_gripe', current_year)) ]
    list_of_ids = pd.concat(list_of_ids)
    if list_of_ids.shape[0]>0:
        list_of_ids = list_of_ids["ID_SIVEP"]

    # -- remove from the dbf the records already present in the database
    cur_sivep_new = cur_sivep[~cur_sivep["NU_NOTIFIC"].isin(list_of_ids)].copy()
    print(f"{cur_sivep_new.shape[0]} new records to be added to the database ... ", end='')

    # -- insert records
    warehouse.insert('sivep_gripe', cur_sivep_new, batchsize=200, verbose=True)
    
    # -- delete extracted file
    print("done.")
    os.remove(dbf_file)
    
os.remove(extracted_path)

#for cur_zipfile in list_of_zipfiles:
#    with zipfile.ZipFile( os.path.join(basefolder, selected_folder, cur_zipfile), 'r') as fzip:
#        fzip.extractall(extracted_path)

# -- now, process each extracted file and insert in the database
#dbf_files = glob.glob(os.path.join(extracted_path, "*.dbf"))



#for cur_fullpath in dbf_files:
#    cur_sivep = Dbf5(cur_fullpath, codec='latin').to_dataframe()
#
#    cur_sivep["DT_NASC"] = pd.to_datetime(cur_sivep["DT_NASC"], format="%d/%m/%Y", errors="coerce")
#    cur_sivep["DT_NOTIFIC"] = pd.to_datetime(cur_sivep["DT_NOTIFIC"], format="%d/%m/%Y", errors="coerce")
#
#    warehouse.insert('sivep_gripe', cur_sivep, batchsize=100, verbose=True)
#
#
## -- delete extracted files
#shutil.rmtree(extracted_path)




# Uncomment if you want to reset table
#warehouse.delete_table('sivep_gripe', is_sure=True, authkey="###!Y!.")
#warehouse.db_init()

#warehouse.insert('sivep_gripe', sample_df1, batchsize=50, verbose=True)


extracting 1971704DBF.zip ... loading and preprocessing of SRAGHOSPITALIZADO1971704_00.dbf ... 2 new records to be added to the database ... Insertion of batch 1 of 1 ... done.
done.
extracting 1971705DBF.zip ... loading and preprocessing of SRAGHOSPITALIZADO1971705_00.dbf ... 3 new records to be added to the database ... Insertion of batch 1 of 1 ... done.
done.
extracting 1971706DBF.zip ... loading and preprocessing of SRAGHOSPITALIZADO1971706_00.dbf ... 563 new records to be added to the database ... Insertion of batch 1 of 3 ... done.
Insertion of batch 2 of 3 ... done.
Insertion of batch 3 of 3 ... done.
done.
extracting 1971707DBF.zip ... loading and preprocessing of SRAGHOSPITALIZADO1971707_00.dbf ... 310 new records to be added to the database ... Insertion of batch 1 of 2 ... done.
Insertion of batch 2 of 2 ... done.
done.
extracting 1971708DBF.zip ... loading and preprocessing of SRAGHOSPITALIZADO1971708_00.dbf ... 18143 new records to be added to the database ... Insertion o

PermissionError: [WinError 5] Acesso negado: '\\Users\\higor.monteiro\\Documents\\data\\SIVEP-Gripe\\DOWNLOAD_AUTO\\08-08-2023\\EXTRACTED'

In [12]:
warehouse.number_of_records('sivep_gripe')

99878

In [14]:
id2023 = pd.DataFrame(warehouse.query_id('sivep_gripe', 2020))

id2023

Unnamed: 0,ID_SIVEP,DATA_NOTIFICACAO
0,315801380374,2020-01-23
1,315780509443,2020-01-03
2,315783131308,2020-01-03
3,315783158990,2020-01-02
4,315786647665,2020-01-09
...,...,...
28098,31656335174806,2020-12-16
28099,31656416269016,2020-12-29
28100,31656417755292,2020-07-23
28101,31676552978279,2020-12-09


In [4]:
warehouse.delete_table('sivep_gripe', is_sure=True, authkey="###!Y!.")

In [7]:
list_of_ids

0       31677085952682
1       31677175381109
2       31677256163177
3       31677256575574
4       31677260374996
             ...      
6815    31691444376100
6816    31691459456365
6817    31691461095478
6818    31691470614946
6819    31691483482691
Name: ID_SIVEP, Length: 6820, dtype: object

In [None]:
collector.close_browser()

In [13]:
shutil.rmtree(extracted_path)

In [None]:
cur_sivep[cur_sivep["NU_NOTIFIC"]=="316196231305"][["NM_PACIENT", "DT_NOTIFIC", "SEM_NOT"]]

Unnamed: 0,NM_PACIENT,DT_NOTIFIC,SEM_NOT
17903,MARINETE DO NASCIMENTO VIEIRA,2021-03-01,9


In [None]:
warehouse.query_all()

[('316129905398', datetime.datetime(2021, 2, 10, 0, 0), 'MARINETE DO NASCIMENTO VIEIRA', datetime.datetime(1970, 4, 15, 0, 0), 'F', 'MARIA BORGES DO NASCIMENTO FRANCA', 'DAS FLORES', '84', 'VILA VELHA', '230440', '60349295', None, '74788744368', '7980310', datetime.datetime(2023, 8, 8, 15, 25, 35, 612321), datetime.datetime(2023, 8, 8, 15, 25, 35, 612321)),
 ('316196231305', datetime.datetime(2021, 3, 1, 0, 0), 'MARINETE DO NASCIMENTO VIEIRA', datetime.datetime(1979, 4, 15, 0, 0), 'F', 'MARIA BORGES DO NASCIMENTO', 'RUA DAS FLORES', '84', 'BARRO DO CEARA', '230440', None, None, None, '2561492', datetime.datetime(2023, 8, 8, 15, 28, 3, 566361), datetime.datetime(2023, 8, 8, 15, 28, 3, 566361))]

In [4]:
from sqlalchemy import select

In [11]:
table_model = warehouse.tables['sivep_gripe']
sel = select(table_model.c['ID_SIVEP'], table_model.c['DATA_NOTIFICACAO'])

In [19]:
engine = warehouse._engine

with engine.connect() as conn:
    rp = conn.execute(sel)
    for cur_record in rp:
        pass
    results = [ record for record in rp ]

In [13]:
results[:5]

[('315487038336', datetime.datetime(2019, 1, 20, 0, 0)),
 ('31646415229798', datetime.datetime(2022, 3, 4, 0, 0)),
 ('315480952153', datetime.datetime(2019, 1, 2, 0, 0)),
 ('315541216555', datetime.datetime(2019, 1, 2, 0, 0)),
 ('316397719239', datetime.datetime(2021, 9, 26, 0, 0))]

In [26]:
cur_record

('31691483482691', datetime.datetime(2023, 8, 8, 0, 0))