# Limpeza & Transformação — SINAN Arboviroses

Objetivos:
- aplicar limpeza estruturada (tipos, idades, duplicatas),
- gerar variáveis derivadas (faixa etária, binários de sintomas),
- exportar dataset intermediário e final (processado).


In [1]:
# Environment and path configuration
import os
import sys
from pathlib import Path
import pandas as pd

# Put the parent of the current working directory on sys.path so that the
# `src.*` packages imported below can be found.
# NOTE(review): this assumes the kernel's working directory sits one level
# below the project root (e.g. a notebooks/ folder) — confirm.
project_root = Path().resolve().parent
sys.path.append(str(project_root))

from src.data.make_dataset import load_raw_data, basic_cleaning, save_interim, save_processed
from src.features.build_features import build_feature_matrix
from src.utils.helpers import ensure_dirs

# Input file and the two output files produced by this notebook.
# NOTE(review): these are relative paths and are not joined with
# `project_root`; where they resolve depends on the working directory and on
# how the src/ helpers treat them — confirm they point at the intended folders.
RAW = "data/raw/sinan_arboviroses_2024.csv"
INTERIM = "data/interim/sinan_arboviroses_2024_clean.csv"
PROCESSED = "data/processed/sinan_arboviroses_2024_features.csv"

# Hand the parent folders of both output files to ensure_dirs
# (presumably creates them if missing — verify in src/utils/helpers.py).
ensure_dirs([os.path.dirname(INTERIM), os.path.dirname(PROCESSED)])


In [2]:
# Load the raw file at RAW, report its shape and preview the first rows.
# NOTE(review): the saved output contains an "Unnamed: 0" column, which usually
# means an index column was written into the CSV and read back as data — confirm
# whether it should be dropped or read as the index.
# NOTE(review): the saved run also shows a warning raised at the pd.read_csv
# call inside load_raw_data; its type is not visible here — check it.
df_raw = load_raw_data(RAW)
print("Bruto:", df_raw.shape)
df_raw.head()


  return pd.read_csv(path, sep=sep, encoding=encoding)


Bruto: (6434137, 121)


Unnamed: 0,TP_NOT,ID_AGRAVO,DT_NOTIFIC,SEM_NOT,NU_ANO,SG_UF_NOT,ID_MUNICIP,ID_REGIONA,ID_UNIDADE,DT_SIN_PRI,...,EVIDENCIA,PLAQ_MENOR,CON_FHD,COMPLICA,TP_SISTEMA,NDUPLIC_N,DT_DIGITA,CS_FLXRET,FLXRECEBI,MIGRADO_W
0,2,A90,2024-01-24,202404,2024,12,120020,1941.0,6788637.0,2024-01-17,...,,,,,2.0,,2024-02-22,0.0,,
1,2,A90,2024-01-03,202401,2024,12,120033,1941.0,2000083.0,2024-01-02,...,,,,,2.0,,2024-01-10,0.0,,
2,2,A90,2024-01-14,202403,2024,12,120033,1941.0,2000083.0,2024-01-12,...,,,,,2.0,,2024-01-25,0.0,,
3,2,A90,2024-02-16,202407,2024,12,120042,1941.0,2000016.0,2024-02-10,...,,,,,2.0,,2024-02-26,1.0,,
4,2,A90,2024-01-16,202403,2024,12,120020,1941.0,5336171.0,2024-01-12,...,,,,,2.0,,2024-02-12,1.0,,


In [3]:
# Run the project's basic cleaning on the raw frame, then report the resulting
# shape and preview the first rows.
# In the saved run the cleaned frame has one more column than the raw frame
# (ATRASO_NOTIF_DIAS appears in the preview).
# NOTE(review): the saved log reports 5,023,216 of 6,434,137 rows removed as
# duplicates on ['ID_UNIDADE', 'DT_NOTIFIC', 'ID_MUNICIP', 'ID_MN_RESI',
# 'ID_PAIS']. That key holds no patient-level field, so distinct notifications
# from the same unit, day and municipalities may be collapsed — confirm the
# dedupe key in src/data/make_dataset.py before trusting downstream counts.
df_clean = basic_cleaning(df_raw)
print("Limpo:", df_clean.shape)
df_clean.head()


[dedupe] removidas 5023216 duplicatas com base em ['ID_UNIDADE', 'DT_NOTIFIC', 'ID_MUNICIP', 'ID_MN_RESI', 'ID_PAIS']
Limpo: (1410921, 122)


Unnamed: 0,TP_NOT,ID_AGRAVO,DT_NOTIFIC,SEM_NOT,NU_ANO,SG_UF_NOT,ID_MUNICIP,ID_REGIONA,ID_UNIDADE,DT_SIN_PRI,...,PLAQ_MENOR,CON_FHD,COMPLICA,TP_SISTEMA,NDUPLIC_N,DT_DIGITA,CS_FLXRET,FLXRECEBI,MIGRADO_W,ATRASO_NOTIF_DIAS
0,2,A90,2024-01-24,202404,2024,12,120020,1941.0,6788637.0,2024-01-17,...,,,,2.0,,2024-02-22,0.0,,,7
1,2,A90,2024-01-03,202401,2024,12,120033,1941.0,2000083.0,2024-01-02,...,,,,2.0,,2024-01-10,0.0,,,1
2,2,A90,2024-01-14,202403,2024,12,120033,1941.0,2000083.0,2024-01-12,...,,,,2.0,,2024-01-25,0.0,,,2
3,2,A90,2024-02-16,202407,2024,12,120042,1941.0,2000016.0,2024-02-10,...,,,,2.0,,2024-02-26,1.0,,,6
4,2,A90,2024-01-16,202403,2024,12,120020,1941.0,5336171.0,2024-01-12,...,,,,2.0,,2024-02-12,1.0,,,4


In [4]:
# Example business rules (adapt to your own requirements).
# 1) Drop notifications that have no symptom-onset date (DT_SIN_PRI).
if "DT_SIN_PRI" in df_clean.columns:
    n_before = len(df_clean)
    has_onset_date = df_clean["DT_SIN_PRI"].notna()
    df_clean = df_clean.loc[has_onset_date]
    n_removed = n_before - len(df_clean)
    print(f"Removidos {n_removed} registros sem DT_SIN_PRI")

# 2) Very rare ages (e.g. 0 and 120) are kept; to inspect them, run:
# df_clean.query("NU_IDADE_N in [0,120]").head()


Removidos 0 registros sem DT_SIN_PRI


In [5]:
# Build the feature matrix from the cleaned frame, then report its shape and
# preview the first rows. In the saved run this adds 16 columns (122 -> 138),
# including the *_BIN symptom flags, N_SINTOMAS and SEXO_TXT seen in the preview.
# NOTE(review): the saved preview has a row whose displayed *_BIN columns are
# all NaN while N_SINTOMAS is 0.0, so "symptoms not recorded" may be counted as
# "no symptoms" — confirm in src/features/build_features.py.
df_feat = build_feature_matrix(df_clean)
print("Com features:", df_feat.shape)
df_feat.head()


Com features: (1410921, 138)


Unnamed: 0,TP_NOT,ID_AGRAVO,DT_NOTIFIC,SEM_NOT,NU_ANO,SG_UF_NOT,ID_MUNICIP,ID_REGIONA,ID_UNIDADE,DT_SIN_PRI,...,NAUSEA_BIN,DOR_COSTAS_BIN,CONJUNTVIT_BIN,ARTRITE_BIN,ARTRALGIA_BIN,PETEQUIA_N_BIN,LEUCOPENIA_BIN,DOR_RETRO_BIN,N_SINTOMAS,SEXO_TXT
0,2,A90,2024-01-24,202404,2024,12,120020,1941.0,6788637.0,2024-01-17,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,Masculino
1,2,A90,2024-01-03,202401,2024,12,120033,1941.0,2000083.0,2024-01-02,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,Masculino
2,2,A90,2024-01-14,202403,2024,12,120033,1941.0,2000083.0,2024-01-12,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,Masculino
3,2,A90,2024-02-16,202407,2024,12,120042,1941.0,2000016.0,2024-02-10,...,,,,,,,,,0.0,Masculino
4,2,A90,2024-01-16,202403,2024,12,120020,1941.0,5336171.0,2024-01-12,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,5.0,Masculino


In [None]:
# Summary tables exported as CSV; each table is built only when the columns it
# needs are present in df_feat.
# The output folder is created first because DataFrame.to_csv does not create
# missing parent directories.
REPORT_TABLES_DIR = "reports/tables"
os.makedirs(REPORT_TABLES_DIR, exist_ok=True)

# Cases per year x UF: count of non-null TP_NOT values for every
# (NU_ANO, SG_UF_NOT) pair, with 0 where a pair has no rows.
# TP_NOT is part of the guard because it is the column being counted.
if {"NU_ANO", "SG_UF_NOT", "TP_NOT"} <= set(df_feat.columns):
    cases_ano_uf = pd.pivot_table(
        df_feat, index="NU_ANO", columns="SG_UF_NOT", values="TP_NOT", aggfunc="count", fill_value=0
    )
    # A bare expression inside an `if` body is never rendered by Jupyter, so the
    # preview is shown explicitly. `display` is provided by the IPython kernel.
    display(cases_ano_uf.tail())
    cases_ano_uf.to_csv(f"{REPORT_TABLES_DIR}/casos_ano_uf.csv")

# Mean age per UF, sorted from highest to lowest mean.
if {"NU_IDADE_N", "SG_UF_NOT"} <= set(df_feat.columns):
    idade_uf = (
        df_feat.groupby("SG_UF_NOT")["NU_IDADE_N"]
        .mean()
        .sort_values(ascending=False)
        .to_frame("idade_media")
    )
    display(idade_uf.head(10))
    idade_uf.to_csv(f"{REPORT_TABLES_DIR}/idade_media_por_uf.csv")


In [8]:
# Persist both stages: the cleaned frame goes to INTERIM and the feature matrix
# to PROCESSED, then list the written paths.
save_interim(df_clean, INTERIM)
save_processed(df_feat, PROCESSED)

print("✅ Arquivos salvos:")
for saved_path in (INTERIM, PROCESSED):
    print(" -", saved_path)


✅ Arquivos salvos:
 - data/interim/sinan_arboviroses_2024_clean.csv
 - data/processed/sinan_arboviroses_2024_features.csv
