In [1]:
!pip install cffi>=1.17.0 tqdm>=4.64.1 numpy<2.0.0;
!pip install pysus;

/bin/bash: line 1: 2.0.0: No such file or directory
Collecting pysus
  Downloading pysus-0.15.0-py3-none-any.whl.metadata (1.3 kB)
Collecting Unidecode<2.0.0,>=1.3.6 (from pysus)
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Collecting aioftp<0.22.0,>=0.21.4 (from pysus)
  Downloading aioftp-0.21.4-py3-none-any.whl.metadata (6.1 kB)
Collecting bigtree<0.13.0,>=0.12.2 (from pysus)
  Downloading bigtree-0.12.5-py3-none-any.whl.metadata (28 kB)
Collecting cffi==1.15.1 (from pysus)
  Downloading cffi-1.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Collecting dateparser<2.0.0,>=1.1.8 (from pysus)
  Downloading dateparser-1.2.1-py3-none-any.whl.metadata (29 kB)
Collecting dbfread==2.0.7 (from pysus)
  Downloading dbfread-2.0.7-py2.py3-none-any.whl.metadata (3.3 kB)
Collecting elasticsearch==7.16.2 (from elasticsearch[preprocessing]==7.16.2->pysus)
  Downloading elasticsearch-7.16.2-py2.py3-none-any.whl.metadata (5.7 kB)
Collecting fastpar

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import cross_validate, RandomizedSearchCV
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score

In [2]:
np.random.seed(42)

In [3]:
from pysus import SIH
sih = SIH().load()
sih.metadata

{'long_name': 'Sistema de Informações Hospitalares',
 'source': ('https://datasus.saude.gov.br/acesso-a-informacao/morbidade-hospitalar-do-sus-sih-sus/',
  'https://datasus.saude.gov.br/acesso-a-informacao/producao-hospitalar-sih-sus/'),
 'description': 'A finalidade do AIH (Sistema SIHSUS) é a de transcrever todos os atendimentos que provenientes de internações hospitalares que foram financiadas pelo SUS, e após o processamento, gerarem relatórios para os gestores que lhes possibilitem fazer os pagamentos dos estabelecimentos de saúde. Além disso, o nível Federal recebe mensalmente uma base de dados de todas as internações autorizadas (aprovadas ou não para pagamento) para que possam ser repassados às Secretarias de Saúde os valores de Produção de Média e Alta complexidade além dos valores de CNRAC, FAEC e de Hospitais Universitários – em suas variadas formas de contrato de gestão.'}

In [4]:
files = sih.get_files(["RD"], uf=["SP"], year=2024, month=[1, 2, 3])

In [5]:
sih.download(files)

RDSP2403.parquet: 100%|██████████| 849k/849k [00:44<00:00, 19.0kB/s]


[/root/pysus/RDSP2401.parquet,
 /root/pysus/RDSP2402.parquet,
 /root/pysus/RDSP2403.parquet]

In [7]:
for index in range(len(files)):
  parquet = sih.download(files[index])

17462541it [00:00, 15461939110.51it/s]
17268103it [00:00, 16472065837.01it/s]
18999157it [00:00, 25305887647.42it/s]


In [8]:
df = parquet.to_dataframe()

In [9]:
df.fillna(0, inplace=True)

In [10]:
for col in df.columns:
    if pd.api.types.is_string_dtype(df[col]):
        stripped = df[col].dropna().str.strip()
        if stripped.apply(lambda x: x.replace('.', '', 1).isdigit()).all():
          df[col] = pd.to_numeric(stripped)

In [11]:
label_encoder = LabelEncoder()

for col in df:
  if df[col].dtype == 'string':
    df[col] = label_encoder.fit_transform(df[col])

In [12]:
df.head()

Unnamed: 0,UF_ZI,ANO_CMPT,MES_CMPT,ESPEC,CGC_HOSP,N_AIH,IDENT,CEP,MUNIC_RES,NASC,...,DIAGSEC9,TPDISEC1,TPDISEC2,TPDISEC3,TPDISEC4,TPDISEC5,TPDISEC6,TPDISEC7,TPDISEC8,TPDISEC9
0,352230,2024,3,3,446,3524115831412,1,18200001,352230,19480808,...,0,1,0,0,0,0,0,0,0,0
1,352230,2024,3,3,446,3524105805759,1,18200001,352230,19900210,...,0,1,0,0,0,0,0,0,0,0
2,352230,2024,3,3,446,3524105805760,1,18200001,352230,19610927,...,0,1,0,0,0,0,0,0,0,0
3,352230,2024,3,3,446,3524105805770,1,18250000,351850,20041022,...,0,1,0,0,0,0,0,0,0,0
4,352230,2024,3,3,446,3524105805781,1,18250000,351850,19370328,...,0,1,0,0,0,0,0,0,0,0


In [13]:
df.shape

(241806, 113)

In [14]:
to_remove = [
    'UF_ZI', 'CEP', 'MUNIC_RES', 'NASC', 'SEXO', 'IDADE', 'COD_IDADE',
    'NACIONAL', 'RACA_COR', 'ETNIA', 'NUM_FILHOS', 'INSTRU', 'CBOR', 'CNAER',
    'VINCPREV', 'CONTRACEP1', 'CONTRACEP2', 'GESTRISCO', 'INSC_PN', 'IND_VDRL',
    'CID_NOTIF', 'HOMONIMO', 'VAL_SH', 'VAL_SP', 'VAL_SADT', 'VAL_RN',
    'VAL_ACOMP', 'VAL_ORTP', 'VAL_SANGUE', 'VAL_SADTSR', 'VAL_TRANSP',
    'VAL_OBSANG', 'VAL_PED1AC', 'VAL_UTI', 'US_TOT', 'VAL_UCI ', 'UTI_MES_AN',
    'VAL_SH_FED', 'VAL_SP_FED', 'VAL_SH_GES', 'VAL_SP_GES', 'UTI_MES_IN',
    'UTI_MES_AL', 'UTI_INT_IN', 'UTI_INT_AN', 'UTI_INT_AL', 'NATUREZA', 'NAT_JUR',
    'GESTAO', 'RUBRICA', 'MUNIC_MOV', 'AUD_JUST', 'SIS_JUST', 'MARCA_UCI',
    'NUM_PROC', 'TOT_PT_SP', 'CPF_AUT', 'SEQ_AIH5', 'GESTOR_COD', 'GESTOR_TP',
    'GESTOR_CPF', 'GESTOR_DT', 'CNPJ_MANT', 'INFESHOP', 'REGCT', 'SEQUENCIA',
    'REMESSA',  'DIAGSEC1', 'DIAGSEC2', 'TPDISEC7','TPDISEC8', 'TPDISEC9'
    'DIAGSEC3', 'DIAGSEC4', 'DIAGSEC5', 'DIAGSEC6', 'DIAGSEC7', 'DIAGSEC8',
    'DIAGSEC9', 'TPDISEC1', 'TPDISEC2', 'TPDISEC3', 'TPDISEC4', 'TPDISEC5',
    'TPDISEC6'
]

df = df.drop(columns=[col for col in to_remove if col in df.columns])
df.shape

(241806, 32)

In [15]:
df_x = df.drop('VAL_TOT', axis=1)
df_y = df['VAL_TOT']

In [16]:
variace_threshold = 0.80

scaler = StandardScaler()
df_standardized = scaler.fit_transform(df_x)

pca = PCA()
pca.fit(df_standardized)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
pca_number = np.argmax(cumulative_variance >= 0.80) + 1
print(f"Número de componentes para 80% da variância: {pca_number}")

Número de componentes para 80% da variância: 13


In [17]:
pca = PCA(n_components=pca_number)
main_components = pca.fit_transform(df_standardized)
components_num = main_components.shape[1]
column_names = [f'PC{i}' for i in range(1, components_num + 1)]
pca_df = pd.DataFrame(data=main_components, columns=column_names)
pca_df.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13
0,-2.282499,-0.295329,0.036349,-1.706452,-0.777676,0.49535,-0.421728,-1.910521,2.627111,0.331534,-0.001318,-1.423381,1.10185
1,-0.796085,-0.758886,-0.347953,0.524214,-0.352071,-0.802019,0.128755,-0.528681,-0.339043,0.04901,0.368783,0.585112,0.447954
2,-2.608439,0.758699,-0.925514,0.049872,-0.829261,0.18333,2.928021,-1.066023,0.451043,-0.170753,0.132269,-0.156843,0.65888
3,-0.809645,-0.974728,-0.231844,0.198926,-0.038278,-0.344381,-0.347047,-0.785265,-0.084415,0.13977,0.355376,0.394343,0.769716
4,-0.572669,-1.137693,-0.109724,0.426658,0.062291,-0.099737,0.08205,-0.605848,-0.77885,-0.037397,0.30856,0.740124,0.761276


In [19]:
x_train, x_test, y_train, y_test = train_test_split(
    pca_df, df_y, test_size=0.3, random_state=42, shuffle=True
)

print(f"Dados -> Treinamento {len(x_train)} - Teste: {len(x_test)}")
print(f"Target -> Treinamento {len(y_train)} - Teste: {len(y_test)}")

Dados -> Treinamento 169264 - Teste: 72542
Target -> Treinamento 169264 - Teste: 72542


In [22]:
best_params = {
    'n_estimators': 200,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'max_features': 'log2',
    'max_depth': 30
}

pipeline_rf = make_pipeline(
    RandomForestRegressor(**best_params, random_state=42, n_jobs=-1)
)

pipeline_rf.fit(x_train, y_train)
y_pred = pipeline_rf.predict(x_test)

In [20]:
def evaluate_model(name, y_test, y_pred):
    print(f"\n{name}")
    lin_mse = mean_squared_error(y_test, y_pred)
    print(f"MSE: {lin_mse:.2f}")
    lin_rmse = np.sqrt(lin_mse)
    print(f"RMSE: {lin_rmse:.2f}")
    lin_mae = mean_absolute_error(y_test, y_pred)
    print(f"MAE: {lin_mae:.2f}")
    r2 = r2_score(y_test, y_pred)
    print(f"R²: {r2:.2f}")

In [23]:
evaluate_model("Random Forest", y_test, y_pred)


Random Forest
MSE: 5797080.16
RMSE: 2407.71
MAE: 560.51
R²: 0.74
