In [822]:
import os
import sys
import numpy as np
import pandas as pd
import tensorflow as tf
import numpy.typing as npt
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import *

import xgboost as xgb

In [823]:
# constants
#
# The values below are plain Python scalars, so they are annotated with the
# builtin types rather than NumPy scalar types.

# Seed for the stochastic steps of the notebook (e.g. the train/test splits).
RANDOM_STATE: int = 42
# Fraction of the rows held out as the test set by train_test_split.
TEST_SIZE: float = 0.2

# Rows whose "UnitLengthStay" exceeds this value are discarded; it is also the
# upper edge of the last classification bin.
MAXIMUM_UNIT_LENGTH_STAY: int = 24
# Number of class labels generated when the stay length is binned.
NUMBER_OF_BINS: int = 2

In [824]:
# Columns holding fewer non-null values than this are dropped from both sheets.
MINIMUM_NON_NULL_VALUES: int = 5000

# --- Sheet 3: comorbidities and functional capacity, plus ages and labels ---
df: pd.DataFrame = pd.read_csv("data/folha3.csv")
# Ages are taken from the unfiltered sheet; the inner join further down keeps
# only the index labels that survive the filtering of both sheets.
ages: pd.Series = df["Age"].copy()
# Vectorised comparison: rows with a missing stay length compare False and are
# therefore dropped, exactly as with the element-wise version.
df = df[df["UnitLengthStay"] <= MAXIMUM_UNIT_LENGTH_STAY]
# Drop sparse columns first, then every row that still has a missing value.
df = df.dropna(axis=1, thresh=MINIMUM_NON_NULL_VALUES).dropna(axis=0)
labels: pd.Series = df["UnitLengthStay"].copy()
# Positional slice applied AFTER the sparse columns were removed.
# NOTE(review): presumably skips leading non-feature columns -- confirm the
# offset against the CSV layout.
df = df.iloc[:, 22:]
# Cells equal to the string "Verdadeiro" become 1, everything else becomes 0.
sheet_3_df: pd.DataFrame = (df == "Verdadeiro").astype(int)

# --- Sheet 4: reasons for ICU admission ---
df = pd.read_csv("data/folha4.csv")
df = df[df["UnitLengthStay"] <= MAXIMUM_UNIT_LENGTH_STAY]
# Here the positional slice happens BEFORE the sparse-column filter (the order
# differs from sheet 3 and is kept on purpose to select the same columns).
df = df.iloc[:, 21:]
df = df.dropna(axis=1, thresh=MINIMUM_NON_NULL_VALUES).dropna(axis=0)
sheet_4_df: pd.DataFrame = (df == "Verdadeiro").astype(int)

# --- Joining dataframes ---
# The inner join keeps only index labels present in ages, sheet 3 and sheet 4.
final_data: pd.DataFrame = pd.concat([ages, sheet_3_df, sheet_4_df], axis=1, join="inner")

# Sheet 4 is filtered independently of sheet 3, so the joined frame can hold
# fewer rows than `labels`. Select the labels by the joined index so that
# features and targets always have the same rows in the same order.
labels_regression: pd.Series = labels.loc[final_data.index].copy()
# include_lowest=True keeps a stay of exactly 0 in the first bin; with the
# default right-closed bins it would otherwise become NaN.
labels_classification: pd.Series = pd.cut(
    x=labels_regression,
    bins=[0, 2, MAXIMUM_UNIT_LENGTH_STAY],
    labels=np.arange(NUMBER_OF_BINS),
    include_lowest=True,
)

# REGRESSION

In [825]:
# Hold out a TEST_SIZE fraction of the rows for evaluating the regressors; the
# fixed random_state makes the partition identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    final_data, labels_regression, test_size=TEST_SIZE, random_state=RANDOM_STATE
)
# Value passed as n_estimators to the models fitted on this split. It is a
# plain Python int, so it is annotated as one.
NUMBER_OF_ESTIMATORS: int = 400

In [826]:
# Random-forest baseline for the regression target.
# The forest is seeded: without random_state every run draws different
# bootstrap samples, so the reported error would change between executions.
model = RandomForestRegressor(
    n_estimators=NUMBER_OF_ESTIMATORS, random_state=RANDOM_STATE
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Mean squared error on the held-out rows (displayed as the cell output).
mean_squared_error(y_test, y_pred)

14.977336987241147

In [827]:
# XGBoost regressor trained on the regression split; fit() returns the fitted
# estimator, so construction and training are chained.
model = xgb.XGBRegressor(n_estimators=NUMBER_OF_ESTIMATORS).fit(X_train, y_train)
y_pred = model.predict(X_test)
# Mean squared error on the held-out rows (displayed as the cell output).
mean_squared_error(y_true=y_test, y_pred=y_pred)

16.60385834710524

# CLASSIFICATION

In [828]:
# Same features as before, now paired with the binned class labels. The
# variables of the previous split are overwritten from here on.
X_train, X_test, y_train, y_test = train_test_split(
    final_data, labels_classification, test_size=TEST_SIZE, random_state=RANDOM_STATE
)
# Value passed as n_estimators to every classifier below. It is a plain Python
# int, so it is annotated as one.
NUMBER_OF_ESTIMATORS: int = 400

In [841]:
# Random-forest baseline for the binned stay-length classes.
# The forest is seeded: without random_state every run draws different
# bootstrap samples, so the per-class scores would change between executions.
model = RandomForestClassifier(
    n_estimators=NUMBER_OF_ESTIMATORS, random_state=RANDOM_STATE
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# average=None returns one F1 score per class instead of a single aggregate.
f1_score(y_test, y_pred, average=None)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    1.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    0.1s finished


array([0.60496614, 0.57282343])

In [840]:
# XGBoost classifier trained on the classification split; fit() returns the
# fitted estimator, so construction and training are chained.
model = xgb.XGBClassifier(n_estimators=NUMBER_OF_ESTIMATORS).fit(X_train, y_train)
y_pred = model.predict(X_test)
# average=None returns one F1 score per class instead of a single aggregate.
f1_score(y_true=y_test, y_pred=y_pred, average=None)

array([0.6286117 , 0.53731343])

In [838]:
# scikit-learn gradient boosting on the classification split.
# Seeded so that repeated runs yield the same trees and therefore the same
# per-class scores.
model = GradientBoostingClassifier(
    n_estimators=NUMBER_OF_ESTIMATORS, random_state=RANDOM_STATE
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# average=None returns one F1 score per class instead of a single aggregate.
f1_score(y_test, y_pred, average=None)

      Iter       Train Loss   Remaining Time 
         1           1.3664            2.01s
         2           1.3552            1.97s
         3           1.3458            1.96s
         4           1.3382            1.99s
         5           1.3315            1.90s
         6           1.3259            1.80s
         7           1.3205            1.73s
         8           1.3159            1.76s
         9           1.3114            1.75s
        10           1.3075            1.70s
        20           1.2793            1.49s
        30           1.2620            1.45s
        40           1.2504            1.48s
        50           1.2416            1.38s
        60           1.2322            1.30s
        70           1.2248            1.24s
        80           1.2176            1.18s
        90           1.2115            1.13s
       100           1.2055            1.07s
       200           1.1573            0.67s
       300           1.1229            0.33s
       40

array([0.66756575, 0.54139535])