In [24]:
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from autogluon.tabular import TabularPredictor
from sklearn.metrics import mean_squared_error

In [2]:
# Cargar los datos completos
file_path = r'C:\Users\ldani\Documents\Patronus\Project\simulated-obstructive-disease-respiratory-pressure-and-flow-1.0.0\merged_dataset_balanced.csv'
df = pd.read_csv(file_path, sep=';')

  df = pd.read_csv(file_path, sep=';')


Análisis EDA del conjunto de datos

In [3]:
# Realizar análisis de valores faltantes
print("Valores faltantes por columna:")
print(df.isnull().sum())

Valores faltantes por columna:
Subject Number                       0
Age                                  0
Gender                               0
Height [cm]                          0
Weight [kg]                          0
PEEP                                 0
COPD                                 0
Time [s]                             0
Pressure [cmH2O]                     0
Flow [L/s]                           0
V_tidal [L]                          0
History of Smoking (yes/no)          0
Smoking Frequency               766080
History of vaping (yes/no)           0
Frequency of vaping             685440
Asthma (yes/no and severity)         0
dtype: int64


In [4]:
# Verificar tipos de datos
print(df.dtypes)

Subject Number                    int64
Age                               int64
Gender                           object
Height [cm]                      object
Weight [kg]                       int64
PEEP                              int64
COPD                              int64
Time [s]                        float64
Pressure [cmH2O]                float64
Flow [L/s]                      float64
V_tidal [L]                     float64
History of Smoking (yes/no)      object
Smoking Frequency                object
History of vaping (yes/no)       object
Frequency of vaping              object
Asthma (yes/no and severity)     object
dtype: object


In [5]:
# Función para convertir rangos a su valor medio
def convert_range_to_mean(value):
    if isinstance(value, str) and '-' in value:
        start, end = value.split('-')
        return (float(start) + float(end)) / 2
    return value

# Aplicar la función a las columnas numéricas
num_cols = ['Height [cm]']
df[num_cols] = df[num_cols].applymap(convert_range_to_mean)

# Verificar tipos de datos
print(df.dtypes)

  df[num_cols] = df[num_cols].applymap(convert_range_to_mean)


Subject Number                    int64
Age                               int64
Gender                           object
Height [cm]                     float64
Weight [kg]                       int64
PEEP                              int64
COPD                              int64
Time [s]                        float64
Pressure [cmH2O]                float64
Flow [L/s]                      float64
V_tidal [L]                     float64
History of Smoking (yes/no)      object
Smoking Frequency                object
History of vaping (yes/no)       object
Frequency of vaping              object
Asthma (yes/no and severity)     object
dtype: object


In [6]:
# Rellenar los espacios en blanco en las columnas especificadas
df['Smoking Frequency'] = df['Smoking Frequency'].fillna('Never')
df['Frequency of vaping'] = df['Frequency of vaping'].fillna('Never')

print(df.head())

# Realizar análisis de valores faltantes
print("Valores faltantes por columna:")
print(df.isnull().sum())

   Subject Number  Age  Gender  Height [cm]  Weight [kg]  PEEP  COPD  \
0               1   24  Female        164.5           65     0     0   
1               1   24  Female        164.5           65     0     0   
2               1   24  Female        164.5           65     0     0   
3               1   24  Female        164.5           65     0     0   
4               1   24  Female        164.5           65     0     0   

   Time [s]  Pressure [cmH2O]  Flow [L/s]  V_tidal [L]  \
0      0.00          2.681998   -0.577100     0.000000   
1      0.01          2.617630   -0.597392    -0.005872   
2      0.02          2.681998   -0.597392    -0.011846   
3      0.03          2.617630   -0.626598    -0.017966   
4      0.04          2.649814   -0.626598    -0.024232   

  History of Smoking (yes/no) Smoking Frequency History of vaping (yes/no)  \
0                          No             Never                         No   
1                          No             Never               

In [7]:
# Asegurarse de que las columnas no tengan espacios adicionales
df.columns = df.columns.str.strip()

# Crear una columna binaria para indicar si hay EPOC o no
df['has_EPOC'] = df['COPD'].apply(lambda x: 1 if x > 0 else 0)

In [8]:
# Convertir columnas categóricas a numéricas usando OneHotEncoder
categorical_features = ['Gender', 'History of Smoking (yes/no)', 'History of vaping (yes/no)', 'Asthma (yes/no and severity)']
numerical_features = ['Age', 'Height [cm]', 'Weight [kg]', 'PEEP', 'Time [s]', 'Pressure [cmH2O]', 'Flow [L/s]', 'V_tidal [L]']

In [9]:
# Crear el transformador de columnas
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

In [10]:
# Preprocesar las características
X = df.drop(columns=['Subject Number', 'COPD', 'has_EPOC'])
y_classification = df['has_EPOC']
y_regression = df['COPD']

In [11]:
# Aplicar el preprocesamiento
X_preprocessed = preprocessor.fit_transform(X)

In [12]:
# Convertir el resultado preprocesado a un DataFrame
feature_names = preprocessor.get_feature_names_out()
X_preprocessed_df = pd.DataFrame(X_preprocessed, columns=feature_names)

In [13]:
# Añadir las columnas de 'Subject Number' y 'has_EPOC' al DataFrame preprocesado
X_preprocessed_df['Subject Number'] = df['Subject Number'].values
X_preprocessed_df['has_EPOC'] = y_classification.values
X_preprocessed_df['COPD'] = y_regression.values

In [14]:
# Dividir los datos en entrenamiento y prueba con estratificación por sujeto
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# Obtener los índices para el conjunto de entrenamiento y prueba
train_idx, test_idx = next(gss.split(X_preprocessed_df, groups=X_preprocessed_df['Subject Number']))

train_data = X_preprocessed_df.iloc[train_idx]
test_data = X_preprocessed_df.iloc[test_idx]

In [15]:
# Guardar los datos en archivos CSV temporales para que AutoGluon pueda cargarlos
train_data.to_csv("train_data_classification.csv", index=False)
test_data.to_csv("test_data_classification.csv", index=False)

Modelo de ML se dividirá en 2 secciones:
1. Modelo de Clasificación (EPOC: 0 & EPOC:1) siendo si el sujeto presenta o no EPOC
2. Modelo de Regresión, nivel de EPOC presente en el sujeto

In [16]:
# Entrenar el modelo de clasificación con AutoGluon
predictor_classification = TabularPredictor(label='has_EPOC', problem_type='binary').fit(
    train_data="train_data_classification.csv",
    time_limit=3600  # Puedes ajustar el límite de tiempo según sea necesario
)

No path specified. Models will be saved in: "AutogluonModels\ag-20240624_013432"
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Loaded data from: train_data_classification.csv | Columns = 22 / 22 | Rows = 645120 -> 645120
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "AutogluonModels\ag-20240624_013432"
AutoGluon Version:  1.1.0
Python Version:     3.11.5
Operating System:   Windows
Platform Machine:   AMD64
Platform Ver

In [17]:
# Predecir en el conjunto de prueba
test_data_classification = pd.read_csv('test_data_classification.csv')
classification_predictions = predictor_classification.predict(test_data_classification)
print("Predictions on test set (classification):\n", classification_predictions)

# Evaluar el modelo de clasificación
classification_performance = predictor_classification.evaluate(test_data_classification)
print("Performance on test set (classification):\n", classification_performance)

Predictions on test set (classification):
 0         0
1         0
2         0
3         0
4         0
         ..
161275    1
161276    1
161277    1
161278    1
161279    1
Name: has_EPOC, Length: 161280, dtype: int64
Performance on test set (classification):
 {'accuracy': 0.9961991567460318, 'balanced_accuracy': 0.9923983134920635, 'mcc': 0.9898641563428314, 'roc_auc': 1.0, 'f1': 0.9974725088956967, 'precision': 0.9949577620030764, 'recall': 1.0}


In [18]:
# Añadir las predicciones al conjunto de prueba para filtrar los que tienen EPOC
test_data_classification['has_EPOC'] = classification_predictions

# Filtrar los datos con EPOC para la regresión
train_data_with_EPOC = train_data[train_data['has_EPOC'] == 1]
test_data_with_EPOC = test_data_classification[test_data_classification['has_EPOC'] == 1]

# Guardar los conjuntos con EPOC en archivos CSV
train_data_with_EPOC.to_csv('train_data_regression.csv', index=False)
test_data_with_EPOC.to_csv('test_data_regression.csv', index=False)

In [26]:
#Definimos los mejores hiperparámetros y modelos de la prueba de Autogluon en entregables anteriores para regresión
hyperparameters = {
    'GBM': ['GBMLarge'],
    'KNN': [{'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}]
}

# Configurar el ensamble con los pesos específicos
ensemble_kwargs = {
    'weights': {'GBMLarge': 0.786, 'KNeighborsDist': 0.214}
}

In [22]:
#Cargamos el modelo de regresión para volver a entrenar
path_predictor_regression = rf"C:\Users\ldani\Documents\Patronus\Project\AutogluonModels\ag-20240612_190315"

In [27]:
# Entrenar el modelo de regresión con AutoGluon
predictor_regression = TabularPredictor(label='COPD', problem_type='regression',path=path_predictor_regression).fit(
    train_data='train_data_regression.csv',
    presets='best_quality', # Presets para asegurar la mejor calidad de modelos
    hyperparameters=hyperparameters,
    time_limit=3600  # Puedes ajustar el límite de tiempo según sea necesario
)

Presets specified: ['best_quality']
Loaded data from: train_data_regression.csv | Columns = 22 / 22 | Rows = 483840 -> 483840
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 3600 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: C:\Users\ldani\Documents\Patronus\Project\AutogluonModels\ag-20240612_190315\ds_sub_fit\sub_fit_ho.
Running the sub-fit in a ra

In [None]:
# Predecir en el conjunto de prueba
test_data_regression = pd.read_csv('test_data_regression.csv')
regression_predictions = predictor_regression.predict(test_data_regression)

# Evaluar el modelo de regresión
rmse = mean_squared_error(test_data_regression['COPD'], regression_predictions, squared=False)
print("RMSE on test set (regression):", rmse)