In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import uniform, randint
from xgboost import XGBRegressor

In [2]:
# Función para extraer PEEP y EPOC del nombre del archivo
def extract_peep_epoc(filename):
    parts = filename.split('_')
    peep = int(parts[2].replace('cmH2O', ''))
    epoc = int(parts[3].replace('mL.csv', ''))
    return peep, epoc

In [3]:
# Definir y evaluar modelos de ML
def evaluate_model(model, param_grid, X_train, X_test, y_train, y_test):
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    
    mse = mean_squared_error(y_test, y_pred, multioutput='raw_values')
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred, multioutput='raw_values')
    r2 = r2_score(y_test, y_pred, multioutput='raw_values')
    
    print(f"Model: {model.__class__.__name__}")
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"R^2 Score: {r2}\n")

In [4]:
# Ruta de la carpeta con los archivos CSV
folder_path = r"C:\Users\ldani\Documents\Patronus\Project\simulated-obstructive-disease-respiratory-pressure-and-flow-1.0.0\PQ_ProcessedData"

# Leer todos los archivos CSV
data = []
labels = []
for file in os.listdir(folder_path):
    if file.endswith(".csv"):
        filepath = os.path.join(folder_path, file)
        df = pd.read_csv(filepath)
        
        # Normalizar las señales
        scaler = StandardScaler()
        df[['Pressure [cmH2O]', 'Flow [L/s]', 'V_tidal [L]']] = scaler.fit_transform(df[['Pressure [cmH2O]', 'Flow [L/s]', 'V_tidal [L]']])
        
        # Extraer características del archivo CSV
        peep, epoc = extract_peep_epoc(file)
        
        # Crear características adicionales
        mean_pressure = df['Pressure [cmH2O]'].mean()
        std_pressure = df['Pressure [cmH2O]'].std()
        mean_flow = df['Flow [L/s]'].mean()
        std_flow = df['Flow [L/s]'].std()
        mean_v_tidal = df['V_tidal [L]'].mean()
        std_v_tidal = df['V_tidal [L]'].std()
        
        features = [mean_pressure, std_pressure, mean_flow, std_flow, mean_v_tidal, std_v_tidal]
        
        data.append(features)
        labels.append([peep, epoc])

# Convertir a DataFrame
data = np.array(data)
labels = np.array(labels)


In [5]:
# Dividir los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

In [6]:
# RandomForestRegressor
rf_model = RandomForestRegressor(random_state=42)
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
evaluate_model(rf_model, rf_param_grid, X_train, X_test, y_train, y_test)

Fitting 3 folds for each of 81 candidates, totalling 243 fits
Model: RandomForestRegressor
Best Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
Mean Squared Error (MSE): [9.43790545e+00 1.28957276e+04]
Root Mean Squared Error (RMSE): [  3.07211742 113.55935724]
Mean Absolute Error (MAE): [ 2.38744778 88.27159023]
R^2 Score: [-0.01120416 -0.06151327]



In [7]:
# GradientBoostingRegressor
gbr_model = MultiOutputRegressor(GradientBoostingRegressor(random_state=42))
gbr_param_grid = {
    'estimator__n_estimators': [100, 200, 300],
    'estimator__max_depth': [3, 5, 7],
    'estimator__learning_rate': [0.01, 0.1, 0.2],
    'estimator__subsample': [0.7, 0.8, 1.0]
}
evaluate_model(gbr_model, gbr_param_grid, X_train, X_test, y_train, y_test)

Fitting 3 folds for each of 81 candidates, totalling 243 fits
Model: MultiOutputRegressor
Best Parameters: {'estimator__learning_rate': 0.01, 'estimator__max_depth': 3, 'estimator__n_estimators': 100, 'estimator__subsample': 0.7}
Mean Squared Error (MSE): [9.39605841e+00 1.25999546e+04]
Root Mean Squared Error (RMSE): [  3.06529907 112.24951919]
Mean Absolute Error (MAE): [ 2.37918568 88.10249902]
R^2 Score: [-0.00672054 -0.03716668]



In [8]:
# SVR
svr_model = MultiOutputRegressor(SVR())
svr_param_grid = {
    'estimator__kernel': ['linear', 'rbf'],
    'estimator__C': [0.1, 1, 10],
    'estimator__epsilon': [0.01, 0.1, 0.2]
}
evaluate_model(svr_model, svr_param_grid, X_train, X_test, y_train, y_test)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
Model: MultiOutputRegressor
Best Parameters: {'estimator__C': 10, 'estimator__epsilon': 0.01, 'estimator__kernel': 'linear'}
Mean Squared Error (MSE): [9.33333335e+00 1.68736260e+04]
Root Mean Squared Error (RMSE): [  3.05505047 129.89852199]
Mean Absolute Error (MAE): [ 2.33333397 87.49875086]
R^2 Score: [-1.27061428e-09 -3.88954424e-01]



In [9]:
# XGBRegressor with RandomizedSearchCV
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Normalización de datos
    ('model', MultiOutputRegressor(XGBRegressor(random_state=42)))
])

param_dist = {
    'model__estimator__n_estimators': randint(100, 1000),
    'model__estimator__max_depth': randint(3, 10),
    'model__estimator__learning_rate': uniform(0.01, 0.3),
    'model__estimator__subsample': uniform(0.7, 0.3),
    'model__estimator__colsample_bytree': uniform(0.7, 0.3)
}

random_search = RandomizedSearchCV(estimator=pipeline, param_distributions=param_dist, n_iter=100, cv=3, n_jobs=-1, verbose=2, random_state=42)
random_search.fit(X_train, y_train)

best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred, multioutput='raw_values')
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred, multioutput='raw_values')
r2 = r2_score(y_test, y_pred, multioutput='raw_values')

print(f"XGBRegressor with RandomizedSearchCV")
print(f"Best Parameters: {random_search.best_params_}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R^2 Score: {r2}")

Fitting 3 folds for each of 100 candidates, totalling 300 fits
XGBRegressor with RandomizedSearchCV
Best Parameters: {'model__estimator__colsample_bytree': 0.8834959481464842, 'model__estimator__learning_rate': 0.012119891565915222, 'model__estimator__max_depth': 3, 'model__estimator__n_estimators': 660, 'model__estimator__subsample': 0.8574323980775167}
Mean Squared Error (MSE): [1.16383325e+01 1.31991043e+04]
Root Mean Squared Error (RMSE): [  3.41150003 114.88735477]
Mean Absolute Error (MAE): [ 2.9487815  88.98177783]
R^2 Score: [-0.24696419 -0.08648575]
