<a href="https://colab.research.google.com/github/jeguns/EP7173/blob/main/Unidad%2003/03_Tratamiento_de_valores_perdidos_TrainingTesting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install fancyimpute &> /dev/null

In [None]:
import pandas as pd
import numpy as np
from fancyimpute import IterativeImputer
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Seed NumPy's global random generator so the notebook is repeatable.
np.random.seed(42)

# Toy dataset: three numeric features containing a few missing values (np.nan)
# and a binary 'Target' column.
toy_columns = {
    'Feature1': [1, 2, np.nan, 4, 5, np.nan, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
    'Feature2': [5, 3, 4, np.nan, 2, 1, 2, 3, np.nan, 4, 5, 2, 4, 2, 3, 1, 2, 3, 4],
    'Feature3': [10, 9, 8, 7, 6, np.nan, 4, np.nan, 2, 1, 3, 2, 1, 3, 2, 2, 3, 3, 2],
    'Target': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1],
}
data = pd.DataFrame(toy_columns)

# Display the full (small) table as the cell output.
data

In [None]:
# Hold out 20% of the rows as the final test set; the fixed random_state keeps the
# split repeatable.
train_data, test_data = train_test_split(data, random_state=45, test_size=0.2)

# Separate the predictors from the 'Target' label in each partition.
X_train, y_train = train_data.drop(columns='Target'), train_data['Target']
X_test, y_test = test_data.drop(columns='Target'), test_data['Target']

In [None]:
# Sanity check: shapes of the predictor/label pairs for the training and test partitions.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

In [None]:
# Outer cross-validation scheme: 5 shuffled folds with a fixed seed.
cv = KFold(n_splits=5, shuffle=True, random_state=1)

# Random Forest hyperparameter values explored by the grid search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
)

In [None]:
# Cross-validation over the training set. In every fold the imputer is fitted on the
# fold's training part only and then applied to both parts, so the validation rows never
# influence the imputation model.
for fold, (train_index, val_index) in enumerate(cv.split(X_train), start=1):
    print(f"\nFold {fold}")

    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Fit the imputer on the training part of the fold only.
    imp_mice = IterativeImputer(random_state=42, max_iter=10, tol=1e-09)
    imp_mice.fit(X_train_fold)

    # Wrap the imputed values in DataFrames that keep the column names AND the original
    # row labels, so each imputed row stays aligned with its entry in y_train_fold /
    # y_val_fold and can be traced back to `data`.
    X_train_fold_imputed = pd.DataFrame(
        imp_mice.transform(X_train_fold),
        columns=X_train.columns,
        index=X_train_fold.index,
    )  # applied to the training part
    X_val_fold_imputed = pd.DataFrame(
        imp_mice.transform(X_val_fold),
        columns=X_train.columns,
        index=X_val_fold.index,
    )  # applied to the validation part

    print("\nConjunto de entrenamiento imputado para el pliegue actual:")
    print(X_train_fold_imputed)

    print("\nConjunto de validación imputado para el pliegue actual:")
    print(X_val_fold_imputed)

    # Hyperparameter tuning with a grid search on the imputed training part.
    # NOTE(review): the inner cv=3 splits operate on data that was already imputed using
    # the whole training part of this fold, so the inner validation splits are not
    # isolated from the imputer; putting imputer and model in a Pipeline would fix that.
    rf = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_train_fold_imputed, y_train_fold)

    # Evaluate the best Random Forest found by the search on the imputed validation part.
    best_rf = grid_search.best_estimator_
    y_val_pred = best_rf.predict(X_val_fold_imputed)
    accuracy = accuracy_score(y_val_fold, y_val_pred)
    print(f"Accuracy en el conjunto de validación para el Fold {fold}: {accuracy}")
    print(f"Mejores hiperparámetros para el Fold {fold}: {grid_search.best_params_}")

In [None]:
# Imputer for the final model, fitted on the training set only (never on the test set).
# tol is set to the same value used by the imputer inside the cross-validation loop, so
# the final preprocessing is the one that was actually validated.
imp_mice_final = IterativeImputer(random_state=42, max_iter=10, tol=1e-09)
imp_mice_final.fit(X_train)

In [None]:
# Apply the training-fitted imputer to both partitions. The original row labels are kept
# (index=...) so the imputed frames stay aligned with y_train / y_test and with `data`.
X_train_final_imputed = pd.DataFrame(
    imp_mice_final.transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test_imputed = pd.DataFrame(
    imp_mice_final.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [None]:
# Select the final hyperparameters with a grid search over the whole imputed training
# set, instead of reusing `grid_search` from the cross-validation loop: that variable
# only holds the search of whichever fold happened to run last, so its best_params_
# reflect a single arbitrary fold.
final_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    scoring='accuracy',
)
final_search.fit(X_train_final_imputed, y_train)

# With the default refit=True, best_estimator_ is already refitted on all of
# X_train_final_imputed using the selected hyperparameters.
best_rf_final = final_search.best_estimator_
best_rf_final

In [None]:
# Score the final model on the held-out test set (imputed with the training-fitted imputer).
y_test_pred = best_rf_final.predict(X_test_imputed)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print("\nAccuracy en el conjunto de prueba final: ", test_accuracy)

In [None]:
# Show the true test labels next to the model's predictions for a row-by-row comparison.
(y_test, y_test_pred)