In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier

# Cargar datos de entrenamiento
df = pd.read_csv('/content/dataset_fc.csv')

# Convertir la columna objetivo a datos numéricos
label_encoder = LabelEncoder()
df['dx_holter_final'] = label_encoder.fit_transform(df['dx_holter_final'])  # 0 para 'arritmia', 1 para 'normal'

X = df.drop(columns=['dx_holter_final']).values
y = df['dx_holter_final'].values

# Imputar valores faltantes
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Balancear los datos antes de dividir
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)

# Dividir los datos en entrenamiento y prueba (80% entrenamiento, 20% prueba)
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42, stratify=y_balanced)

# Normalizar los datos
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Definir el modelo Decision Tree
dt = DecisionTreeClassifier(random_state=42)

# Definir los hiperparámetros a probar
param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}

# Configurar GridSearchCV
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=StratifiedKFold(n_splits=5), verbose=2, n_jobs=-1, scoring='f1')

# Ajustar GridSearchCV a los datos de entrenamiento
grid_search.fit(X_train, y_train)

# Imprimir los mejores hiperparámetros encontrados
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best score: {grid_search.best_score_:.4f}')

# Evaluar el modelo final con los mejores hiperparámetros
best_dt = grid_search.best_estimator_
y_pred = best_dt.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Final Model Accuracy: {accuracy:.4f}')
print(f'Final Model Recall: {recall:.4f}')
print(f'Final Model F1 Score: {f1:.4f}')
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Calcular la matriz de confusión
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

# Calcular especificidad
specificity = tn / (tn + fp)
print(f'Final Model Specificity: {specificity:.4f}')

# ------------------------------
# Validar el conjunto de datos adicional
# ------------------------------

print ("----------------------------------------------------")

# Cargar datos de validación
df_val = pd.read_csv('/content/validation.csv')  # Ajusta la ruta según el nombre de tu archivo

# Convertir la columna objetivo a datos numéricos
df_val['dx_holter_final'] = label_encoder.transform(df_val['dx_holter_final'])  # Usar transform para mantener las mismas etiquetas

X_val = df_val.drop(columns=['dx_holter_final']).values
y_val = df_val['dx_holter_final'].values

# Imputar valores faltantes en el conjunto de validación
X_val = imputer.transform(X_val)

# Normalizar los datos de validación
X_val = scaler.transform(X_val)

# Predecir con el modelo entrenado
y_val_pred = best_dt.predict(X_val)

# Calcular métricas de rendimiento para el conjunto de validación
val_accuracy = accuracy_score(y_val, y_val_pred)
val_recall = recall_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)

print(f'Validation Accuracy: {val_accuracy:.4f}')
print(f'Validation Recall: {val_recall:.4f}')
print(f'Validation F1 Score: {val_f1:.4f}')
print(classification_report(y_val, y_val_pred, target_names=label_encoder.classes_))

# Calcular la matriz de confusión para el conjunto de validación
cm_val = confusion_matrix(y_val, y_val_pred)

#print(cm_val)
tn_val, fp_val, fn_val, tp_val = cm_val.ravel()

# Calcular especificidad para el conjunto de validación
val_specificity = tn_val / (tn_val + fp_val)
print(f'Validation Specificity: {val_specificity:.4f}')


Fitting 5 folds for each of 540 candidates, totalling 2700 fits
Best parameters: {'criterion': 'gini', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'random'}
Best score: 0.7925
Final Model Accuracy: 0.9324
Final Model Recall: 0.9189
Final Model F1 Score: 0.9315
              precision    recall  f1-score   support

    ARRITMIA       0.92      0.95      0.93        37
      NORMAL       0.94      0.92      0.93        37

    accuracy                           0.93        74
   macro avg       0.93      0.93      0.93        74
weighted avg       0.93      0.93      0.93        74

Final Model Specificity: 0.9459
----------------------------------------------------
Validation Accuracy: 0.7083
Validation Recall: 0.9167
Validation F1 Score: 0.7586
              precision    recall  f1-score   support

    ARRITMIA       0.86      0.50      0.63        12
      NORMAL       0.65      0.92      0.76        12

    accuracy             