In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

# Load the dataset.
df = pd.read_csv('/content/dataset_fc.csv')

# Encode the target column as integers. LabelEncoder sorts the class names,
# so the encoded value of each class is its index in label_encoder.classes_
# (the original author's note: 0 for 'arritmia', 1 for 'normal').
label_encoder = LabelEncoder()
df['dx_holter_final'] = label_encoder.fit_transform(df['dx_holter_final'])

X = df.drop(columns=['dx_holter_final']).values
y = df['dx_holter_final'].values

# Split BEFORE any fitted preprocessing (80% train, 20% test). Imputing or
# oversampling the full dataset first leaks test information into training:
# the test set would contain SMOTE-synthesised rows, and training rows would
# be interpolated from samples that end up in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Impute missing values using statistics learned from the training set only.
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Balance the classes on the training set only; the test set keeps the
# original (real) samples and class distribution.
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X_train, y_train)

# Standardise features; the scaler is fit on the (balanced) training data and
# then applied unchanged to the test data.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_balanced)
y_train = y_balanced
X_test = scaler.transform(X_test)

# Base Random Forest model.
rf = RandomForestClassifier(random_state=42)

# Hyperparameter grid to search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Grid search with 5-fold stratified cross-validation, optimising F1.
# NOTE(review): the CV folds are drawn from the already-oversampled training
# data, so validation folds still contain synthetic rows and best_score_ is
# optimistic. Wrapping the steps in imblearn.pipeline.Pipeline would resample
# inside each fold; left as-is here so best_rf stays a plain
# RandomForestClassifier.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=StratifiedKFold(n_splits=5),
    verbose=2,
    n_jobs=-1,
    scoring='f1',
)

# Fit the grid search on the training data.
grid_search.fit(X_train, y_train)

# Report the best hyperparameters found.
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best score: {grid_search.best_score_:.4f}')

# Evaluate the refitted best model on the untouched test set.
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

# NOTE(review): recall_score / f1_score (and scoring='f1' above) use the
# default pos_label=1. With the label order shown in the classification
# report, label 1 appears to be the 'normal' class, so "recall/sensitivity"
# here is measured on that class — confirm this is the intended positive class.
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Final Model Accuracy: {accuracy:.4f}')
print(f'Final Model Recall: {recall:.4f}')
print(f'Final Model F1 Score: {f1:.4f}')
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

from sklearn.metrics import confusion_matrix

# Confusion matrix with the label order pinned so the 2x2 unpacking below is
# always valid (rows = true class, columns = predicted class).
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

# Sensitivity is the recall already computed above (same predictions).
sensitivity = recall
print(f'Sensibilidad (Recall): {sensitivity:.4f}')

# Specificity: fraction of label-0 samples predicted as label 0.
specificity = tn / (tn + fp)
print(f'Especificidad: {specificity:.4f}')


Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best parameters: {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best score: 0.8515
Final Model Accuracy: 0.8919
Final Model Recall: 0.8378
Final Model F1 Score: 0.8857
              precision    recall  f1-score   support

    ARRITMIA       0.85      0.95      0.90        37
      NORMAL       0.94      0.84      0.89        37

    accuracy                           0.89        74
   macro avg       0.90      0.89      0.89        74
weighted avg       0.90      0.89      0.89        74

Sensibilidad (Recall): 0.8378
Especificidad: 0.9459
