In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [2]:
# Read the training CSV (its file name indicates a class-balanced training set).
datos = pd.read_csv("dataset2_covid_train_balanceado_final.csv")

# Seven-predictor subset, with the `corona_result` target kept as the last column.
datos_mejores_7 = datos.loc[
    :,
    ['cough', 'fever', 'sore_throat', 'shortness_of_breath', 'head_ache', 'gender', 'test_indication', 'corona_result'],
]

# Five-predictor subset: the seven above without `cough` and `gender`, target last.
datos_mejores_5 = datos.loc[
    :,
    ['fever', 'sore_throat', 'shortness_of_breath', 'head_ache', 'test_indication', 'corona_result'],
]

# Last expression of the cell, so this frame is what gets rendered as output.
datos_mejores_5

Unnamed: 0,fever,sore_throat,shortness_of_breath,head_ache,test_indication,corona_result
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,1,0,0,0,0,0
4,0,0,0,0,0,0
...,...,...,...,...,...,...
126869,1,0,0,0,0,1
126870,1,1,1,0,0,1
126871,0,0,0,0,0,1
126872,0,0,0,0,0,1


In [4]:
# Target and predictors for the 5-feature subset (`datos_mejores_5`).
y = datos_mejores_5["corona_result"]
X = datos_mejores_5.drop(columns=["corona_result"])

# Hold-out split left disabled: no train/test partition is created in this cell.
# X_train_mejores_7, X_test_mejores_7, y_train, y_test = train_test_split(
#     X, y, test_size=0.30, random_state=42
# )

In [6]:
# Base learner for boosting: a decision tree limited to depth 11, with a fixed seed.
arbol_base = DecisionTreeClassifier(random_state=42, max_depth=11)

# AdaBoost ensemble built on that tree: n_estimators=99, learning_rate=0.022, fixed seed.
# NOTE(review): these hyperparameter values are hard-coded; where they come from is not
# shown in this notebook section.
clf_5_caracteristicas = AdaBoostClassifier(
    estimator=arbol_base,
    random_state=42,
    learning_rate=0.022,
    n_estimators=99,
)

In [7]:
# Fit the boosted ensemble on the full 5-feature predictors `X` and target `y`.
clf_5_caracteristicas.fit(X=X, y=y)

In [20]:
# Load the separate evaluation set from its own CSV file.
datos_validacion = pd.read_csv('dataset2_covid_test_final.csv')

# Target vector for evaluation.
y_test = datos_validacion["corona_result"]

# Predictors: everything except the target and the "Unnamed: 0" column
# (presumably a row index saved into the CSV; confirm against the export step).
X_test = datos_validacion.drop(columns=["corona_result", "Unnamed: 0"])

# Only the last expression of a cell is rendered, so the frame to inspect goes last.
# (A bare `datos_validacion` in the middle of the cell displayed nothing and was removed.)
X_test

Unnamed: 0,cough,fever,sore_throat,shortness_of_breath,head_ache,age_60_and_above,gender,test_indication
0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
29692,0,0,0,0,0,0,0,0
29693,0,0,0,0,0,0,1,0
29694,0,0,0,0,0,0,1,0
29695,1,0,0,0,0,1,1,0


In [24]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)

# Narrow the evaluation predictors to the five columns used for training,
# listed in the same order as in the training frame.
X_test = X_test.loc[
    :,
    ['fever', 'sore_throat', 'shortness_of_breath', 'head_ache', 'test_indication'],
]


In [26]:

# Evaluate the fitted 5-feature classifier on (X_test, y_test) and print a metrics report.

# Hard class predictions using the classifier's default decision rule.
y_pred = clf_5_caracteristicas.predict(X_test)

# Predicted probabilities are needed for the standard ROC AUC.
# Column index 1 is the second class of the classifier, i.e. class 1 (positive)
# for 0/1 labels.
y_prob = clf_5_caracteristicas.predict_proba(X_test)[:, 1]

# Confusion matrix. The label order is pinned so the matrix is always 2x2 and
# unpacks into four counts even when one class is missing from y_test / y_pred.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()

acc = accuracy_score(y_test, y_pred)

prec = precision_score(y_test, y_pred)

sens = recall_score(y_test, y_pred)

# Specificity = TN / (TN + FP). Report 0.0 instead of NaN when there are no negatives.
spec = tn / (tn + fp) if (tn + fp) > 0 else 0.0

f1 = f1_score(y_test, y_pred)

# The "paper" AUC is the mean of sensitivity and specificity at the default operating
# point; the standard ROC AUC is computed from the predicted probabilities.
auc_paper_formula = (sens + spec) / 2
auc_estandar = roc_auc_score(y_test, y_prob)

print("="*60)
print(" REPORTE DE MÉTRICAS")
print("="*60)
print(f"Accuracy (Exactitud):   {acc*100:.2f}%")
print(f"Precision:              {prec*100:.2f}%")
print(f"Sensitivity (Recall):   {sens*100:.2f}%")
print(f"Specificity:            {spec*100:.2f}%")
print(f"F1-Score:               {f1*100:.2f}%")
print("-"*60)
print(f"AUC (Fórmula Paper Ec.9): {auc_paper_formula*100:.2f}%")
print(f"AUC (ROC Standard):       {auc_estandar*100:.2f}%")
print("="*60)
print("\nMatriz de Confusión:")
print(f"TN: {tn} | FP: {fp}")
print(f"FN: {fn} | TP: {tp}")



 REPORTE DE MÉTRICAS)
Accuracy (Exactitud):   92.82%
Precision:              55.19%
Sensitivity (Recall):   80.04%
Specificity:            94.00%
F1-Score:               65.33%
------------------------------------------------------------
AUC (Fórmula Paper Ec.9): 87.02%
AUC (ROC Standard):       88.29%

Matriz de Confusión:
TN: 25556 | FP: 1631
FN: 501 | TP: 2009


In [None]:
import numpy as np
from sklearn.metrics import f1_score

# Sweep the decision threshold applied to the positive-class probability and keep the
# threshold with the highest F1-score (ties keep the lowest threshold).
#
# NOTE(review): the threshold is selected on (X_test, y_test), the same data used for
# the reported metrics, so the best F1 found here is optimistically biased. Prefer
# choosing the threshold on a validation split of the training data and scoring the
# test set only once with the chosen value.

# Positive-class probabilities (column index 1 of predict_proba).
y_prob = clf_5_caracteristicas.predict_proba(X_test)[:, 1]

# Defaults used if no threshold achieves an F1 above zero.
best_thresh = 0.5
best_f1 = 0

print("Buscando el mejor umbral...")
# Grid 0.50, 0.55, ..., 0.90 built from integers: stepping by 0.05 directly in floating
# point accumulates rounding error (e.g. 0.8000000000000003 instead of 0.8).
for thresh in np.arange(50, 95, 5) / 100:
    y_pred_temp = (y_prob >= thresh).astype(int)
    # Separate name so the notebook-level `f1` from the metrics report is not overwritten.
    # zero_division=0 returns 0 without a warning where the score is undefined
    # (e.g. no positive predictions at a high threshold).
    f1_umbral = f1_score(y_test, y_pred_temp, zero_division=0)
    print(f"Umbral {thresh:.2f} -> F1-Score: {f1_umbral:.4f}")

    if f1_umbral > best_f1:
        best_f1 = f1_umbral
        best_thresh = float(thresh)

print(f"\n Mejor Umbral encontrado: {best_thresh:.2f}")

Buscando el mejor umbral...
Umbral 0.50 -> F1-Score: 0.6533
Umbral 0.55 -> F1-Score: 0.6533
Umbral 0.60 -> F1-Score: 0.6533
Umbral 0.65 -> F1-Score: 0.6533
Umbral 0.70 -> F1-Score: 0.6533
Umbral 0.75 -> F1-Score: 0.6533
Umbral 0.80 -> F1-Score: 0.7089
Umbral 0.85 -> F1-Score: 0.7089
Umbral 0.90 -> F1-Score: 0.0000

✅ Mejor Umbral encontrado: 0.8000000000000003
