In [1]:
# Módulos básicos para análisis y manipulación de datos
import numpy as np
import pandas as pd

# Modelos de clasificación
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB

# Módulos para evaluación de modelos
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

# Módulos para el balanceo de datos
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from imblearn.over_sampling import RandomOverSampler, ADASYN

In [2]:
from utils.paths import DATA_RAW_DIR
from pathlib import Path

In [3]:
path_data = DATA_RAW_DIR / 'post_pabellon.xlsx'
print(Path(path_data).exists())

True


1. 

In [4]:
datos = None      # Variable que debe modificar
y = None          # Variable que debe modificar
X = None          # Variable que debe modificar

# your code here

# cargar dataframe
datos = pd.read_excel(path_data, sheet_name='Datos')

# separar variable objetivo
X = datos.drop(columns=['HOSPITALIZACION'], axis=1)
y = datos['HOSPITALIZACION']

# crear variables dummies
X = pd.get_dummies(X, drop_first=True)


2. 

In [5]:
# dividir en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123, stratify=y)

3. 

In [6]:
SVC_model = SVC(C=15, kernel='poly', degree=2)
RF_model = RandomForestClassifier(
    n_estimators=50, max_depth=10, random_state=123, max_samples=0.8, max_features='log2')
NB_model = BernoulliNB(class_prior=[0.5, 0.5])

# your code here

# stratified k fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

# funcion de cross vaalidation


def cv(X, y):

    recall_svc = cross_val_score(SVC_model, X, y, cv=skf, scoring='recall')
    recall_rf = cross_val_score(RF_model, X, y, cv=skf, scoring='recall')
    recall_gnb = cross_val_score(NB_model, X, y, cv=skf, scoring='recall')

    precision_svc = cross_val_score(SVC_model, X, y, cv=skf, scoring=metrics.make_scorer(metrics.precision_score, zero_division=0))

    precision_rf = cross_val_score(RF_model, X, y, cv=skf, scoring=metrics.make_scorer(metrics.precision_score, zero_division=0))
    precision_gnb = cross_val_score(NB_model, X, y, cv=skf, scoring=metrics.make_scorer(metrics.precision_score, zero_division=0))

    accuracy_svc = cross_val_score(SVC_model, X, y, cv=skf, scoring='accuracy')
    accuracy_rf = cross_val_score(RF_model, X, y, cv=skf, scoring='accuracy')
    accuracy_gnb = cross_val_score(NB_model, X, y, cv=skf, scoring='accuracy')

    f1_svc = cross_val_score(SVC_model, X, y, cv=skf, scoring='f1')
    f1_rf = cross_val_score(RF_model, X, y, cv=skf, scoring='f1')
    f1_gnb = cross_val_score(NB_model, X, y, cv=skf, scoring='f1')

    auc_svc = cross_val_score(SVC_model, X, y, cv=skf, scoring='roc_auc')
    auc_rf = cross_val_score(RF_model, X, y, cv=skf, scoring='roc_auc')
    auc_gnb = cross_val_score(NB_model, X, y, cv=skf, scoring='roc_auc')

    cv_results = pd.DataFrame(
        index=['SV', 'RF', 'NB'],
        columns=['Accuracy', 'Precision', 'Recall', 'F1', 'AUC'])

    cv_results.loc['SV'] = [
        accuracy_svc.mean(),
        precision_svc.mean(),
        recall_svc.mean(),
        f1_svc.mean(),
        auc_svc.mean()
    ]
    cv_results.loc['RF'] = [
        accuracy_rf.mean(),
        precision_rf.mean(),
        recall_rf.mean(),
        f1_rf.mean(),
        auc_rf.mean()
    ]
    cv_results.loc['NB'] = [
        accuracy_gnb.mean(),
        precision_gnb.mean(),
        recall_gnb.mean(),
        f1_gnb.mean(),
        auc_gnb.mean()
    ]

    return cv_results

In [7]:
cv_results = cv(X_train, y_train)
cv_results

Unnamed: 0,Accuracy,Precision,Recall,F1,AUC
SV,0.957215,0.0,0.0,0.0,0.735307
RF,0.959747,0.3,0.133333,0.18,0.855702
NB,0.838829,0.051496,0.183333,0.078941,0.619408


5. 

In [8]:
def test_set(X,y):

    SVC_model = SVC(C=15, kernel='poly', degree=2)
    RF_model = RandomForestClassifier(n_estimators=50, max_depth=10, random_state=123, max_samples=0.8, max_features='log2')
    NB_model = BernoulliNB(class_prior=[0.5, 0.5])

    SVC_model.fit(X, y)
    RF_model.fit(X, y)
    NB_model.fit(X, y)

    y_pred_svc = SVC_model.predict(X_test)
    y_pred_rf = RF_model.predict(X_test)
    y_pred_gnb = NB_model.predict(X_test)

    accuracy_svc = metrics.accuracy_score(y_test, y_pred_svc)
    accuracy_rf = metrics.accuracy_score(y_test, y_pred_rf)
    accuracy_gnb = metrics.accuracy_score(y_test, y_pred_gnb)

    precision_svc = metrics.precision_score(y_test, y_pred_svc, zero_division=0)
    precision_rf = metrics.precision_score(y_test, y_pred_rf, zero_division=0)
    precision_gnb = metrics.precision_score(y_test, y_pred_gnb, zero_division=0)

    recall_svc = metrics.recall_score(y_test, y_pred_svc,zero_division=0)
    recall_rf = metrics.recall_score(y_test, y_pred_rf, zero_division=0)
    recall_gnb = metrics.recall_score(y_test, y_pred_gnb, zero_division=0)

    f1_svc = metrics.f1_score(y_test, y_pred_svc, zero_division=0)
    f1_rf = metrics.f1_score(y_test, y_pred_rf, zero_division=0)
    f1_gnb = metrics.f1_score(y_test, y_pred_gnb, zero_division=0)
    
    auc_svc = metrics.roc_auc_score(y_test, y_pred_svc)
    auc_rf = metrics.roc_auc_score(y_test, y_pred_rf)
    auc_gnb = metrics.roc_auc_score(y_test, y_pred_gnb)
    
    test_results = pd.DataFrame(index=['SV', 'RF', 'NB'], columns=['Accuracy', 'Precision', 'Recall', 'F1', 'AUC'])
    test_results.loc['SV'] = [accuracy_svc, precision_svc, recall_svc, f1_svc, auc_svc]
    test_results.loc['RF'] = [accuracy_rf, precision_rf, recall_rf, f1_rf, auc_rf]
    test_results.loc['NB'] = [accuracy_gnb, precision_gnb, recall_gnb, f1_gnb, auc_gnb]

    return test_results
    
test_results = test_set(X_train, y_train)
test_results

Unnamed: 0,Accuracy,Precision,Recall,F1,AUC
SV,0.959064,0.0,0.0,0.0,0.5
RF,0.959064,0.0,0.0,0.0,0.5
NB,0.853801,0.090909,0.285714,0.137931,0.581882


6. 

In [9]:
# over_sampling
oversample = RandomOverSampler(sampling_strategy='minority', random_state=123)

X_train_ros, y_train_ros = oversample.fit_resample(X_train, y_train)

cv_results_balanced = cv(X_train_ros, y_train_ros)
print(cv_results_balanced)

test_results_balanced = test_set(X_train_ros, y_train_ros)
print(test_results_balanced)

    Accuracy Precision    Recall        F1       AUC
SV  0.651316  0.589468       1.0  0.741624  0.691118
RF  0.988158  0.976954       1.0   0.98832  0.998979
NB  0.664474  0.650265  0.718421  0.682299  0.757185
    Accuracy Precision    Recall        F1       AUC
SV  0.345029  0.058824       1.0  0.111111  0.658537
RF   0.94152  0.333333  0.428571     0.375  0.695993
NB  0.555556      0.04  0.428571  0.073171  0.494774


In [10]:
# under_sampling
undersample = RandomUnderSampler(sampling_strategy='majority', random_state=123)

X_train_rus, y_train_rus = undersample.fit_resample(X_train, y_train)

cv_results_balanced2 = cv(X_train_rus, y_train_rus)
print(cv_results_balanced2)

test_results_balanced2 = test_set(X_train_rus, y_train_rus)
print(test_results_balanced2)

    Accuracy Precision    Recall        F1       AUC
SV  0.466667  0.419048  0.483333  0.415238       0.4
RF  0.609524  0.497619  0.733333  0.591169  0.616667
NB   0.37619      0.42  0.516667  0.450794  0.236111
    Accuracy Precision    Recall        F1       AUC
SV  0.345029  0.051282  0.857143  0.096774  0.590157
RF  0.777778  0.121951  0.714286  0.208333  0.747387
NB  0.526316  0.059524  0.714286   0.10989  0.616289


In [11]:
# NearMiss
nm = NearMiss(sampling_strategy='majority')
X_train_nm, y_train_nm = nm.fit_resample(X_train, y_train)
cv_results_balanced_nm = cv(X_train_nm, y_train_nm)
test_results_balanced_nm = test_set(X_train_nm, y_train_nm)

# ADASYN
ad = ADASYN(random_state=123, sampling_strategy='minority')
X_train_ad, y_train_ad = ad.fit_resample(X_train, y_train)
cv_results_balanced_ad = cv(X_train_ad, y_train_ad)
test_results_balanced_ad = test_set(X_train_ad, y_train_ad)

# Combinación de RandomOverSampler y RandomUnderSampler
ros = RandomOverSampler(random_state=123, sampling_strategy=0.5)
rus = RandomUnderSampler(random_state=123, sampling_strategy=0.5)
X_train_comb, y_train_comb = ros.fit_resample(X_train, y_train)
X_train_comb, y_train_comb = rus.fit_resample(X_train_comb, y_train_comb)
cv_results_balanced_comb = cv(X_train_comb, y_train_comb)
test_results_balanced_comb = test_set(X_train_comb, y_train_comb)

print('NearMiss - CV Results')
print(cv_results_balanced_nm)
print('NearMiss - Test Results')
print(test_results_balanced_nm)

print('ADASYN - CV Results')
print(cv_results_balanced_ad)
print('ADASYN - Test Results')
print(test_results_balanced_ad)

print('Combinación ROS+RUS - CV Results')
print(cv_results_balanced_comb)
print('Combinación ROS+RUS - Test Results')
print(test_results_balanced_comb)

NearMiss - CV Results
    Accuracy Precision    Recall        F1       AUC
SV  0.609524  0.616667  0.466667  0.496667  0.694444
RF  0.638095  0.733333       0.4  0.484762  0.752778
NB  0.466667       0.3       0.1  0.146667  0.469444
NearMiss - Test Results
    Accuracy Precision    Recall        F1       AUC
SV  0.362573  0.044643  0.714286  0.084034  0.530923
RF  0.467836  0.053191  0.714286   0.09901  0.585801
NB  0.748538  0.071429  0.428571  0.122449  0.595383
ADASYN - CV Results
    Accuracy Precision    Recall        F1       AUC
SV  0.670626  0.610545  0.941965  0.740775  0.809318
RF  0.945974  0.919117  0.978947  0.947704  0.990169
NB  0.782581  0.729111  0.902386  0.805981  0.847236
ADASYN - Test Results
    Accuracy Precision    Recall        F1       AUC
SV   0.45614      0.07       1.0  0.130841  0.716463
RF  0.900585  0.222222  0.571429      0.32  0.743031
NB  0.573099  0.054054  0.571429  0.098765    0.5723
Combinación ROS+RUS - CV Results
    Accuracy Precision    Recal

In [12]:
# Codificación de técnicas de balanceo
# 1: NearMiss, 2: ADASYN, 3: Combinación ROS+RUS
best_model_balancing = None
best_balancing = None

# Extraer el mejor F1-score de cada técnica y modelo
f1_ros = cv_results_balanced['F1']
f1_rus = cv_results_balanced2['F1']
f1_nm = cv_results_balanced_nm['F1']
f1_ad = cv_results_balanced_ad['F1']
f1_comb = cv_results_balanced_comb['F1']

f1_ros = f1_ros.astype(float)
f1_rus = f1_rus.astype(float)
f1_nm = f1_nm.astype(float)
f1_ad = f1_ad.astype(float)
f1_comb = f1_comb.astype(float)


# Guardar los valores máximos y sus índices
max_f1_ros = f1_ros.max()
model_ros = f1_ros.idxmax()
max_f1_rus = f1_rus.max()
model_rus = f1_rus.idxmax()
max_f1_nm = f1_nm.max()
model_nm = f1_nm.idxmax()
max_f1_ad = f1_ad.max()
model_ad = f1_ad.idxmax()
max_f1_comb = f1_comb.max()
model_comb = f1_comb.idxmax()

# Comparar los máximos entre técnicas
f1_scores = [max_f1_ros, max_f1_rus, max_f1_nm, max_f1_ad, max_f1_comb]

models = [model_ros, model_rus, model_nm, model_ad, model_comb]
balancing_codes = [1, 2, 3, 4, 5]
best_idx = int(np.argmax(f1_scores))
best_model_balancing = models[best_idx]
best_balancing = balancing_codes[best_idx]

print(f"Mejor técnica de balanceo: código {best_balancing}")
print(f"Mejor modelo: {best_model_balancing}")

Mejor técnica de balanceo: código 1
Mejor modelo: RF


In [13]:
# Extraer el mejor F1-score de cada técnica y modelo en el dataset de test
f1_ros_test = test_results_balanced['F1'].astype(float)
f1_rus_test = test_results_balanced2['F1'].astype(float)
f1_nm_test = test_results_balanced_nm['F1'].astype(float)
f1_ad_test = test_results_balanced_ad['F1'].astype(float)
f1_comb_test = test_results_balanced_comb['F1'].astype(float)

max_f1_ros_test = f1_ros_test.max()
model_ros_test = f1_ros_test.idxmax()
max_f1_rus_test = f1_rus_test.max()
model_rus_test = f1_rus_test.idxmax()
max_f1_nm_test = f1_nm_test.max()
model_nm_test = f1_nm_test.idxmax()
max_f1_ad_test = f1_ad_test.max()
model_ad_test = f1_ad_test.idxmax()
max_f1_comb_test = f1_comb_test.max()
model_comb_test = f1_comb_test.idxmax()

f1_scores_test = [max_f1_ros_test, max_f1_rus_test, max_f1_nm_test, max_f1_ad_test, max_f1_comb_test]
models_test = [model_ros_test, model_rus_test, model_nm_test, model_ad_test, model_comb_test]
balancing_codes_test = [1, 2, 3, 4, 5]
best_idx_test = int(np.argmax(f1_scores_test))
best_model_balancing_test = models_test[best_idx_test]
best_balancing_test = balancing_codes_test[best_idx_test]

print(f"Mejor técnica de balanceo en test: código {best_balancing_test}")
print(f"Mejor modelo en test: {best_model_balancing_test}")     

Mejor técnica de balanceo en test: código 5
Mejor modelo en test: RF
