In [None]:
# Módulos básicos para análisis y manipulación de datos
import numpy as np
import pandas as pd

# Modelos de clasificación
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# Módulos para evaluación de modelos
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

# Módulos para el balanceo de datos
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from imblearn.over_sampling import RandomOverSampler, ADASYN
from imblearn.pipeline import make_pipeline as make_imb_pipeline

In [35]:
# Read the 'Datos' sheet of the workbook into a DataFrame
datos = pd.read_excel('post_pabellon.xlsx', sheet_name='Datos')

# Feature matrix = every column except the target; target = that one column
X = datos.drop(columns=['HOSPITALIZACION'])
y = datos['HOSPITALIZACION']
print(X)


     EDAD  DIABETES  HOSPITALIZACIÓN ULTIMO MES   PSA  BIOPSIAS PREVIAS  \
0      53         0                           0   4.0                 0   
1      56         0                           0   7.7                 0   
2      53         0                           0   7.0                 0   
3      65         0                           0   4.3                 0   
4      62         0                           0   7.0                 0   
..    ...       ...                         ...   ...               ...   
563    57         0                           0   4.8                 0   
564    75         0                           0  75.0                 0   
565    78         0                           0   9.3                 0   
566    67         0                           0   6.0                 0   
567    64         0                           0   4.8                 0   

     VOLUMEN PROSTATICO ANTIBIOTICO UTILIAZADO EN LA PROFILAXIS  \
0                     1         

In [36]:
# Dummy variables para las variables categóricas
X = pd.get_dummies(X, drop_first=True)
print(X)

     EDAD  DIABETES  HOSPITALIZACIÓN ULTIMO MES   PSA  BIOPSIAS PREVIAS  \
0      53         0                           0   4.0                 0   
1      56         0                           0   7.7                 0   
2      53         0                           0   7.0                 0   
3      65         0                           0   4.3                 0   
4      62         0                           0   7.0                 0   
..    ...       ...                         ...   ...               ...   
563    57         0                           0   4.8                 0   
564    75         0                           0  75.0                 0   
565    78         0                           0   9.3                 0   
566    67         0                           0   6.0                 0   
567    64         0                           0   4.8                 0   

     VOLUMEN PROSTATICO  NUMERO DE MUESTRAS TOMADAS  CUP  \
0                     1                

In [17]:
# Hold out 30% of the rows as a test split, stratified on y and seeded so the
# partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=123,
)

In [37]:
# Shuffled, seeded 5-fold stratified splitter shared by every cross-validation run
strat_kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=123,
)

In [38]:
# Classifier instances used by cv() and test_set()
NB_model = GaussianNB()
RF_model = RandomForestClassifier(
    n_estimators=30,
    max_depth=5,
    random_state=123,
)
SVC_model = SVC(kernel='linear')

In [20]:
def cv(X, y, sampler=None):
    """Cross-validate the three notebook classifiers and tabulate their mean scores.

    ``SVC_model``, ``RF_model`` and ``NB_model`` are each evaluated with the
    notebook-level ``strat_kfold`` splitter. Accuracy, precision and recall are
    computed in a single ``cross_validate`` run per classifier, so every fold
    is fitted once instead of once per metric.

    Parameters
    ----------
    X
        Feature matrix to cross-validate on.
    y
        Target labels aligned with ``X``.
    sampler : imbalanced-learn sampler, optional
        When given, each classifier is wrapped in an imbalanced-learn pipeline
        ``sampler -> classifier`` so that resampling happens inside each
        training fold only and the validation fold keeps its original rows.
        Pass the un-resampled data as ``X``/``y`` in that case. The default
        ``None`` evaluates the bare classifiers on ``X``/``y`` as given.

    Returns
    -------
    pandas.DataFrame
        Rows ``'SV'``, ``'RF'``, ``'NB'``; columns ``'Accuracy'``,
        ``'Precision'``, ``'Recall'`` holding the mean score across folds.
    """
    # Keys double as the output column names; cross_validate reports each one
    # under 'test_<key>'.
    scoring = {
        'Accuracy': 'accuracy',
        # zero_division=0 scores a fold with no positive predictions as 0
        # instead of emitting an undefined-metric warning.
        'Precision': metrics.make_scorer(metrics.precision_score, zero_division=0),
        'Recall': 'recall',
    }
    models = {'SV': SVC_model, 'RF': RF_model, 'NB': NB_model}

    fold_scores = {}
    for label, model in models.items():
        estimator = model if sampler is None else make_imb_pipeline(sampler, model)
        fold_scores[label] = cross_validate(
            estimator, X, y, cv=strat_kfold, scoring=scoring)

    results = pd.DataFrame(
        {
            metric: [fold_scores[label][f'test_{metric}'].mean() for label in models]
            for metric in scoring
        },
        index=list(models))
    return results

In [21]:
# Baseline: cross-validated scores on the un-resampled training split
cv_results = cv(X=X_train, y=y_train)
print(cv_results)

    Accuracy  Precision    Recall
SV  0.957215   0.000000  0.000000
RF  0.959747   0.200000  0.066667
NB  0.211266   0.049126  0.950000


In [22]:
def test_set(X, y, X_eval=None, y_eval=None):
    """Fit the three notebook classifiers on ``(X, y)`` and score them on held-out data.

    ``SVC_model``, ``RF_model`` and ``NB_model`` are fitted in place, so they
    remain fitted on ``(X, y)`` after the call.

    Parameters
    ----------
    X
        Training features passed to each model's ``fit``.
    y
        Training labels passed to each model's ``fit``.
    X_eval, y_eval : optional
        Features and labels to evaluate on. When both are omitted, the
        notebook-level ``X_test`` and ``y_test`` are used.

    Returns
    -------
    pandas.DataFrame
        Rows ``'SV'``, ``'RF'``, ``'NB'``; columns ``'Accuracy'``,
        ``'Precision'``, ``'Recall'`` computed on the evaluation data.

    Raises
    ------
    ValueError
        If only one of ``X_eval`` / ``y_eval`` is supplied.
    """
    # Refuse a half-specified evaluation set rather than silently pairing the
    # caller's features with the global labels (or vice versa).
    if (X_eval is None) != (y_eval is None):
        raise ValueError('X_eval and y_eval must be provided together')
    if X_eval is None:
        X_eval, y_eval = X_test, y_test

    models = {'SV': SVC_model, 'RF': RF_model, 'NB': NB_model}
    predictions = {
        label: model.fit(X, y).predict(X_eval)
        for label, model in models.items()
    }

    results = pd.DataFrame(
        {
            'Accuracy': [
                metrics.accuracy_score(y_eval, pred)
                for pred in predictions.values()
            ],
            # zero_division=0 reports 0 when a model predicts no positives.
            'Precision': [
                metrics.precision_score(y_eval, pred, zero_division=0)
                for pred in predictions.values()
            ],
            'Recall': [
                metrics.recall_score(y_eval, pred)
                for pred in predictions.values()
            ],
        },
        index=list(predictions))

    return results

In [39]:
# Baseline: fit on the un-resampled training split, score on the held-out split
test_results = test_set(X=X_train, y=y_train)
print(test_results)

    Accuracy  Precision    Recall
SV  0.959064   0.000000  0.000000
RF  0.959064   0.000000  0.000000
NB  0.169591   0.040816  0.857143


In [24]:
# Randomly over-sample the minority class of the training split
oversample = RandomOverSampler(random_state=123, sampling_strategy='minority')
X_train_os, y_train_os = oversample.fit_resample(X_train, y_train)

# NOTE(review): cv() builds its folds from the already over-sampled rows, so
# copies of the same minority row can presumably land in both the training and
# validation part of a fold and inflate these scores -- confirm before relying
# on them.
cv_results_balanced = cv(X=X_train_os, y=y_train_os)



In [25]:
# Show the cross-validated scores obtained on the over-sampled training data
print(cv_results_balanced)

    Accuracy  Precision  Recall
SV  0.788158   0.702836     1.0
RF  0.952632   0.913998     1.0
NB  0.577632   0.542487     1.0


In [26]:
# Fit on the over-sampled training data, score on the held-out split
test_results_balanced = test_set(X=X_train_os, y=y_train_os)
print(test_results_balanced)

    Accuracy  Precision    Recall
SV  0.508772   0.067416  0.857143
RF  0.865497   0.192308  0.714286
NB  0.169591   0.040816  0.857143


In [27]:
# Randomly under-sample the majority class of the training split
undersample = RandomUnderSampler(random_state=123, sampling_strategy='majority')
X_train_us, y_train_us = undersample.fit_resample(X_train, y_train)

# Cross-validated scores on the under-sampled training data
cv_results_balanced2 = cv(X=X_train_us, y=y_train_us)
print(cv_results_balanced2)



    Accuracy  Precision    Recall
SV  0.404762   0.430000  0.483333
RF  0.552381   0.460000  0.566667
NB  0.433333   0.364286  0.550000


In [28]:
# Fit on the under-sampled training data, score on the held-out split.
# NOTE(review): this rebinds test_results_balanced, which the over-sampling
# cell also assigns, so the earlier table is no longer reachable by name.
test_results_balanced = test_set(X=X_train_us, y=y_train_us)
print(test_results_balanced)

    Accuracy  Precision    Recall
SV  0.415205   0.065421  1.000000
RF  0.695906   0.090909  0.714286
NB  0.134503   0.045161  1.000000


In [29]:
# Combined strategy, step 1: over-sample the minority class of the training
# split up to a 0.5 minority/majority ratio.
oversample = RandomOverSampler(random_state=123, sampling_strategy=0.5)
X_train_os, y_train_os = oversample.fit_resample(X_train, y_train)

# Step 2: under-sample the majority class of the over-sampled data.
# NOTE(review): both steps target the same 0.5 ratio, so this step appears to
# leave the over-sampled data unchanged -- confirm the intended ratios.
# NOTE(review): oversample, undersample, X_train_os/y_train_os and
# X_train_us/y_train_us are rebound here, replacing the values set by the
# earlier over-/under-sampling cells.
undersample = RandomUnderSampler(random_state=123, sampling_strategy=0.5)
X_train_us, y_train_us = undersample.fit_resample(X_train_os, y_train_os)



In [30]:
# Evaluate the combined over-/under-sampled training data: cross-validated
# scores first, then a fit on that data scored against the held-out split.
cv_results_balanced_comb = cv(X=X_train_us, y=y_train_us)
test_results_balanced_comb = test_set(X=X_train_us, y=y_train_us)

In [31]:
# Show the cross-validated scores for the combined resampling strategy
print(cv_results_balanced_comb)

    Accuracy  Precision    Recall
SV  0.710526   0.635226  0.342105
RF  0.950877   0.972581  0.878947
NB  0.436842   0.372012  1.000000


In [32]:
# Show the held-out scores for the combined resampling strategy
print(test_results_balanced_comb)

    Accuracy  Precision    Recall
SV  0.842105   0.083333  0.285714
RF  0.964912   0.600000  0.428571
NB  0.169591   0.040816  0.857143
