In [24]:
import numpy as np
import pandas as pd

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn import metrics

from imblearn.under_sampling import RandomUnderSampler, NearMiss
from imblearn.over_sampling import RandomOverSampler, ADASYN

from pathlib import Path
from utils.paths import DATA_RAW_DIR

In [4]:
# path

path_pabellon = DATA_RAW_DIR / "w6_post_pabellon.xlsx"

print(f"Reading data from {path_pabellon}")
print(Path(path_pabellon).exists())

Reading data from /Users/jasonssdev/Dev/Learning/UC/mcd-machine-learning/data/raw/w6_post_pabellon.xlsx
True


In [5]:
# load 
df_pabellon = pd.read_excel(path_pabellon, sheet_name="Datos")
df_pabellon.head()

Unnamed: 0,EDAD,DIABETES,HOSPITALIZACIÓN ULTIMO MES,PSA,BIOPSIAS PREVIAS,VOLUMEN PROSTATICO,ANTIBIOTICO UTILIAZADO EN LA PROFILAXIS,NUMERO DE MUESTRAS TOMADAS,CUP,ENF. CRONICA PULMONAR OBSTRUCTIVA,BIOPSIA,HOSPITALIZACION
0,53,0,0,4.0,0,1,FLUOROQUINOLONA_AMINOGLICOSIDO,12,0,0,0,1
1,56,0,0,7.7,0,1,FLUOROQUINOLONA_AMINOGLICOSIDO,12,0,0,0,1
2,53,0,0,7.0,0,1,FLUOROQUINOLONA_AMINOGLICOSIDO,12,0,0,0,1
3,65,0,0,4.3,0,0,FLUOROQUINOLONA_AMINOGLICOSIDO,12,0,0,0,1
4,62,0,0,7.0,0,1,FLUOROQUINOLONA_AMINOGLICOSIDO,12,0,0,0,1


In [6]:
df_pabellon.shape

(568, 12)

In [7]:
# split data into X and y
X = df_pabellon.drop(columns=["HOSPITALIZACION"], axis=1)
y = df_pabellon["HOSPITALIZACION"]

In [8]:
X

Unnamed: 0,EDAD,DIABETES,HOSPITALIZACIÓN ULTIMO MES,PSA,BIOPSIAS PREVIAS,VOLUMEN PROSTATICO,ANTIBIOTICO UTILIAZADO EN LA PROFILAXIS,NUMERO DE MUESTRAS TOMADAS,CUP,ENF. CRONICA PULMONAR OBSTRUCTIVA,BIOPSIA
0,53,0,0,4.0,0,1,FLUOROQUINOLONA_AMINOGLICOSIDO,12,0,0,0
1,56,0,0,7.7,0,1,FLUOROQUINOLONA_AMINOGLICOSIDO,12,0,0,0
2,53,0,0,7.0,0,1,FLUOROQUINOLONA_AMINOGLICOSIDO,12,0,0,0
3,65,0,0,4.3,0,0,FLUOROQUINOLONA_AMINOGLICOSIDO,12,0,0,0
4,62,0,0,7.0,0,1,FLUOROQUINOLONA_AMINOGLICOSIDO,12,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
563,57,0,0,4.8,0,0,OTROS,12,0,0,0
564,75,0,0,75.0,0,1,FLUOROQUINOLONA_AMINOGLICOSIDO,12,0,0,1
565,78,0,0,9.3,0,1,CEFALOSPORINA_AMINOGLUCOCIDO,12,0,0,0
566,67,0,0,6.0,0,1,FLUOROQUINOLONA_AMINOGLICOSIDO,12,0,0,1


In [9]:
df_pabellon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568 entries, 0 to 567
Data columns (total 12 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   EDAD                                     568 non-null    int64  
 1   DIABETES                                 568 non-null    int64  
 2   HOSPITALIZACIÓN ULTIMO MES               568 non-null    int64  
 3   PSA                                      568 non-null    float64
 4   BIOPSIAS PREVIAS                         568 non-null    int64  
 5   VOLUMEN PROSTATICO                       568 non-null    int64  
 6   ANTIBIOTICO UTILIAZADO EN LA PROFILAXIS  568 non-null    object 
 7   NUMERO DE MUESTRAS TOMADAS               568 non-null    int64  
 8   CUP                                      568 non-null    int64  
 9   ENF. CRONICA PULMONAR OBSTRUCTIVA        568 non-null    int64  
 10  BIOPSIA                                  568 non-n

In [10]:
df_pabellon['ANTIBIOTICO UTILIAZADO EN LA PROFILAXIS'].unique()

array(['FLUOROQUINOLONA_AMINOGLICOSIDO', 'CEFALOSPORINA_AMINOGLUCOCIDO',
       'OROQUINOLONAS', 'OTROS'], dtype=object)

In [16]:
df_pabellon.columns

Index(['EDAD', 'DIABETES', 'HOSPITALIZACIÓN ULTIMO MES', 'PSA',
       'BIOPSIAS PREVIAS', 'VOLUMEN PROSTATICO',
       'ANTIBIOTICO UTILIAZADO EN LA PROFILAXIS', 'NUMERO DE MUESTRAS TOMADAS',
       'CUP', 'ENF. CRONICA PULMONAR OBSTRUCTIVA', 'BIOPSIA',
       'HOSPITALIZACION'],
      dtype='object')

In [12]:
# dummy and drop first
X = pd.get_dummies(X, drop_first=True)
X.head()

Unnamed: 0,EDAD,DIABETES,HOSPITALIZACIÓN ULTIMO MES,PSA,BIOPSIAS PREVIAS,VOLUMEN PROSTATICO,NUMERO DE MUESTRAS TOMADAS,CUP,ENF. CRONICA PULMONAR OBSTRUCTIVA,BIOPSIA,ANTIBIOTICO UTILIAZADO EN LA PROFILAXIS_FLUOROQUINOLONA_AMINOGLICOSIDO,ANTIBIOTICO UTILIAZADO EN LA PROFILAXIS_OROQUINOLONAS,ANTIBIOTICO UTILIAZADO EN LA PROFILAXIS_OTROS
0,53,0,0,4.0,0,1,12,0,0,0,True,False,False
1,56,0,0,7.7,0,1,12,0,0,0,True,False,False
2,53,0,0,7.0,0,1,12,0,0,0,True,False,False
3,65,0,0,4.3,0,0,12,0,0,0,True,False,False
4,62,0,0,7.0,0,1,12,0,0,0,True,False,False


In [14]:
X.columns

Index(['EDAD', 'DIABETES', 'HOSPITALIZACIÓN ULTIMO MES', 'PSA',
       'BIOPSIAS PREVIAS', 'VOLUMEN PROSTATICO', 'NUMERO DE MUESTRAS TOMADAS',
       'CUP', 'ENF. CRONICA PULMONAR OBSTRUCTIVA', 'BIOPSIA',
       'ANTIBIOTICO UTILIAZADO EN LA PROFILAXIS_FLUOROQUINOLONA_AMINOGLICOSIDO',
       'ANTIBIOTICO UTILIAZADO EN LA PROFILAXIS_OROQUINOLONAS',
       'ANTIBIOTICO UTILIAZADO EN LA PROFILAXIS_OTROS'],
      dtype='object')

In [17]:
# split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123, stratify=y)

In [18]:
# stratified k fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

In [19]:
# models instances
model_svc = SVC(kernel='linear')
model_rf = RandomForestClassifier(random_state=123,n_estimators=30, max_depth=5)
model_gnb = GaussianNB()

In [30]:
# function to cross validation




def cv(X, y):
    recall_svc = cross_val_score(model_svc, X, y, cv=skf, scoring='recall')
    recall_rf = cross_val_score(model_rf, X, y, cv=skf, scoring='recall')
    recall_gnb = cross_val_score(model_gnb, X, y, cv=skf, scoring='recall')

    precision_svc = cross_val_score(model_svc, X, y, cv=skf, scoring=metrics.make_scorer(metrics.precision_score, zero_division=0))
    precision_rf = cross_val_score(model_rf, X, y, cv=skf, scoring=metrics.make_scorer(metrics.precision_score, zero_division=0))
    precision_gnb = cross_val_score(model_gnb, X, y, cv=skf, scoring=metrics.make_scorer(metrics.precision_score, zero_division=0))

    accuracy_svc = cross_val_score(model_svc, X, y, cv=skf, scoring='accuracy')
    accuracy_rf = cross_val_score(model_rf, X, y, cv=skf, scoring='accuracy')
    accuracy_gnb = cross_val_score(model_gnb, X, y, cv=skf, scoring='accuracy')

    f1_svc = cross_val_score(model_svc, X, y, cv=skf, scoring='f1')
    f1_rf = cross_val_score(model_rf, X, y, cv=skf, scoring='f1')
    f1_gnb = cross_val_score(model_gnb, X, y, cv=skf, scoring='f1')

    results = pd.DataFrame({
        'Model': ['SVC', 'Random Forest', 'GaussianNB'],
        'Accuracy Mean': [accuracy_svc.mean(), accuracy_rf.mean(), accuracy_gnb.mean()],
        'Accuracy Std': [accuracy_svc.std(), accuracy_rf.std(), accuracy_gnb.std()],
        'Precision Mean': [precision_svc.mean(), precision_rf.mean(), precision_gnb.mean()],
        'Precision Std': [precision_svc.std(), precision_rf.std(), precision_gnb.std()],
        'Recall Mean': [recall_svc.mean(), recall_rf.mean(), recall_gnb.mean()],
        'Recall Std': [recall_svc.std(), recall_rf.std(), recall_gnb.std()],
        'F1 Mean': [f1_svc.mean(), f1_rf.mean(), f1_gnb.mean()],
        'F1 Std': [f1_svc.std(), f1_rf.std(), f1_gnb.std()],
    })

    return results


In [31]:
cv_results = cv(X_train, y_train)
cv_results

Unnamed: 0,Model,Accuracy Mean,Accuracy Std,Precision Mean,Precision Std,Recall Mean,Recall Std,F1 Mean,F1 Std
0,SVC,0.957215,0.005891,0.0,0.0,0.0,0.0,0.0,0.0
1,Random Forest,0.959747,0.009203,0.2,0.4,0.066667,0.133333,0.1,0.2
2,GaussianNB,0.211266,0.053456,0.049126,0.008096,0.95,0.1,0.09324,0.014557


In [35]:
# test

def test_set(X,y):
    model_svc.fit(X, y)
    model_rf.fit(X, y)
    model_gnb.fit(X, y)

    y_pred_svc = model_svc.predict(X_test)
    y_pred_rf = model_rf.predict(X_test)
    y_pred_gnb = model_gnb.predict(X_test)

    accuracy_svc = accuracy_score(y_test, y_pred_svc)
    accuracy_rf = accuracy_score(y_test, y_pred_rf)
    accuracy_gnb = accuracy_score(y_test, y_pred_gnb)

    precision_svc = metrics.precision_score(y_test, y_pred_svc, zero_division=0)
    precision_rf = metrics.precision_score(y_test, y_pred_rf, zero_division=0)
    precision_gnb = metrics.precision_score(y_test, y_pred_gnb, zero_division=0)

    recall_svc = metrics.recall_score(y_test, y_pred_svc)
    recall_rf = metrics.recall_score(y_test, y_pred_rf)
    recall_gnb = metrics.recall_score(y_test, y_pred_gnb)

    f1_svc = metrics.f1_score(y_test, y_pred_svc)
    f1_rf = metrics.f1_score(y_test, y_pred_rf)
    f1_gnb = metrics.f1_score(y_test, y_pred_gnb)

    results = pd.DataFrame({
        'Model': ['SVC', 'Random Forest', 'GaussianNB'],
        'Accuracy': [accuracy_svc, accuracy_rf, accuracy_gnb],
        'Precision': [precision_svc, precision_rf, precision_gnb],
        'Recall': [recall_svc, recall_rf, recall_gnb],
        'F1 Score': [f1_svc, f1_rf, f1_gnb],
    })

    return results

In [29]:
test_results = test_set(X_test, y_test)
test_results

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,SVC,0.959064,0.0,0.0,0.0
1,Random Forest,0.959064,0.0,0.0,0.0
2,GaussianNB,0.169591,0.040816,0.857143,0.077922


In [33]:
X_train.shape, y_train.shape

((397, 13), (397,))

In [32]:
# Oversampling
oversample = RandomOverSampler(sampling_strategy='minority', random_state=123)

X_train_ros, y_train_ros = oversample.fit_resample(X_train, y_train)
X_train_ros.shape, y_train_ros.shape

((760, 13), (760,))

In [36]:
# cross validation with oversampling
cv_results_ros = cv(X_train_ros, y_train_ros)
cv_results_ros

Unnamed: 0,Model,Accuracy Mean,Accuracy Std,Precision Mean,Precision Std,Recall Mean,Recall Std,F1 Mean,F1 Std
0,SVC,0.788158,0.017356,0.702836,0.017742,1.0,0.0,0.825363,0.012062
1,Random Forest,0.952632,0.013418,0.913998,0.021883,1.0,0.0,0.954929,0.012081
2,GaussianNB,0.577632,0.024756,0.542487,0.015057,1.0,0.0,0.70327,0.012507


In [37]:
test_results_ros = test_set(X_train_ros, y_train_ros)
test_results_ros

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,SVC,0.508772,0.067416,0.857143,0.125
1,Random Forest,0.865497,0.192308,0.714286,0.30303
2,GaussianNB,0.169591,0.040816,0.857143,0.077922


In [38]:
# undersampling
undersample = RandomUnderSampler(sampling_strategy='majority', random_state=123)

X_train_rus, y_train_rus = undersample.fit_resample(X_train, y_train)
X_train_rus.shape, y_train_rus.shape

((34, 13), (34,))

In [39]:
# cross validation with undersampling
cv_results_rus = cv(X_train_rus, y_train_rus)
cv_results_rus

Unnamed: 0,Model,Accuracy Mean,Accuracy Std,Precision Mean,Precision Std,Recall Mean,Recall Std,F1 Mean,F1 Std
0,SVC,0.404762,0.159364,0.43,0.143914,0.483333,0.169967,0.442857,0.130931
1,Random Forest,0.552381,0.177153,0.46,0.257682,0.566667,0.326599,0.506349,0.286788
2,GaussianNB,0.433333,0.147849,0.364286,0.22159,0.55,0.34801,0.435931,0.266392


In [40]:
test_results_rus = test_set(X_train_rus, y_train_rus)
test_results_rus 

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,SVC,0.415205,0.065421,1.0,0.122807
1,Random Forest,0.695906,0.090909,0.714286,0.16129
2,GaussianNB,0.134503,0.045161,1.0,0.08642


In [41]:
# combination of over and under sampling

oversample = RandomOverSampler(sampling_strategy=0.5, random_state=123)
X_train_ros, y_train_ros = oversample.fit_resample(X_train, y_train)
print(X_train_ros.shape, y_train_ros.shape)

undersample = RandomUnderSampler(sampling_strategy=0.5, random_state=123)
X_train_rus, y_train_rus = undersample.fit_resample(X_train, y_train)
print(X_train_rus.shape, y_train_rus.shape)

(570, 13) (570,)
(51, 13) (51,)


In [42]:
# cross validation combination
cv_results_rus = cv(X_train_rus, y_train_rus)
cv_results_ros = cv(X_train_ros, y_train_ros)
cv_results_rus, cv_results_ros


(           Model  Accuracy Mean  Accuracy Std  ...  Recall Std   F1 Mean    F1 Std
 0            SVC       0.607273      0.064897  ...    0.145297  0.133333  0.163299
 1  Random Forest       0.667273      0.133187  ...    0.293447  0.476190  0.215078
 2     GaussianNB       0.332727      0.041699  ...    0.100000  0.483590  0.028991
 
 [3 rows x 9 columns],
            Model  Accuracy Mean  Accuracy Std  ...  Recall Std   F1 Mean    F1 Std
 0            SVC       0.721053      0.044866  ...    0.085840  0.476506  0.094244
 1  Random Forest       0.940351      0.025664  ...    0.071393  0.904773  0.042579
 2     GaussianNB       0.436842      0.033009  ...    0.000000  0.542487  0.015057
 
 [3 rows x 9 columns])

In [49]:
test_results_ros = test_set(X_train_ros, y_train_ros)
print(test_results_ros)
print('='*58)
test_results_rus = test_set(X_train_rus, y_train_rus)
print(test_results_rus)


           Model  Accuracy  Precision    Recall  F1 Score
0            SVC  0.842105   0.083333  0.285714  0.129032
1  Random Forest  0.964912   0.666667  0.285714  0.400000
2     GaussianNB  0.169591   0.040816  0.857143  0.077922
           Model  Accuracy  Precision    Recall  F1 Score
0            SVC  0.847953   0.086957  0.285714  0.133333
1  Random Forest  0.888889   0.125000  0.285714  0.173913
2     GaussianNB  0.163743   0.046667  1.000000  0.089172
