In [1]:
import numpy as np
import pandas as pd
import scipy as sc
import warnings

from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from meassures import get_meassure
from meassures import get_sensitive
from meassures import get_specificity
from sklearn import metrics
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from test_data import local_test

warnings.filterwarnings('ignore')

In [2]:
# Se cargan los datos
data = pd.read_csv('full_dataset.csv')
data.head()

Unnamed: 0,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,thyroid_surgery,query_hypothyroid,query_hyperthyroid,pregnant,sick,tumor,lithium,...,T4U_measured,FTI_measured,age,sex,TSH,T3,TT4,T4U,FTI,classes
0,0,0,0,0,0,0,0,0,0,0,...,1,1,80,1,1.4,0.8,105.0,0.88,120.0,1
1,0,0,0,0,0,0,0,0,0,0,...,1,1,74,0,0.0,0.7,98.0,0.81,121.0,1
2,0,0,0,0,0,0,0,0,0,0,...,1,1,32,0,1.4,1.1,121.0,1.11,109.0,1
3,0,0,0,0,0,0,0,0,0,0,...,1,1,42,0,2.3,1.1,93.0,0.73,127.0,1
4,0,0,0,0,0,0,0,0,0,0,...,1,1,89,1,0.8,0.8,111.0,0.68,165.0,1


In [3]:
# Se separan características de etiquetas
features = data.iloc[:, 0:-1]
target = data.iloc[:, -1]

In [4]:
# Convertimos en numnpy Array
features = np.array(features)
target = np.array(target)

 # Modelo sin hacer sobre y sub muestreo con 70% de entrenamiento y 30% de validación

In [5]:
# Número de muestras
print(features.shape)
print(target.shape)
print(len(data[data.classes == 0]))
print(len(data[data.classes == 1]))

(3152, 23)
(3152,)
2864
288


In [12]:
# Normalizamos los datos de entrenamientohttp://localhost:8888/notebooks/ANN.ipynb#Modelo-sin-hacer-sobre-y-sub-muestreo
features_standard = StandardScaler().fit_transform(features)

# Realiamos la partición del conjunto de datos para entrenamiento y pruebas
x_train, x_test, y_train, y_test = train_test_split(features_standard, target, test_size=.3, random_state=42)

# Random Forest model
model = RandomForestClassifier(n_estimators=20, max_features=20)
model = model.fit(x_train, y_train)

# Realizamos predicciones
y_est_train = model.predict(x_train)
y_est_test = model.predict(x_test)

# Medimos la exactitud de clasificación
score1 = model.score(x_train, y_train)
score2 = model.score(x_test, y_test)

print('-------------------------------------')
print('Entrenamiento: {0}'.format(score1))
print('Validación: {0}'.format(score1))
print('-------------------------------------')

# Medimos Sensibilidad y Especificidad
tp, fp, tn, fn = get_meassure(np.array(y_test), y_est_test)
sensitive = get_sensitive(tp, fn)
specificity = get_specificity(tn, fp)

# Predecimos la muestra local
print(model.predict(local_test))

print('-------------------------------------')
print('Sensibilidad: {0}, Especificidad: {1}'.format(sensitive, specificity))
print('-------------------------------------')

-------------------------------------
Entrenamiento: 0.9986400725294651
Validación: 0.9986400725294651
-------------------------------------
[0 0 0 0 0 0 0 0 0 0]
-------------------------------------
Sensibilidad: 0.82, Especificidad: 0.9881796690307328
-------------------------------------


Observamos que el modelo se equivoca en la clasificación de las muestras de la clase 1

# Modelo sin hacer sobre y sub muestreo con validación estratificada

In [8]:
# Número de muestras
print(features.shape)
print(target.shape)
print(len(data[data.classes == 0]))
print(len(data[data.classes == 1]))

(3152, 23)
(3152,)
2864
288


In [13]:
trees = [5, 10, 15, 20, 30, 50]
variables = [5, 10, 15, 20, 23]

for n_trees in trees:
    for n_variables in variables:
        
        print('-------------------------------------')
        print('Número de árboles en el bosque: {0}'.format(n_trees))
        print('Número de variables: {0}'.format(n_variables))
        print('-------------------------------------')
        
        fold = 6
        np.random.seed(12345)
        efficiency_train = np.zeros(fold)
        efficiency_validation = np.zeros(fold)

        skf = StratifiedKFold(n_splits=fold, random_state=None, shuffle=False)

        j = 0
        for train, test in skf.split(features, target):
            x_train = features[train, :]
            y_train = target[train]
            x_test = features[test, :]
            y_test = target[test]

            # Random Forest model
            model = RandomForestClassifier(n_estimators=n_trees, max_features=n_variables)
            model = model.fit(x_train, y_train)

            # Realizamos predicciones
            y_est_train = model.predict(x_train)
            y_est_test = model.predict(x_test)
            print(model.predict(local_test))

            # Evaluamos las predicciones del modelo con los datos de test
            efficiency_train[j] = np.mean(y_est_train == y_train)
            efficiency_validation[j] = np.mean(y_est_test == y_test)
            
            # Medimos Sensibilidad y Especificidad
            tp, fp, tn, fn = get_meassure(y_test, y_est_test)
            sensitive = get_sensitive(tp, fn)
            specificity = get_specificity(tn, fp)
            
            j += 1

        print('-------------------------------------')
        print('Entrenamiento: {0} +- {1}'.format(str(np.mean(efficiency_train)), str(np.std(efficiency_train))))
        print('Validación: {0} +- {1}'.format(str(np.mean(efficiency_validation)), str(np.std(efficiency_validation))))
        print('-------------------------------------')
        print('-------------------------------------')
        print('Sensibilidad: {0}, Especificidad: {1}'.format(sensitive, specificity))
        print('-------------------------------------')

-------------------------------------
Número de árboles en el bosque: 5
Número de variables: 5
-------------------------------------
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 0 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 0 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
-------------------------------------
Entrenamiento: 0.9961294495529609 +- 0.0005985068166226017
Validación: 0.9739845494598346 +- 0.005679759320675608
-------------------------------------
-------------------------------------
Sensibilidad: 0.9166666666666666, Especificidad: 0.9853249475890985
-------------------------------------
-------------------------------------
Número de árboles en el bosque: 5
Número de variables: 10
-------------------------------------
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 0 0 0 0 0 0]
[1 1 1 1 0 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
-------------------------------------
Entrenamiento: 0.9954315685733897 +- 0.000982510576263745
Validación: 0.9743032168507454 +- 0.003

[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
-------------------------------------
Entrenamiento: 0.9983502940203538 +- 0.0005231469930687013
Validación: 0.9771567384875369 +- 0.002464252569993765
-------------------------------------
-------------------------------------
Sensibilidad: 0.875, Especificidad: 0.9916142557651991
-------------------------------------
-------------------------------------
Número de árboles en el bosque: 20
Número de variables: 5
-------------------------------------
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 0 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[0 1 1 1 0 0 0 0 0 0]
[1 1 1 1 0 0 0 0 0 0]
-------------------------------------
Entrenamiento: 0.9993020948605943 +- 0.0005114846861527
Validación: 0.9758862936809706 +- 0.00239340659312398
-------------------------------------
-------------------------------------
Sensibilidad: 0.8125, Especificidad: 0.9916142557651991
-

[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
-------------------------------------
Entrenamiento: 0.9997461526188341 +- 0.000283826211943805
Validación: 0.9777904520490072 +- 0.0062898072869658475
-------------------------------------
-------------------------------------
Sensibilidad: 0.9375, Especificidad: 0.9937106918238994
-------------------------------------
-------------------------------------
Número de árboles en el bosque: 50
Número de variables: 23
-------------------------------------
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
-------------------------------------
Entrenamiento: 0.9997461526188341 +- 0.000283826211943805
Validación: 0.9768386746333514 +- 0.005549557306180195
-------------------------------------
-------------------------------------
Sensibilidad: 0.9375, Especificidad: 0.9916142557651

Las muestras se predicen de manera acertiva, aún cuando no se ha hecho sobre ni sub muestreo

# Modelo con sobre y sub muestreo utilizando validación estratificada


In [14]:
# Sobre y sub muestreo

# Se separan los datos en conjuntos por clase
major_class = data[data.classes == 0]
minor_class = data[data.classes == 1]

# Se hace un sobremuestreo sobre la clase minoritaria
minor_class_upsampled = resample(minor_class, replace=True, n_samples=380, random_state=42)

# Se unen los conjuntos en un nuevo conjunto único de clases
data_new = pd.concat([major_class, minor_class_upsampled])

# Se separan características de etiquetas
features = data_new.iloc[:, 0:-1]
target = data_new.iloc[:, -1]

# Se hace sub muestreo de conjunto mayoritario
rus = RandomUnderSampler(random_state=0)
features, target = rus.fit_resample(features, target)
print('Tamaño de los nuevos conjuntos de clases: {0}'.format(Counter(target)))

Tamaño de los nuevos conjuntos de clases: Counter({0: 380, 1: 380})


In [15]:
# Número de muestras
print(features.shape)
print(target.shape)
print(np.unique(target, return_counts=True))

(760, 23)
(760,)
(array([0, 1]), array([380, 380]))


In [16]:
trees = [5, 10, 15, 20, 30, 50, 100, 200]
variables = [5, 10, 15, 20, 23]

for n_trees in trees:
    for n_variables in variables:
        
        print('-------------------------------------')
        print('Número de árboles en el bosque: {0}'.format(n_trees))
        print('Número de variables: {0}'.format(n_variables))
        print('-------------------------------------')
        
        fold = 6
        np.random.seed(12345)
        efficiency_train = np.zeros(fold)
        efficiency_validation = np.zeros(fold)

        skf = StratifiedKFold(n_splits=fold, random_state=None, shuffle=False)

        j = 0
        for train, test in skf.split(features, target):
            x_train = features[train, :]
            y_train = target[train]
            x_test = features[test, :]
            y_test = target[test]

            # Random Forest model
            model = RandomForestClassifier(n_estimators=n_trees, max_features=n_variables)
            model = model.fit(x_train, y_train)

            # Realizamos predicciones
            y_est_train = model.predict(x_train)
            y_est_test = model.predict(x_test)
            print(model.predict(local_test))

            # Evaluamos las predicciones del modelo con los datos de test
            efficiency_train[j] = np.mean(y_est_train == y_train)
            efficiency_validation[j] = np.mean(y_est_test == y_test)
            
            # Medimos Sensibilidad y Especificidad
            tp, fp, tn, fn = get_meassure(y_test, y_est_test)
            sensitive = get_sensitive(tp, fn)
            specificity = get_specificity(tn, fp)
            
            j += 1

        print('-------------------------------------')
        print('Entrenamiento: {0} +- {1}'.format(str(np.mean(efficiency_train)), str(np.std(efficiency_train))))
        print('Validación: {0} +- {1}'.format(str(np.mean(efficiency_validation)), str(np.std(efficiency_validation))))
        print('-------------------------------------')
        print('-------------------------------------')
        print('Sensibilidad: {0}, Especificidad: {1}'.format(sensitive, specificity))
        print('-------------------------------------')

-------------------------------------
Número de árboles en el bosque: 5
Número de variables: 5
-------------------------------------
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
-------------------------------------
Entrenamiento: 0.9934213153376193 +- 0.0024835653469421435
Validación: 0.9539310515873017 +- 0.010717842218949895
-------------------------------------
-------------------------------------
Sensibilidad: 0.9365079365079365, Especificidad: 0.9365079365079365
-------------------------------------
-------------------------------------
Número de árboles en el bosque: 5
Número de variables: 10
-------------------------------------
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
-------------------------------------
Entrenamiento: 0.9921035818392365 +- 0.004182564335596173
Validación: 0.9696593915343916 +- 0.016

[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
-------------------------------------
Entrenamiento: 0.9986839303065395 +- 0.0010842928725990253
Validación: 0.9577546296296298 +- 0.02233137760836607
-------------------------------------
-------------------------------------
Sensibilidad: 0.9365079365079365, Especificidad: 0.9365079365079365
-------------------------------------
-------------------------------------
Número de árboles en el bosque: 20
Número de variables: 5
-------------------------------------
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
-------------------------------------
Entrenamiento: 0.9978961186758776 +- 0.0014867875198860258
Validación: 0.9710028108465609 +- 0.013555002813718929
-------------------------------------
-------------------------------------
Sensibilidad: 0.9206349206349206, Especif

[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
-------------------------------------
Entrenamiento: 0.9997371188222924 +- 0.0005878201833595625
Validación: 0.9617435515873017 +- 0.018057495914311713
-------------------------------------
-------------------------------------
Sensibilidad: 0.9365079365079365, Especificidad: 0.9365079365079365
-------------------------------------
-------------------------------------
Número de árboles en el bosque: 50
Número de variables: 23
-------------------------------------
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
-------------------------------------
Entrenamiento: 0.9997371188222924 +- 0.0005878201833595625
Validación: 0.963045634920635 +- 0.018782870471207937
-------------------------------------
-------------------------------------
Sensibilidad: 0.9365079365079365, Especificidad: 0.9365079365079365
----------------

Las muestras se clasifican de manera acertiva en general cuando haplicamos sobre y sub muestreo sobre el conjunto de datos