In [8]:
import numpy as np
import pandas as pd
import scipy as sc

from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from meassures import get_meassure
from meassures import get_sensitive
from meassures import get_specificity
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.utils import resample
from sklearn.model_selection import StratifiedKFold
from test_data import local_test

In [9]:
# Se cargan los datos
data = pd.read_csv('full_dataset.csv')
data.head()

Unnamed: 0,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,thyroid_surgery,query_hypothyroid,query_hyperthyroid,pregnant,sick,tumor,lithium,...,T4U_measured,FTI_measured,age,sex,TSH,T3,TT4,T4U,FTI,classes
0,0,0,0,0,0,0,0,0,0,0,...,1,1,80,1,1.4,0.8,105.0,0.88,120.0,1
1,0,0,0,0,0,0,0,0,0,0,...,1,1,74,0,0.0,0.7,98.0,0.81,121.0,1
2,0,0,0,0,0,0,0,0,0,0,...,1,1,32,0,1.4,1.1,121.0,1.11,109.0,1
3,0,0,0,0,0,0,0,0,0,0,...,1,1,42,0,2.3,1.1,93.0,0.73,127.0,1
4,0,0,0,0,0,0,0,0,0,0,...,1,1,89,1,0.8,0.8,111.0,0.68,165.0,1


In [10]:
# Se separan características de etiquetas
features = data.iloc[:, 0:-1]
target = data.iloc[:, -1]

In [11]:
# Número de muestras
print(features.shape)
print(target.shape)
print(len(data[data.classes == 0]))
print(len(data[data.classes == 1]))

(3152, 23)
(3152,)
2864
288


# Modelo sin hacer sobre y sub muestreo, tomando 40 % de prueba y 60% de entrenamiento

In [13]:
# Se realiza KNN con los datos originales sin sobre y sub muestreo haciendo una partaición 
# de 70% de los datos para entrenamiento y 30% para pruebas
for k in [2, 5, 10, 15, 20]:
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.4, random_state=42)
    
    # KNN model
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(x_train, y_train)
    
    # Predicciones
    y_est_train = model.predict(x_train)
    y_est_test = model.predict(x_test)
    
    # Se predcice la muestra local
    print(model.predict(local_test))
    
    # Medimos Sensibilidad y Especificidad
    tp, fp, tn, fn = get_meassure(np.array(y_test), y_est_test)
    sensitive = get_sensitive(tp, fn)
    specificity = get_specificity(tn, fp)
        
    print('--------------------------------------------')
    print('Entrenamiento: {0}'.format(metrics.accuracy_score(y_train, y_est_train)))
    print('Validación: {0}'.format(metrics.accuracy_score(y_test, y_est_test)))    
    print('--------------------------------------------')
    print('Sensibilidad: {0}, Especificidad: {1}'.format(sensitive, specificity))
    print('-------------------------------------')

[0 0 0 0 0 0 0 0 0 0]
--------------------------------------------
Entrenamiento: 0.9360126916975146
Validación: 0.8992862807295797
--------------------------------------------
Sensibilidad: 0.058823529411764705, Especificidad: 0.9868651488616462
-------------------------------------
[0 0 0 0 0 0 0 0 0 0]
--------------------------------------------
Entrenamiento: 0.925965097831835
Validación: 0.8961141950832673
--------------------------------------------
Sensibilidad: 0.15966386554621848, Especificidad: 0.9728546409807356
-------------------------------------
[0 0 0 0 0 0 0 0 0 0]
--------------------------------------------
Entrenamiento: 0.9185616076150185
Validación: 0.9072164948453608
--------------------------------------------
Sensibilidad: 0.1092436974789916, Especificidad: 0.9903677758318739
-------------------------------------
[0 0 0 0 0 0 0 0 0 0]
--------------------------------------------
Entrenamiento: 0.9185616076150185
Validación: 0.9072164948453608
-----------------

In [14]:
# Se realiza KNN realizando sobre y sub muestreo en los datos, además utilizando la técnica 
# de validación estratificada

# Se separan los datos en conjuntos por clase
major_class = data[data.classes == 0]
minor_class = data[data.classes == 1]

# Se hace un sobremuestreo sobre la clase minoritaria
minor_class_upsampled = resample(minor_class, replace=True, n_samples=350, random_state=42)

# Se unen los conjuntos en un nuevo conjunto único de clases
data_new = pd.concat([major_class, minor_class_upsampled])

# Se separan características de etiquetas
features = data_new.iloc[:, 0:-1]
target = data_new.iloc[:, -1]

# Se hace sub muestreo de conjunto mayoritario
rus = RandomUnderSampler(random_state=0)
features, target = rus.fit_resample(features, target)
print('Tamaño de los nuevos conjuntos de clases: {0}'.format(Counter(target)))

Tamaño de los nuevos conjuntos de clases: Counter({0: 350, 1: 350})


In [15]:
# Número de muestras
print(features.shape)
print(target.shape)
print(np.unique(target, return_counts=True))

(700, 23)
(700,)
(array([0, 1]), array([350, 350]))


In [16]:
def error_class(estimated, y_test):
    """
    Función para calcular el error en los problemas de clasificación
    
    Parameters
    ----------
    estimated: matrix
    y_test: matrix
    """
    error = 1 - np.sum(estimated == y_test) / len(y_test)
    
    return error

# Modelo con sobre y sub muestreo utilizando validación estratificada

In [19]:
k_numbers = [2, 5, 10, 15, 20]
folds = [2, 5, 10, 15, 20]

for k in k_numbers:    
    for fold in folds:
        print('-------------------------------------')
        print('Número de vecinos: {0}'.format(k))
        print('Número de folds: {0}'.format(fold))
        print('-------------------------------------')
        
        np.random.seed(12345)
        efficiency_train = np.zeros(fold)
        efficiency_val = np.zeros(fold)
        error = np.zeros(fold)
        error2 = np.zeros(fold)
        skf = StratifiedKFold(n_splits=fold, random_state=None, shuffle=False)

        j = 0
        for train, test in skf.split(features, target):
            x_train = features[train, :]
            y_train = target[train]
            x_test = features[test, :]
            y_test = target[test]

            # KNN model
            model = KNeighborsClassifier(n_neighbors=k)
            model.fit(x_train, y_train).score(x_test, y_test)
            
                
            # Predicciones
            y_est_train = model.predict(x_train)
            y_est_test = model.predict(x_test)
            print(model.predict(local_test))

            # Cálculo del error
            error[j] = error_class(y_est_train, y_train)
            error2[j] = error_class(y_est_test, y_test)
            
            # Medimos Sensibilidad y Especificidad
            tp, fp, tn, fn = get_meassure(y_test, y_est_test)
            sensitive = get_sensitive(tp, fn)
            specificity = get_specificity(tn, fp)

            j += 1

        print('-------------------------------------')
        print('Entrenamiento: {0} +- {1}'.format(str(np.mean(error)), str(np.std(error))))
        print('Validación: {0} +- {1}'.format(str(np.mean(error2)), str(np.std(error2))))
        print('-------------------------------------')
        print('Sensibilidad: {0}, Especificidad: {1}'.format(sensitive, specificity))
        print('-------------------------------------')

-------------------------------------
Número de vecinos: 2
Número de folds: 2
-------------------------------------
[0 1 1 1 0 0 0 0 0 0]
[1 0 1 0 0 0 0 0 0 0]
-------------------------------------
Entrenamiento: 0.08857142857142858 +- 0.002857142857142836
Validación: 0.26857142857142857 +- 0.005714285714285672
-------------------------------------
Sensibilidad: 0.6685714285714286, Especificidad: 0.8057142857142857
-------------------------------------
-------------------------------------
Número de vecinos: 2
Número de folds: 5
-------------------------------------
[1 0 1 0 0 0 0 0 0 0]
[0 0 1 1 0 0 0 0 0 0]
[1 0 1 0 0 0 0 0 0 0]
[1 0 1 0 0 0 0 0 0 0]
[1 0 1 0 0 0 0 0 0 0]
-------------------------------------
Entrenamiento: 0.07464285714285712 +- 0.0066240132110683735
Validación: 0.24571428571428572 +- 0.01603567451474549
-------------------------------------
Sensibilidad: 0.6714285714285714, Especificidad: 0.8285714285714286
-------------------------------------
--------------------

[1 1 1 1 0 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
-------------------------------------
Entrenamiento: 0.1900983721381087 +- 0.0051394941364119075
Validación: 0.21865942028985502 +- 0.06566439670376138
-------------------------------------
Sensibilidad: 0.8260869565217391, Especificidad: 0.6956521739130435
-------------------------------------
-------------------------------------
Número de vecinos: 10
Número de folds: 20
-------------------------------------
[1 1 1 1 0 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 0 0 0 0 0 0]
[1 1 1 1 0 0 0 0 0 0]
[1 1 1 1 0 0 0 0 0 0]
[1 1 1 1 0 0 0 0 0 0]
[1 1 1 1 0 0 0 0 0 0]
[1 1 1 1 0 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 0 0 0 0 0 0]
[1 1 1 1 0 0 0 0 0 0]
[1 1 1 1 0 0 0 0 0 0]
[1 1 1 1 0 0 0 0 0 0]
[1 1 1 1 0 0 0 0 0 0]
[1 1 1 1 0 0 0 0 0 0]
[1 1 1 1 0 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 0 0 0 0 0 0]
[1 1 1 1 0 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
-------------------------------------
Entrenamiento: 0.18886944173088752 +- 0.004862484689169046
Valid