In [1]:
import numpy as np
import pandas as pd
import scipy as sc
import warnings

from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from meassures import get_meassure
from meassures import get_sensitive
from meassures import get_specificity
from sklearn import metrics
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from test_data import local_test

warnings.filterwarnings('ignore')

In [2]:
# Se cargan los datos
data = pd.read_csv('full_dataset.csv')
data.head()

Unnamed: 0,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,thyroid_surgery,query_hypothyroid,query_hyperthyroid,pregnant,sick,tumor,lithium,...,T4U_measured,FTI_measured,age,sex,TSH,T3,TT4,T4U,FTI,classes
0,0,0,0,0,0,0,0,0,0,0,...,1,1,80,1,1.4,0.8,105.0,0.88,120.0,1
1,0,0,0,0,0,0,0,0,0,0,...,1,1,74,0,0.0,0.7,98.0,0.81,121.0,1
2,0,0,0,0,0,0,0,0,0,0,...,1,1,32,0,1.4,1.1,121.0,1.11,109.0,1
3,0,0,0,0,0,0,0,0,0,0,...,1,1,42,0,2.3,1.1,93.0,0.73,127.0,1
4,0,0,0,0,0,0,0,0,0,0,...,1,1,89,1,0.8,0.8,111.0,0.68,165.0,1


In [3]:
# Se separan características de etiquetas
features = data.iloc[:, 0:-1]
target = data.iloc[:, -1]

In [4]:
# Número de muestras
print(features.shape)
print(target.shape)
print(len(data[data.classes == 0]))
print(len(data[data.classes == 1]))

(3152, 23)
(3152,)
2864
288


In [5]:
# Convertimos en numpy array
features = np.array(features)
target = np.array(target)

# Modelo sin hacer sobre y sub muestreo, tomando 30% de prueba y 70% de entrenamiento

In [8]:
# Normalizamos los datos de entrenamientohttp://localhost:8888/notebooks/ANN.ipynb#Modelo-sin-hacer-sobre-y-sub-muestreo
features = StandardScaler().fit_transform(features)

# Realiamos la partición del conjunto de datos para entrenamiento y pruebas
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=.3, random_state=42)

# Definimos el modelo y lo entrenamos
model = MLPClassifier(hidden_layer_sizes=(36, 36), activation='tanh', max_iter=500)
model.fit(x_train, y_train)

# Realizamos predicciones
y_est_train = model.predict(x_train)
y_est_test = model.predict(x_test)

# Medimos la exactitud de clasificación
score1 = model.score(x_train, y_train)
score2 = model.score(x_test, y_test)

print('-------------------------------------')
print('Entrenamiento: '.format(score1))
print('Validación: '.format(score2))
print('-------------------------------------')

# Medimos Sensibilidad y Especificidad
tp, fp, tn, fn = get_meassure(y_test, y_est_test)
sensitive = get_sensitive(tp, fn)
specificity = get_specificity(tn, fp)

# Predecimos la muestra local
print(model.predict(local_test))

print('-------------------------------------')
print('Sensibilidad: {0}, Especificidad: {1}'.format(sensitive, specificity))
print('-------------------------------------')

-------------------------------------
Entrenamiento: 
Validación: 
-------------------------------------
[0 0 0 0 0 0 0 0 0 0]
-------------------------------------
Sensibilidad: 0.72, Especificidad: 0.983451536643026
-------------------------------------


Aunque la tasa de aciertos que el modelo dice tener es alta, se puede observar que los datos están en general mal clasificados

# Modelo con sobre y sub muestreo utilizando validación estratificada

In [9]:
# Sobre y sub muestreo

# Se separan los datos en conjuntos por clase
major_class = data[data.classes == 0]
minor_class = data[data.classes == 1]

# Se hace un sobremuestreo sobre la clase minoritaria
minor_class_upsampled = resample(minor_class, replace=True, n_samples=350, random_state=42)

# Se unen los conjuntos en un nuevo conjunto único de clases
data_new = pd.concat([major_class, minor_class_upsampled])

# Se separan características de etiquetas
features = data_new.iloc[:, 0:-1]
target = data_new.iloc[:, -1]

# Se hace sub muestreo de conjunto mayoritario
rus = RandomUnderSampler(random_state=0)
features, target = rus.fit_resample(features, target)
print('Tamaño de los nuevos conjuntos de clases: {0}'.format(Counter(target)))

Tamaño de los nuevos conjuntos de clases: Counter({0: 350, 1: 350})


In [11]:
activation_functions = ['logistic', 'tanh']
neurons_config = [20, 30, (20, 20), (35, 35), (35, 35, 20)]

for activation in activation_functions:
    for neurons in neurons_config:
        
        print('-------------------------------------')
        print('Función de activación: {0}'.format(activation))
        print('Número de neuronas: {0}'.format(neurons))
        print('-------------------------------------')
        
        fold = 3
        np.random.seed(12345)
        efficiency_train = np.zeros(fold)
        efficiency_validation = np.zeros(fold)

        skf = StratifiedKFold(n_splits=fold, random_state=None, shuffle=False)

        j = 0
        for train, test in skf.split(features, target):
            x_train = features[train, :]
            y_train = target[train]
            x_test = features[test, :]
            y_test = target[test]

            # ANN model
            model = MLPClassifier(hidden_layer_sizes=neurons, activation=activation, max_iter=500)
            model.fit(x_train, y_train)

            # Realizamos predicciones
            y_est_train = model.predict(x_train)
            y_est_test = model.predict(x_test)
            print(model.predict(local_test))

            # Evaluamos las predicciones del modelo con los datos de test
            efficiency_train[j] = np.mean(y_est_train == y_train)
            efficiency_validation[j] = np.mean(y_est_test == y_test)
            
            # Medimos Sensibilidad y Especificidad
            tp, fp, tn, fn = get_meassure(y_test, y_est_test)
            sensitive = get_sensitive(tp, fn)
            specificity = get_specificity(tn, fp)
            
            j += 1

        print('-------------------------------------')
        print('Entrenamiento: {0} +- {1}'.format(str(np.mean(efficiency_train)), str(np.std(efficiency_train))))
        print('Validación: {0} +- {1}'.format(str(np.mean(efficiency_validation)), str(np.std(efficiency_validation))))
        print('-------------------------------------')
        print('Sensibilidad: {0}, Especificidad: {1}'.format(sensitive, specificity))
        print('-------------------------------------')

-------------------------------------
Función de activación: logistic
Número de neuronas: 20
-------------------------------------
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
-------------------------------------
Entrenamiento: 0.9149731606813152 +- 0.013399832138961225
Validación: 0.8957289517634345 +- 0.011079326638343807
-------------------------------------
Sensibilidad: 0.9051724137931034, Especificidad: 0.896551724137931
-------------------------------------
-------------------------------------
Función de activación: logistic
Número de neuronas: 30
-------------------------------------
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
-------------------------------------
Entrenamiento: 0.9271395033197608 +- 0.011509250769866157
Validación: 0.9028514588859416 +- 0.0054196950367370175
-------------------------------------
Sensibilidad: 0.9137931034482759, Especificidad: 0.8879310344827587
-------------------------------------
----------------

En este caso vemos que el modelo realiza predicciones más acertadas