In [63]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings

from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from meassures import get_meassure
from meassures import get_sensitive
from meassures import get_specificity
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.utils import resample

warnings.filterwarnings('ignore')

In [64]:
# Se cargan los datos
data = pd.read_csv('full_dataset.csv')
data.head()

Unnamed: 0,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,thyroid_surgery,query_hypothyroid,query_hyperthyroid,pregnant,sick,tumor,lithium,...,T4U_measured,FTI_measured,age,sex,TSH,T3,TT4,T4U,FTI,classes
0,0,0,0,0,0,0,0,0,0,0,...,1,1,80,1,1.4,0.8,105.0,0.88,120.0,1
1,0,0,0,0,0,0,0,0,0,0,...,1,1,74,0,0.0,0.7,98.0,0.81,121.0,1
2,0,0,0,0,0,0,0,0,0,0,...,1,1,32,0,1.4,1.1,121.0,1.11,109.0,1
3,0,0,0,0,0,0,0,0,0,0,...,1,1,42,0,2.3,1.1,93.0,0.73,127.0,1
4,0,0,0,0,0,0,0,0,0,0,...,1,1,89,1,0.8,0.8,111.0,0.68,165.0,1


In [65]:
# Sobre y sub muestreo

# Se separan los datos en conjuntos por clase
major_class = data[data.classes == 0]
minor_class = data[data.classes == 1]

# Se hace un sobremuestreo sobre la clase minoritaria
minor_class_upsampled = resample(minor_class, replace=True, n_samples=380, random_state=42)

# Se unen los conjuntos en un nuevo conjunto único de clases
data_new = pd.concat([major_class, minor_class_upsampled])

# Se separan características de etiquetas
features = data_new.iloc[:, 0:-1]
target = data_new.iloc[:, -1]

# Se hace sub muestreo de conjunto mayoritario
rus = RandomUnderSampler(random_state=0)
features, target = rus.fit_resample(features, target)
print('Tamaño de los nuevos conjuntos de clases: {0}'.format(Counter(target)))

Tamaño de los nuevos conjuntos de clases: Counter({0: 380, 1: 380})


In [66]:
pca = PCA(.93)
pca = pca.fit(features)
print("Explained Variance: {0}".format(pca.explained_variance_ratio_))
print(pca.components_)

Explained Variance: [0.73008277 0.13584966 0.07637629]
[[ 6.64502672e-04  8.67051428e-06  1.08290465e-04  3.61655605e-04
  -1.43533672e-04  2.47721001e-04  2.05906722e-05 -1.59899289e-04
  -1.38659130e-05  0.00000000e+00  2.32739239e-05 -3.55526898e-04
  -6.14162081e-04 -3.29916554e-04 -3.29916554e-04 -3.29916554e-04
  -9.95433856e-04 -7.47367674e-04 -1.05582613e-01  6.64520657e-03
   6.91430986e-01  3.32942169e-04  7.14652476e-01]
 [-1.38752592e-03  1.59114388e-04 -3.93985402e-04  3.11332170e-04
  -1.04238811e-03 -9.64105660e-04 -1.12338983e-03  1.97416617e-03
  -4.31437242e-05  0.00000000e+00 -8.89744455e-04  2.70249630e-03
   3.96017885e-03  1.54603878e-03  1.54603878e-03  1.54603878e-03
   7.93251577e-01  9.12557922e-04 -7.81584925e-02 -2.17710796e-02
  -4.38732598e-01 -6.25389302e-03  4.14249491e-01]
 [-5.73132156e-04  4.61226886e-04  1.50478031e-04 -1.74992453e-04
  -2.66810290e-04  9.66147359e-04 -5.47846116e-04 -3.40029547e-04
   6.93841749e-04 -0.00000000e+00 -6.42887191e-04 -

# RandomForest Classifier

In [67]:
for n_component in [1, 2, 5, 10]:
    pca = PCA(n_components=n_component)
    new_features = pca.fit_transform(features)
    fold = 6
    np.random.seed(12345)
    efficiency_train = np.zeros(fold)
    efficiency_validation = np.zeros(fold)

    skf = StratifiedKFold(n_splits=fold, random_state=None, shuffle=False)

    j = 0
    for train, test in skf.split(new_features, target):
        x_train = features[train, :]
        y_train = target[train]
        x_test = features[test, :]
        y_test = target[test]

        # Random Forest model
        model = RandomForestClassifier(n_estimators=20)
        model = model.fit(x_train, y_train)

        # Realizamos predicciones
        y_est_train = model.predict(x_train)
        y_est_test = model.predict(x_test)

        # Evaluamos las predicciones del modelo con los datos de test
        efficiency_train[j] = np.mean(y_est_train == y_train)
        efficiency_validation[j] = np.mean(y_est_test == y_test)
        
        # Medimos Sensibilidad y Especificidad
        tp, fp, tn, fn = get_meassure(y_test, y_est_test)
        sensitive = get_sensitive(tp, fn)
        specificity = get_specificity(tn, fp)
        
        j += 1
    
    print('-------------------------------------')
    print('Número de componentes: {0}'.format(n_component))
    print('-------------------------------------')
    print('-------------------------------------')
    print('Entrenamiento: {0} +- {1}'.format(str(np.mean(efficiency_train)), str(np.std(efficiency_train))))
    print('Validación: {0} +- {1}'.format(str(np.mean(efficiency_validation)), str(np.std(efficiency_validation))))
    print('-------------------------------------')
    print('Sensibilidad: {0}, Especificidad: {1}'.format(sensitive, specificity))
    print('-------------------------------------')

-------------------------------------
Número de componentes: 1
-------------------------------------
-------------------------------------
Entrenamiento: 0.9986847622090006 +- 0.0010840902058346005
Validación: 0.9618055555555557 +- 0.01486440879342429
-------------------------------------
Sensibilidad: 0.9047619047619048, Especificidad: 0.9523809523809523
-------------------------------------
-------------------------------------
Número de componentes: 2
-------------------------------------
-------------------------------------
Entrenamiento: 0.9986847622090006 +- 0.0010840902058346005
Validación: 0.9618055555555557 +- 0.01486440879342429
-------------------------------------
Sensibilidad: 0.9047619047619048, Especificidad: 0.9523809523809523
-------------------------------------
-------------------------------------
Número de componentes: 5
-------------------------------------
-------------------------------------
Entrenamiento: 0.9986847622090006 +- 0.0010840902058346005
Validación

# ANN

In [55]:
for n_component in [1, 2, 5, 10]:
    pca = PCA(n_components=n_component)
    new_features = pca.fit_transform(features)
    fold = 3
    np.random.seed(12345)
    efficiency_train = np.zeros(fold)
    efficiency_validation = np.zeros(fold)

    skf = StratifiedKFold(n_splits=fold, random_state=None, shuffle=False)

    j = 0
    for train, test in skf.split(new_features, target):
        x_train = features[train, :]
        y_train = target[train]
        x_test = features[test, :]
        y_test = target[test]

        # ANN model
        model = MLPClassifier(hidden_layer_sizes=(20, 20), activation='logistic', max_iter=500)
        model.fit(x_train, y_train)

        # Realizamos predicciones
        y_est_train = model.predict(x_train)
        y_est_test = model.predict(x_test)

        # Evaluamos las predicciones del modelo con los datos de test
        efficiency_train[j] = np.mean(y_est_train == y_train)
        efficiency_validation[j] = np.mean(y_est_test == y_test)
        
        # Medimos Sensibilidad y Especificidad
        tp, fp, tn, fn = get_meassure(y_test, y_est_test)
        sensitive = get_sensitive(tp, fn)
        specificity = get_specificity(tn, fp)
        
        j += 1

    
    print('-------------------------------------')
    print('Número de componentes: {0}'.format(n_component))
    print('-------------------------------------')
    print('-------------------------------------')
    print('Entrenamiento: {0} +- {1}'.format(str(np.mean(efficiency_train)), str(np.std(efficiency_train))))
    print('Validación: {0} +- {1}'.format(str(np.mean(efficiency_validation)), str(np.std(efficiency_validation))))
    print('-------------------------------------')
    print('Sensibilidad: {0}, Especificidad: {1}'.format(sensitive, specificity))
    print('-------------------------------------')

-------------------------------------
Número de componentes: 1
-------------------------------------
-------------------------------------
Entrenamiento: 0.9493142655587024 +- 0.015305208096104006
Validación: 0.9078865141857267 +- 0.006798162106472928
-------------------------------------
Sensibilidad: 0.9285714285714286, Especificidad: 0.8809523809523809
-------------------------------------
-------------------------------------
Número de componentes: 2
-------------------------------------
-------------------------------------
Entrenamiento: 0.9493142655587024 +- 0.015305208096104006
Validación: 0.9078865141857267 +- 0.006798162106472928
-------------------------------------
Sensibilidad: 0.9285714285714286, Especificidad: 0.8809523809523809
-------------------------------------
-------------------------------------
Número de componentes: 5
-------------------------------------
-------------------------------------
Entrenamiento: 0.9493142655587024 +- 0.015305208096104006
Validación:

# SVM

In [57]:
for n_component in [1, 2, 5, 10]:
    pca = PCA(n_components=n_component)
    new_features = pca.fit_transform(features)
    fold = 6
    np.random.seed(12345)
    efficiency_train = np.zeros(fold)
    efficiency_validation = np.zeros(fold)
    support_vectors_percent = np.zeros(fold)

    skf = StratifiedKFold(n_splits=fold, random_state=None, shuffle=False)

    j = 0
    for train, test in skf.split(new_features, target):
        x_train = features[train, :]
        y_train = target[train]
        x_test = features[test, :]
        y_test = target[test]

        # Random Forest model
        model = SVC(gamma=1, C=0.01, kernel='linear', decision_function_shape='ovo')
        model = model.fit(x_train, y_train)

        # Realizamos predicciones
        y_est_train = model.predict(x_train)
        y_est_test = model.predict(x_test)

        # Evaluamos las predicciones del modelo con los datos de test
        efficiency_train[j] = np.mean(y_est_train.ravel() == y_train.ravel())
        efficiency_validation[j] = np.mean(y_est_test.ravel() == y_test.ravel())
        support_vectors_percent[j] = len(model.support_vectors_) / len(x_train)
        
        # Medimos Sensibilidad y Especificidad
        tp, fp, tn, fn = get_meassure(y_test, y_est_test)
        sensitive = get_sensitive(tp, fn)
        specificity = get_specificity(tn, fp)
        
        j += 1
    
    print('-------------------------------------')
    print('Número de componentes: {0}'.format(n_component))
    print('-------------------------------------')
    print('-------------------------------------')
    print('Entrenamiento: {0} +- {1}'.format(str(np.mean(efficiency_train)), str(np.std(efficiency_train))))
    print('Validación: {0} +- {1}'.format(str(np.mean(efficiency_validation)), str(np.std(efficiency_validation))))
    print('% de Vectores de Soporte = {0}'.format(np.mean(support_vectors_percent))) 
    print('-------------------------------------')
    print('Sensibilidad: {0}, Especificidad: {1}'.format(sensitive, specificity))
    print('-------------------------------------')

-------------------------------------
Número de componentes: 1
-------------------------------------
-------------------------------------
Entrenamiento: 0.8681609298672951 +- 0.007625508792025321
Validación: 0.863095238095238 +- 0.03898611779252276
% de Vectores de Soporte = 0.556317300909103
-------------------------------------
Sensibilidad: 0.9365079365079365, Especificidad: 0.7777777777777778
-------------------------------------
-------------------------------------
Número de componentes: 2
-------------------------------------
-------------------------------------
Entrenamiento: 0.8681609298672951 +- 0.007625508792025321
Validación: 0.863095238095238 +- 0.03898611779252276
% de Vectores de Soporte = 0.556317300909103
-------------------------------------
Sensibilidad: 0.9365079365079365, Especificidad: 0.7777777777777778
-------------------------------------
-------------------------------------
Número de componentes: 5
-------------------------------------
---------------------