In [1]:
import numpy as np
import pandas as pd
import scipy as sc
import warnings

from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from meassures import get_meassure
from meassures import get_sensitive
from meassures import get_specificity
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.utils import resample
from test_data import local_test

warnings.filterwarnings('ignore')

In [2]:
# Se cargan los datos
data = pd.read_csv('full_dataset.csv')
data.head()

Unnamed: 0,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,thyroid_surgery,query_hypothyroid,query_hyperthyroid,pregnant,sick,tumor,lithium,...,T4U_measured,FTI_measured,age,sex,TSH,T3,TT4,T4U,FTI,classes
0,0,0,0,0,0,0,0,0,0,0,...,1,1,80,1,1.4,0.8,105.0,0.88,120.0,1
1,0,0,0,0,0,0,0,0,0,0,...,1,1,74,0,0.0,0.7,98.0,0.81,121.0,1
2,0,0,0,0,0,0,0,0,0,0,...,1,1,32,0,1.4,1.1,121.0,1.11,109.0,1
3,0,0,0,0,0,0,0,0,0,0,...,1,1,42,0,2.3,1.1,93.0,0.73,127.0,1
4,0,0,0,0,0,0,0,0,0,0,...,1,1,89,1,0.8,0.8,111.0,0.68,165.0,1


In [3]:
# Se separan características de etiquetas
features = data.iloc[:, 0:-1]
target = data.iloc[:, -1]

In [4]:
# Número de muestras
print(features.shape)
print(target.shape)
print(len(data[data.classes == 0]))
print(len(data[data.classes == 1]))

(3152, 23)
(3152,)
2864
288


In [5]:
# Convertimos en numpy array
features = np.array(features)
target = np.array(target)

In [6]:
kernels = ['rbf', 'linear']
gamma_values = [0.1, 1]
c_values = [0.01, 1, 10]

for kernel in kernels:
    for gamma_value in gamma_values:
        for c_value in c_values:
            
            print('-------------------------------------')
            print('Kernel: {0}'.format(kernel))
            print('Gamma: {0}'.format(gamma_value))
            print('C: {0}'.format(c_value))
            print('-------------------------------------')

            fold = 6
            np.random.seed(12345)
            efficiency_train = np.zeros(fold)
            efficiency_validation = np.zeros(fold)
            support_vectors_percent = np.zeros(fold)

            skf = StratifiedKFold(n_splits=fold, random_state=None, shuffle=False)

            j = 0
            for train, test in skf.split(features, target):
                x_train = features[train, :]
                y_train = target[train]
                x_test = features[test, :]
                y_test = target[test]
                
                 #Normalizamos los datos
                scaler = preprocessing.StandardScaler().fit(x_train)
                x_train = scaler.transform(x_train)
                x_test = scaler.transform(x_test)

                # Random Forest model
                model = SVC(gamma=gamma_value, C=c_value, kernel=kernel, decision_function_shape='ovo')
                model = model.fit(x_train, y_train)

                # Realizamos predicciones
                y_est_train = model.predict(x_train)
                y_est_test = model.predict(x_test)
                print(model.predict(local_test))
                
                # Medimos Sensibilidad y Especificidad
                tp, fp, tn, fn = get_meassure(y_test, y_est_test)
                sensitive = get_sensitive(tp, fn)
                specificity = get_specificity(tn, fp)

                # Evaluamos las predicciones del modelo con los datos de test
                efficiency_train[j] = np.mean(y_est_train.ravel() == y_train.ravel())
                efficiency_validation[j] = np.mean(y_est_test.ravel() == y_test.ravel())
                support_vectors_percent[j] = len(model.support_vectors_) / len(x_train)
                j += 1

            print('-------------------------------------')
            print('Entrenamiento: {0} +- {1}'.format(str(np.mean(efficiency_train)), str(np.std(efficiency_train))))
            print('Validación: {0} +- {1}'.format(str(np.mean(efficiency_validation)), str(np.std(efficiency_validation))))
            print('% de Vectores de Soporte = {0}'.format(np.mean(support_vectors_percent))) 
            print('-------------------------------------')
            print('-------------------------------------')
            print('Sensibilidad: {0}, Especificidad: {1}'.format(sensitive, specificity))
            print('-------------------------------------')

-------------------------------------
Kernel: rbf
Gamma: 0.1
C: 0.01
-------------------------------------
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
-------------------------------------
Entrenamiento: 0.9086294386810355 +- 1.6400239499796167e-05
Validación: 0.9086293680970486 +- 8.193886292944893e-05
% de Vectores de Soporte = 0.21186531993950766
-------------------------------------
-------------------------------------
Sensibilidad: 0.0, Especificidad: 1.0
-------------------------------------
-------------------------------------
Kernel: rbf
Gamma: 0.1
C: 1
-------------------------------------
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
-------------------------------------
Entrenamiento: 0.9595810075868646 +- 0.0021283558868342056
Validación: 0.9536731245096265 +- 0.009138496261562145
% de Vectores de Sop

In [7]:
# Sobre y sub muestreo

# Se separan los datos en conjuntos por clase
major_class = data[data.classes == 0]
minor_class = data[data.classes == 1]

# Se hace un sobremuestreo sobre la clase minoritaria
minor_class_upsampled = resample(minor_class, replace=True, n_samples=380, random_state=42)

# Se unen los conjuntos en un nuevo conjunto único de clases
data_new = pd.concat([major_class, minor_class_upsampled])

# Se separan características de etiquetas
features = data_new.iloc[:, 0:-1]
target = data_new.iloc[:, -1]

# Se hace sub muestreo de conjunto mayoritario
rus = RandomUnderSampler(random_state=0)
features, target = rus.fit_resample(features, target)
print('Tamaño de los nuevos conjuntos de clases: {0}'.format(Counter(target)))

Tamaño de los nuevos conjuntos de clases: Counter({0: 380, 1: 380})


In [8]:
# Número de muestras
print(features.shape)
print(target.shape)
print(np.unique(target, return_counts=True))

(760, 23)
(760,)
(array([0, 1]), array([380, 380]))


In [9]:
kernels = ['rbf', 'linear']
gamma_values = [0.1, 1]
c_values = [0.01, 1, 10]

for kernel in kernels:
    for gamma_value in gamma_values:
        for c_value in c_values:
            
            print('-------------------------------------')
            print('Kernel: {0}'.format(kernel))
            print('Gamma: {0}'.format(gamma_value))
            print('C: {0}'.format(c_value))
            print('-------------------------------------')

            fold = 6
            np.random.seed(12345)
            efficiency_train = np.zeros(fold)
            efficiency_validation = np.zeros(fold)
            support_vectors_percent = np.zeros(fold)

            skf = StratifiedKFold(n_splits=fold, random_state=None, shuffle=False)

            j = 0
            for train, test in skf.split(features, target):
                x_train = features[train, :]
                y_train = target[train]
                x_test = features[test, :]
                y_test = target[test]

                # Random Forest model
                model = SVC(gamma=gamma_value, C=c_value, kernel=kernel, decision_function_shape='ovo')
                model = model.fit(x_train, y_train)

                # Realizamos predicciones
                y_est_train = model.predict(x_train)
                y_est_test = model.predict(x_test)
                print(model.predict(local_test))

                # Evaluamos las predicciones del modelo con los datos de test
                efficiency_train[j] = np.mean(y_est_train.ravel() == y_train.ravel())
                efficiency_validation[j] = np.mean(y_est_test.ravel() == y_test.ravel())
                support_vectors_percent[j] = len(model.support_vectors_) / len(x_train)
                
                # Medimos Sensibilidad y Especificidad
                tp, fp, tn, fn = get_meassure(y_test, y_est_test)
                sensitive = get_sensitive(tp, fn)
                specificity = get_specificity(tn, fp)
                
                j += 1

            print('-------------------------------------')
            print('Entrenamiento: {0} +- {1}'.format(str(np.mean(efficiency_train)), str(np.std(efficiency_train))))
            print('Validación: {0} +- {1}'.format(str(np.mean(efficiency_validation)), str(np.std(efficiency_validation))))
            print('% de Vectores de Soporte = {0}'.format(np.mean(support_vectors_percent))) 
            print('-------------------------------------')
            print('-------------------------------------')
            print('Sensibilidad: {0}, Especificidad: {1}'.format(sensitive, specificity))
            print('-------------------------------------')

-------------------------------------
Kernel: rbf
Gamma: 0.1
C: 0.01
-------------------------------------
[0 0 0 0 0 0 0 0 0 0]
[0 0 1 0 0 0 0 0 0 0]
[0 0 1 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 1 0 0 0 0 0 0 0]
[0 0 1 0 0 0 0 0 0 0]
-------------------------------------
Entrenamiento: 0.7902682386295572 +- 0.05295638822806953
Validación: 0.665798611111111 +- 0.03896079920379523
% de Vectores de Soporte = 1.0
-------------------------------------
-------------------------------------
Sensibilidad: 0.3333333333333333, Especificidad: 1.0
-------------------------------------
-------------------------------------
Kernel: rbf
Gamma: 0.1
C: 1
-------------------------------------
[0 0 1 0 0 0 0 0 0 0]
[0 0 1 1 0 0 0 0 0 0]
[0 0 1 1 0 0 0 0 0 0]
[0 0 1 1 0 0 0 0 0 0]
[0 0 1 1 0 0 0 0 0 0]
[0 0 1 1 0 0 0 0 0 0]
-------------------------------------
Entrenamiento: 0.9994734057421235 +- 0.0007447181352981928
Validación: 0.8381489748677248 +- 0.004125581264060201
% de Vectores de Soporte = 