In [82]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings

from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from meassures import get_meassure
from meassures import get_sensitive
from meassures import get_specificity
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

warnings.filterwarnings('ignore')

In [83]:
# Se cargan los datos
data = pd.read_csv('full_dataset.csv')
data.head()

Unnamed: 0,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,thyroid_surgery,query_hypothyroid,query_hyperthyroid,pregnant,sick,tumor,lithium,...,T4U_measured,FTI_measured,age,sex,TSH,T3,TT4,T4U,FTI,classes
0,0,0,0,0,0,0,0,0,0,0,...,1,1,80,1,1.4,0.8,105.0,0.88,120.0,1
1,0,0,0,0,0,0,0,0,0,0,...,1,1,74,0,0.0,0.7,98.0,0.81,121.0,1
2,0,0,0,0,0,0,0,0,0,0,...,1,1,32,0,1.4,1.1,121.0,1.11,109.0,1
3,0,0,0,0,0,0,0,0,0,0,...,1,1,42,0,2.3,1.1,93.0,0.73,127.0,1
4,0,0,0,0,0,0,0,0,0,0,...,1,1,89,1,0.8,0.8,111.0,0.68,165.0,1


In [84]:
# Sobre y sub muestreo

# Se separan los datos en conjuntos por clase
major_class = data[data.classes == 0]
minor_class = data[data.classes == 1]

# Se hace un sobremuestreo sobre la clase minoritaria
minor_class_upsampled = resample(minor_class, replace=True, n_samples=380, random_state=42)

# Se unen los conjuntos en un nuevo conjunto único de clases
data_new = pd.concat([major_class, minor_class_upsampled])

# Se separan características de etiquetas
features = data_new.iloc[:, 0:-1]
target = data_new.iloc[:, -1]

# Se hace sub muestreo de conjunto mayoritario
rus = RandomUnderSampler(random_state=0)
features, target = rus.fit_resample(features, target)
print('Tamaño de los nuevos conjuntos de clases: {0}'.format(Counter(target)))

Tamaño de los nuevos conjuntos de clases: Counter({0: 380, 1: 380})


In [85]:
model1 = LinearDiscriminantAnalysis(solver='eigen', shrinkage = 'auto')
model2 = LinearDiscriminantAnalysis()

model1.fit(features, target)
model2.fit(features, target)
model_transform1 = model1.transform(features)
model_transform2 = model2.transform(features)

print('Número original de atributos con model1:', features.shape[1])
print('Número reducido de atributos con model2:', model_transform1.shape[1])
print('------------------------------------')
print('Número original de atributos con model2:', features.shape[1])
print('Número reducido de atributos con model2:', model_transform2.shape[1])

Número original de atributos con model1: 23
Número reducido de atributos con model2: 1
------------------------------------
Número original de atributos con model2: 23
Número reducido de atributos con model2: 1


In [86]:
ratio1 = model1.explained_variance_ratio_
ratio2 = model2.explained_variance_ratio_

print(ratio1)
print(ratio2)

[0.22323773]
[1.]


In [87]:
def select_n_components(ratio, wish_variance):
    """
    select_n_components ayuda a seleccionar 
    el número de características adecuado con
    base al porcentaje objetivo que el usuario
    quiera recuperar de la información original
    """
    variance = 0
    components = 0
    
    for explained_variance in ratio:
        variance += explained_variance
        components += 1
        print('Variance: {0}'.format(variance))
        print('components: {0}'.format(components))
        if variance >= wish_variance:
            break
            
    return components

In [88]:
select_n_components(ratio1, 93)
print('----------------------')
select_n_components(ratio2, 93)

Variance: 0.22323773053359727
components: 1
----------------------
Variance: 1.0
components: 1


1

In [89]:
# Definimos un pequeño conjunto para hacer pruebas
# Primeras 5 muestras pertenecen a la clase 1
# ültimas 5 muestgras pertenecen a la clase 0
# Los datos no han sido usados ni para entrenamiento ni validación
local_test = [[0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,72,1,0.8,1.0,83.0,0.95,87.0],
[0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,45,0,1.9,1.0,82.0,0.73,112.0],
[0,0,0,0,0,0,0,1,0,0,0,1,1,1,1,1,64,0,0.09,1.0,101.0,0.82,123.0],
[0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,56,1,0.0,0.8,76.0,0.77,99.0],
[1,0,0,0,1,0,0,0,0,0,0,1,1,1,1,1,78,0,2.6,0.3,87.0,0.95,91.0], 
[1,0,0,0,1,0,0,0,0,0,0,0,0,1,1,1,23,0,0.0,2.8,171.0,1.29,133],
[0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,69,0,2.6,1.8,126.0,1.02,124.0],
[0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,58,0,5.8,1.7,86.0,0.91,95.0],
[0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,29,0,0.8,1.8,99.0,1.01,98.0],
[1,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,56,0,0.0,1.8,139.0,0.97,143.0]]

In [90]:
features = np.array(features)
target = np.array(target)

fold = 6
np.random.seed(12345)
efficiency_train = np.zeros(fold)
efficiency_validation = np.zeros(fold)

skf = StratifiedKFold(n_splits=fold, random_state=None, shuffle=False)

j = 0
for train, test in skf.split(features, target):
    x_train = features[train, :]
    y_train = target[train]
    x_test = features[test, :]
    y_test = target[test]

    # Random Forest model
    model = RandomForestClassifier(n_estimators=20)
    model = model.fit(x_train, y_train)

    # Realizamos predicciones
    y_est_train = model.predict(x_train)
    y_est_test = model.predict(x_test)
    print(model.predict(local_test))

    # Evaluamos las predicciones del modelo con los datos de test
    efficiency_train[j] = np.mean(y_est_train == y_train)
    efficiency_validation[j] = np.mean(y_est_test == y_test)
    
    # Medimos Sensibilidad y Especificidad
    tp, fp, tn, fn = get_meassure(y_test, y_est_test)
    sensitive = get_sensitive(tp, fn)
    specificity = get_specificity(tn, fp)
    
    j += 1

print('-------------------------------------')
print('Entrenamiento: {0} +- {1}'.format(str(np.mean(efficiency_train)), str(np.std(efficiency_train))))
print('Validación: {0} +- {1}'.format(str(np.mean(efficiency_validation)), str(np.std(efficiency_validation))))
print('-------------------------------------')
print('Sensibilidad: {0}, Especificidad: {1}'.format(sensitive, specificity))
print('-------------------------------------')

[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
[1 1 1 1 1 0 0 0 0 0]
-------------------------------------
Entrenamiento: 0.9986847622090006 +- 0.0010840902058346005
Validación: 0.9618055555555557 +- 0.01486440879342429
-------------------------------------
Sensibilidad: 0.9047619047619048, Especificidad: 0.9523809523809523
-------------------------------------
