1. Comprensión del problema

In [None]:
# Load the dataset description from the semicolon-separated CSV file
import pandas as pd
dataFrame = pd.read_csv('../Material/Database.csv', delimiter=';')
print(dataFrame)

In [None]:
# Count how many samples belong to each class
import numpy as np
clases, frec = np.unique(dataFrame['Class'], return_counts=True)
print(clases, frec)

In [None]:
# Pick a random sample from the database and display it
import random
import cv2
import matplotlib.pyplot as plt

# random.randint includes both endpoints, so randint(0, len(dataFrame)) could
# return len(dataFrame), which is not a valid row label; randrange excludes
# the upper bound.
num = random.randrange(len(dataFrame))
print(num)
name_img = dataFrame.ID[num]
print(name_img)

img = cv2.imread('../Material/Images/' + name_img)
plt.imshow(img, cmap='gray')
plt.show()

In [None]:
# Read the masks (as single-channel images) and draw their contours over the original image
rnfl_mask = cv2.imread('../Material/RNFL_masks/' + name_img, 0)
retina_mask = cv2.imread('../Material/Retina_masks/' + name_img, 0)

cont_rnfl, _ = cv2.findContours(rnfl_mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
cont_retina, _ = cv2.findContours(retina_mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

# Arguments: image, contours, contour index (-1 = all), colour, thickness.
# Work on a copy so the original `img` is left untouched.
image = cv2.drawContours(img.copy(), cont_rnfl, -1, (255, 0, 0), 3)
image = cv2.drawContours(image, cont_retina, -1, (0, 255, 0), 3)

plt.imshow(image, cmap='gray')
plt.show()

2. Partición de datos externa

In [None]:
import numpy as np
import pandas as pd

# Read the original dataset description (semicolon-separated CSV)
dataFrame = pd.read_csv('../Material/Database.csv', delimiter=';')
print(dataFrame)

In [None]:
# External partition of the data
from sklearn.model_selection import train_test_split, KFold  # the class is spelled KFold

# train, test = train_test_split(dataFrame, test_size=0.2, shuffle=True, random_state=42) # hold-out

kf = KFold(n_splits=5, shuffle=True, random_state=42)
bolsas = kf.split(dataFrame)

# Every iteration overwrites train/test, so after the loop they hold the
# split produced by the last of the 5 folds.
for k, (train_fold, test_fold) in enumerate(bolsas):
    train = dataFrame.iloc[train_fold]
    test = dataFrame.iloc[test_fold]

# np.unique returns the labels sorted together with their counts
lab_train, ocur_train = np.unique(train.Class, return_counts=True)
lab_test, ocur_test = np.unique(test.Class, return_counts=True)

# NOTE(review): positions 0 / 1 are printed as Glaucoma / Healthy, which assumes
# those are the only two class names; lab_train / lab_test hold the actual order.
print(' --- TRAIN --- \nGlaucoma: ', ocur_train[0], '\nHealthy: ', ocur_train[1])
print(' --- TEST --- \nGlaucoma: ', ocur_test[0], '\nHealthy: ', ocur_test[1])

In [None]:
# Show the current training partition
print(train)

In [None]:
# Shuffle the rows of both partitions.
# A fixed random_state (42, the same seed used for KFold) makes the shuffle,
# and therefore the partitions written to disk, reproducible between runs.
train = train.sample(frac=1, random_state=42)
test = test.sample(frac=1, random_state=42)

print(train)

In [None]:
# Save the train / test partitions to disk
import os

# Creates the folder when missing and is a no-op when it already exists
os.makedirs('../partitions', exist_ok=True)
train.to_csv('../partitions/train.csv', sep=';')
test.to_csv('../partitions/test.csv', sep=';')

3. Extracción de características

In [None]:
import os

import numpy as np
import pandas as pd

# NOTE: feature_extraction is defined in the cell that follows this one, so on
# a fresh kernel that cell has to be executed before this one.
df_train = pd.read_csv('../partitions/train.csv', sep=';')
df_test = pd.read_csv('../partitions/test.csv', sep=';')

# One row per sample: features followed by the class label in the last column
matriz_datos_train = feature_extraction(df_train)
matriz_datos_test = feature_extraction(df_test)

# print(np.shape(matriz_datos_test))

# Creates the folder when missing and is a no-op when it already exists
os.makedirs('../features', exist_ok=True)

np.save('../features/matriz_datos_train.npy', matriz_datos_train)
np.save('../features/matriz_datos_test.npy', matriz_datos_test)

In [None]:
def feature_extraction(df):
    """Build the feature matrix (one row per sample) for the images listed in ``df``.

    For every row the image and its RNFL / retina masks are read from
    ``../Material`` and the following values are computed, in this order:

    * 7 statistics of the per-column RNFL thickness (mean, median, standard
      deviation, skewness, kurtosis, minimum, maximum);
    * 6 GLCM texture properties of the retina bounding box (contrast,
      dissimilarity, homogeneity, ASM, energy, correlation);
    * the normalised histogram of uniform LBP codes (P + 2 = 10 bins);
    * the class label (0 for 'Healthy', 1 for anything else).

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``ID`` (image file name) and ``Class``.

    Returns
    -------
    numpy.ndarray
        One row per sample; the label is the last column.

    Raises
    ------
    FileNotFoundError
        If the image or one of its masks cannot be read.
    """
    import cv2
    import numpy as np
    from scipy import stats
    from skimage.feature import local_binary_pattern
    from skimage.measure import regionprops
    try:
        # Spelling used by recent scikit-image releases
        from skimage.feature import graycomatrix, graycoprops
    except ImportError:
        # Older releases only provide the 'grey' spelling
        from skimage.feature import greycomatrix as graycomatrix
        from skimage.feature import greycoprops as graycoprops

    R = 1      # LBP radius
    P = 8 * R  # LBP number of neighbours
    glcm_props = ('contrast', 'dissimilarity', 'homogeneity', 'ASM', 'energy', 'correlation')

    fingerprint = []
    for i in range(len(df)):
        # Positional access, so the function does not depend on the index labels of df
        file = df.ID.iloc[i]
        img = cv2.imread('../Material/Images/' + file)
        rnfl_mask = cv2.imread('../Material/RNFL_masks/' + file, 0)
        retina_mask = cv2.imread('../Material/Retina_masks/' + file, 0)
        # cv2.imread returns None instead of raising when a file cannot be read
        if img is None or rnfl_mask is None or retina_mask is None:
            raise FileNotFoundError('Could not read the image or masks for ' + str(file))

        # ONE-DIMENSIONAL STATISTICS of the RNFL thickness
        # Thickness of a column = distance between its first and last mask pixel (value 255).
        # Columns without any mask pixel are counted as thickness 0.
        thickness_rnfl = []
        for j in range(rnfl_mask.shape[1]):
            rows = np.flatnonzero(rnfl_mask[:, j] == 255)
            thickness_rnfl.append(rows[-1] - rows[0] if rows.size else 0)
        thickness_rnfl = np.array(thickness_rnfl)

        features_RNFL = [
            np.mean(thickness_rnfl),         # central tendency
            np.median(thickness_rnfl),
            np.std(thickness_rnfl),          # dispersion
            stats.skew(thickness_rnfl),      # shape of the distribution
            stats.kurtosis(thickness_rnfl),
            np.min(thickness_rnfl),          # extremes
            np.max(thickness_rnfl),
        ]

        # TWO-DIMENSIONAL FEATURES of the retina structure
        # Crop the first channel of the image to the bounding box of the retina mask
        prop = regionprops(retina_mask)
        bb = prop[0].bbox
        retina = img[bb[0]:bb[2], bb[1]:bb[3], 0]

        # Gray-Level Co-occurrence Matrix (GLCM).
        # `angles` is expressed in radians: np.pi / 2 is the 90-degree direction.
        GLCM = graycomatrix(retina, distances=[2], angles=[np.pi / 2], levels=256,
                            symmetric=True, normed=True)
        features_GLCM = [graycoprops(GLCM, name)[0, 0] for name in glcm_props]

        # Local Binary Patterns (LBP): the uniform method yields codes in [0, P + 1]
        lbp_image = np.uint8(local_binary_pattern(retina, P, R, method='uniform'))

        hist_lbp = cv2.calcHist([lbp_image.ravel()], [0], None, [P + 2], [0, P + 2])
        hist_lbp = hist_lbp.astype('float')
        hist_lbp /= (hist_lbp.sum() + 1e-7)  # normalise; epsilon avoids dividing by zero
        hist_lbp = hist_lbp.ravel().tolist()

        # Texture features (retina fingerprint)
        features_Retina = features_GLCM + hist_lbp

        # Class label
        etiqueta = [0] if df.Class.iloc[i] == 'Healthy' else [1]

        fingerprint.append(features_RNFL + features_Retina + etiqueta)

    matriz_datos = np.array(fingerprint)

    return matriz_datos

4. Selección de características

In [None]:
# Load the training feature matrix
import numpy as np
train_matrix = np.load('../features/matriz_datos_train.npy')

# Features are every column but the last one; the target is the last column
X_train, y_train = train_matrix[:, :-1], train_matrix[:, -1]

print(X_train.shape)

In [None]:
# Standardisation of the training data
# NOTE: this cell overwrites X_train; re-running it without reloading the data
# would fit the scaler on already-standardised values.
from sklearn.preprocessing import StandardScaler
estandarizador = StandardScaler()
estandarizador.fit(X_train)
mu = estandarizador.mean_
# scale_ is the divisor transform() actually applies. It equals sqrt(var_)
# except for zero-variance features, where it is 1 instead of 0, so reusing it
# later on the test set never divides by zero.
sigma = estandarizador.scale_
X_train = estandarizador.transform(X_train)

# print(X_train[0])

In [None]:
# FEATURE SELECTION
# Check whether each feature follows a standard normal distribution N(0,1)
from scipy.stats import kstest # Kolmogorov-Smirnov test

alpha = 0.01 # significance level (99% confidence)
h_norm = np.zeros(X_train.shape[1])
for i in range(X_train.shape[1]):
    pvalue = kstest(X_train[:, i], 'norm')[1]

    # Hypothesis test: 0 -> the data do NOT follow N(0,1), 1 -> they do
    h_norm[i] = 0 if pvalue <= alpha else 1

print('0: no normal  ---- 1: sí normal: ', h_norm)

In [None]:
def draw_boxplot(data1, data2, ticks):
    """Draw side-by-side box plots of two groups, one pair of boxes per feature.

    Parameters
    ----------
    data1 : 2-D array
        Samples of the first group (legend label 'Glaucoma'); one box per column.
    data2 : 2-D array
        Samples of the second group (legend label 'Healthy'); one box per column.
    ticks : sequence of str
        Feature names used as x tick labels, one per column.
    """
    import matplotlib.pyplot as plt
    import numpy as np

    def _box_style(color):
        # Same colour for every element of the box
        return dict(boxprops=dict(color=color),
                    capprops=dict(color=color),
                    whiskerprops=dict(color=color),
                    medianprops=dict(color=color))

    color1, color2 = 'red', 'blue'
    offset = 0.4  # shift each group to one side of the shared tick so the boxes do not overlap

    plt.boxplot(data1, positions=np.arange(np.shape(data1)[1]) * 2.0 - offset,
                sym='', widths=0.5, **_box_style(color1))
    plt.boxplot(data2, positions=np.arange(np.shape(data2)[1]) * 2.0 + offset,
                sym='', widths=0.5, **_box_style(color2))

    # Empty lines that only serve as legend entries, in the same colours as the boxes
    plt.plot([], c=color1, label='Glaucoma')
    plt.plot([], c=color2, label='Healthy')
    plt.legend()

    # One tick per feature, centred between the two boxes of the pair
    plt.xticks(range(0, len(ticks)*2, 2), ticks)
    plt.xlim(-2, len(ticks)*2)
    plt.grid(True)
    plt.title('Características')
    plt.show()

In [None]:
# Study the discriminative power of each feature according to its distribution
from scipy.stats import ttest_ind, mannwhitneyu

glaucoma_data = X_train[y_train==1]
healthy_data = X_train[y_train==0]

h_disc = np.zeros(X_train.shape[1])

for i in range(0, X_train.shape[1]):
    if h_norm[i] == 0: # not normal --> compare medians (Mann-Whitney U)
        # Ask for the two-sided test explicitly so the p-value does not depend
        # on the default of the installed SciPy version (ttest_ind is two-sided by default).
        _, pvalue = mannwhitneyu(glaucoma_data[:,i], healthy_data[:,i], alternative='two-sided')
    else: # normal --> compare means (Student's t-test)
        _, pvalue = ttest_ind(glaucoma_data[:,i], healthy_data[:,i])

    # Hypothesis test on the discriminative power of the feature
    # H0: the feature and the class are independent
    if pvalue<=alpha:
        h_disc[i] = 1 # H0 rejected: the feature is assumed to depend on the class
    else:
        h_disc[i] = 0 # no evidence against H0: feature and class are assumed independent

print('0: no discrimina, 1: sí discrimina', h_disc)

# Remove the non-discriminative features, together with their mean and standard deviation
id_no_disc = np.where(h_disc==0)
X_train_disc = np.delete(X_train, id_no_disc[0], axis=1)
mu_disc = np.delete(mu, id_no_disc[0])
sigma_disc = np.delete(sigma, id_no_disc[0])

print(np.shape(X_train_disc))
print(np.shape(mu_disc))
print(np.shape(sigma_disc))

# Visualisation of the first 10 features
original_ticks = ['media', 'mediana', 'std', 'asim', 'curtosis', 'min', 'max', 'con', 'dis', 'homo', 'ASM', 'E', 'COR',
         'LBP1', 'LBP2', 'LBP3', 'LBP4', 'LBP5', 'LBP6', 'LBP7', 'LBP8', 'LBP9', 'LBP10']
draw_boxplot(glaucoma_data[:,:10], healthy_data[:,:10], original_ticks[:10])

ticks = np.delete(original_ticks, id_no_disc[0])
print('Características discriminatorias: ', ticks)
print('Características NO discriminatorias: ', np.setdiff1d(original_ticks, ticks))

In [None]:
# CORRELATION analysis: look for dependence between pairs of features

# Columns are the variables, hence rowvar=False
R = np.corrcoef(X_train_disc, rowvar=False)

import matplotlib.pyplot as plt
plt.imshow(R, cmap='jet')
plt.show()

th_cor = 0.9

# Keep only the strict upper triangle so every pair is considered once
idx = np.abs(R) > th_cor
mat_tri_sup = np.triu(idx, k=1)
# print(mat_tri_sup.astype('uint8'))

# For each correlated pair, the feature with the higher column index is flagged
row, col = np.nonzero(mat_tri_sup)
id_corr = np.unique(col)
print(id_corr)

correladas = ticks[id_corr]
print('Características correladas: ', correladas)
print('Características NO correladas: ', np.setdiff1d(ticks, correladas))

# Drop the correlated features (and their mean / standard deviation / name)
X_final = np.delete(X_train_disc, id_corr, axis=1)
mu_final, sigma_final = np.delete(mu_disc, id_corr), np.delete(sigma_disc, id_corr)
ticks = np.delete(ticks, id_corr)

print(X_final.shape)

In [None]:
# Save the final training matrix (selected features + label as last column)
import os

# Creates the folder when missing and is a no-op when it already exists
os.makedirs('../final_features', exist_ok=True)

# Turn the label vector into a column so it can be appended to the features
y_train_exp = np.expand_dims(y_train, axis=1)
train_matrix = np.concatenate((X_final, y_train_exp), axis=1)
np.save('../final_features/train.npy', train_matrix)

In [None]:
# Apply to the test set the feature selection decided on the training set

test_matrix = np.load('../features/matriz_datos_test.npy')

# Split into features and class
X_test = test_matrix[:,:-1]
y_test = test_matrix[:, -1]

# Drop the features found non-discriminative during training (id_no_disc)
X_test_disc = np.delete(X_test, id_no_disc[0], axis=1)

# Drop the features found correlated during training (id_corr)
X_test_final = np.delete(X_test_disc, id_corr, axis=1)

# Standardise the test features with the training mean and standard deviation
X_test_final = (X_test_final-mu_final)/sigma_final

# Save the test matrix (features + label as last column)
y_test_exp = np.expand_dims(y_test, axis=1)
test_matrix = np.concatenate((X_test_final, y_test_exp), axis=1)

np.save('../final_features/test.npy', test_matrix)

print(np.shape(X_test_final))

5. Modelado

In [None]:
# Load the final training matrix
import numpy as np
train = np.load('../final_features/train.npy')

# Last column is the label, the rest are the features
X_train, y_train = train[:, :-1], train[:, -1]

print(X_train.shape)
print(y_train.shape)

In [None]:
# Define the classification models
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

algoritmos = {'LOGR': LogisticRegression(penalty='l2', solver='saga', max_iter=1000, random_state=42),
             'MLP': MLPClassifier(hidden_layer_sizes=[8,4], activation='relu', solver='sgd', batch_size='auto',
                                  learning_rate='adaptive', learning_rate_init=0.01, max_iter=1000, random_state=42)}
# Internal cross-validation with k=5 folds
from sklearn.model_selection import cross_val_score, KFold  # the class is spelled KFold

results={}
for nombre, alg in algoritmos.items():
    # A fresh, identically seeded splitter per model so both are scored on the same folds
    results[nombre] = cross_val_score(alg, X_train, y_train, cv=KFold(n_splits=5, shuffle=True, random_state=42))
    print(nombre + ':  Accuracy:  %0.4f +/- %0.4f'% (results[nombre].mean(), results[nombre].std()))

In [None]:
# Define the final models and fit them on the whole training set
algoritmos = {'LOGR': LogisticRegression(penalty='l2', solver='saga', max_iter=1000, random_state=42),
             'MLP': MLPClassifier(hidden_layer_sizes=[8,4], activation='relu', solver='sgd', batch_size='auto',
                                  learning_rate='adaptive', learning_rate_init=0.01, max_iter=1000, random_state=42)}

# The estimators live in the dictionary; no LOGR / MLP variables exist at this point
LOGR_definitivo = algoritmos['LOGR'].fit(X_train, y_train)
MLP_definitivo = algoritmos['MLP'].fit(X_train, y_train)

# Attributes that become available once the MLP has been fitted
print('Mínimo error cometido: ', MLP_definitivo.best_loss_)
print('Número de iteraciones llevadas a cabo: ', MLP_definitivo.n_iter_)

In [None]:
# Save the fitted models
import os
import pickle

# Creates the folder when missing and is a no-op when it already exists
os.makedirs('../models', exist_ok=True)

with open('../models/LOGR.pickle', 'wb') as fw:
    pickle.dump(LOGR_definitivo, fw)
with open('../models/MLP.pickle', 'wb') as fw:
    pickle.dump(MLP_definitivo, fw)

6. Evaluación resultados

In [None]:
# Load the final test matrix
import numpy as np
test = np.load('../final_features/test.npy')

# Last column is the label, the rest are the features
X_test, y_test = test[:, :-1], test[:, -1]

for parte in (X_test, y_test):
    print(parte.shape)

In [None]:
# Load the trained models
# NOTE: pickle.load can execute arbitrary code; only load model files that
# were produced by this notebook.
import pickle
with open('../models/LOGR.pickle', 'rb') as fr:
    LOGR = pickle.load(fr)
with open('../models/MLP.pickle', 'rb') as fr:
    MLP = pickle.load(fr)

# Hard class predictions
y_pred_LOGR = LOGR.predict(X_test)
y_pred_MLP = MLP.predict(X_test)

y_pred = [y_pred_LOGR, y_pred_MLP]

# Scores for the ROC curve: probability of the second entry of classes_
# (label 1, given the 0/1 labels written by the feature extraction step)
y_score = [LOGR.predict_proba(X_test)[:, 1], MLP.predict_proba(X_test)[:, 1]]

# Evaluate several classification metrics
from sklearn import metrics

from tabulate import tabulate
headers = ['', 'LOGR', 'MLP']
P, S, FS, ACC, AUC = [['Precision'], ['Sensibilidad'], ['F1-Score'], ['Accuracy'], ['AUC']]

for i in range(0,2):
    P.append(np.round(metrics.precision_score(y_test, y_pred[i]),4))
    S.append(np.round(metrics.recall_score(y_test, y_pred[i]),4))
    FS.append(np.round(metrics.f1_score(y_test, y_pred[i]),4))
    ACC.append(np.round(metrics.accuracy_score(y_test, y_pred[i]),4))
    # The AUC is computed from continuous scores; hard labels would reduce the
    # ROC curve to a single operating point.
    AUC.append(np.round(metrics.roc_auc_score(y_test, y_score[i]),4))

my_data = [tuple(P), tuple(S), tuple(FS), tuple(ACC), tuple(AUC)]
print(tabulate(my_data, headers=headers))

# Confusion matrix of the MLP (second model in y_pred)
print(metrics.confusion_matrix(y_test, y_pred[1]))