In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import sklearn as skl
import pandas as pd
import numpy as np
import seaborn as sns

from itertools import repeat

from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, precision_score, f1_score
from sklearn.ensemble import GradientBoostingClassifier

from breast_cancer_dataset.base import Dataloder
from utils.config import SEED, XGB_COLS, XGB_CONFIG, N_ESTIMATORS, MAX_DEPTH, MODEL_FILES
from utils.functions import get_path, bulk_data, search_files, get_filename

Segmentation Models: using `keras` framework.


In [3]:
# Los valores posibles son segmentation, classification
task_type = 'classification'
# Los valores disponibles son PATCHES, COMPLETE_IMAGE
experiment = 'PATCHES'
# Nombre del experimento
experiment_name = 'EJEC_ROI_TEST2'

# Se setean las carpetas para almacenar las variables del modelo en función del experimento.
model_config = MODEL_FILES
model_config.set_model_name(name=experiment_name)

db = pd.read_excel(model_config.model_db_desc_csv, dtype=object, index_col=None)

### CLASIFICADOR CON LAS PROBABILIDADES

In [80]:
cnn_predictions_dir = get_path(model_config.model_predictions_cnn_dir)

merge_list = []
for weight, frozen_layers in zip([*repeat('imagenet', 6), 'random'], ['ALL', '0FT', '1FT', '2FT', '3FT', '4FT']):
    
    data = db[['PROCESSED_IMG', 'IMG_LABEL', 'TRAIN_VAL', *XGB_COLS[XGB_CONFIG]]].copy()
    data.loc[:, 'LABEL'] = data.IMG_LABEL.map({k: v for v, k in enumerate(sorted(data.IMG_LABEL.unique(), reverse=False))})
    
    l = []
    for file in search_files(get_path(cnn_predictions_dir, weight, frozen_layers, create=False), ext='csv'):

        l.append(
            pd.read_csv(file, sep=';')[['PROCESSED_IMG', 'PROBABILTY']].\
                assign(WEIGHTS=weight, FT=frozen_layers, CNN=get_filename(file))
        )

    merge_list.append(pd.merge(left=data, right=pd.concat(l, ignore_index=True), on='PROCESSED_IMG', how='left'))

all_data = pd.concat(merge_list, ignore_index=True)

all_data.head()

Unnamed: 0,PROCESSED_IMG,IMG_LABEL,TRAIN_VAL,LABEL,PROBABILTY,WEIGHTS,FT,CNN
0,..\data\02_PROCESED\CONF1\CBIS_DDSM\CROP\CONF0...,MALIGNANT,train,1,0.993601,imagenet,ALL,DenseNet_probs
1,..\data\02_PROCESED\CONF1\CBIS_DDSM\CROP\CONF0...,MALIGNANT,train,1,0.519766,imagenet,ALL,InceptionV3_probs
2,..\data\02_PROCESED\CONF1\CBIS_DDSM\CROP\CONF0...,MALIGNANT,train,1,0.987941,imagenet,ALL,ResNet50_probs
3,..\data\02_PROCESED\CONF1\CBIS_DDSM\CROP\CONF0...,MALIGNANT,train,1,0.494851,imagenet,ALL,VGG16_probs
4,..\data\02_PROCESED\CONF1\CBIS_DDSM\CROP\CONF0...,MALIGNANT,train,1,0.705345,imagenet,ALL,DenseNet_probs


In [81]:
# Se escoge el mejor modelo en función de las metricas AUC de validacion
cnn_selection = all_data.groupby(['CNN', 'FT', 'WEIGHTS', 'TRAIN_VAL'], as_index=False).apply(
    lambda x: pd.Series({
        'AUC': roc_auc_score(x.LABEL, x.PROBABILTY),
        'ACCURACY': accuracy_score(x.LABEL, round(x.PROBABILTY)),
        'RECALL': recall_score(x.LABEL, round(x.PROBABILTY)),
        'PRECISION': precision_score(x.LABEL, round(x.PROBABILTY)),
        'F1': f1_score(x.LABEL, round(x.PROBABILTY))
    })
)

selected_cnns = cnn_selection[cnn_selection.TRAIN_VAL == 'val'].sort_values('AUC', ascending=False).\
    groupby('CNN', as_index=False).first()
selected_cnns

Unnamed: 0,CNN,FT,WEIGHTS,TRAIN_VAL,AUC,ACCURACY,RECALL,PRECISION,F1
0,DenseNet_probs,3FT,imagenet,val,0.884951,0.816162,0.731225,0.889423,0.802603
1,InceptionV3_probs,4FT,imagenet,val,0.859161,0.779798,0.814229,0.768657,0.790787
2,ResNet50_probs,2FT,imagenet,val,0.880868,0.806061,0.770751,0.83691,0.802469
3,VGG16_probs,1FT,imagenet,val,0.818272,0.747475,0.727273,0.766667,0.74645


In [82]:
selected_cnns = cnn_selection[cnn_selection.TRAIN_VAL == 'val'].sort_values('F1', ascending=False).\
    groupby('CNN', as_index=False).first()
selected_cnns

Unnamed: 0,CNN,FT,WEIGHTS,TRAIN_VAL,AUC,ACCURACY,RECALL,PRECISION,F1
0,DenseNet_probs,2FT,imagenet,val,0.876474,0.808081,0.826087,0.803846,0.814815
1,InceptionV3_probs,4FT,imagenet,val,0.859161,0.779798,0.814229,0.768657,0.790787
2,ResNet50_probs,4FT,imagenet,val,0.874236,0.80202,0.833992,0.790262,0.811538
3,VGG16_probs,1FT,imagenet,val,0.818272,0.747475,0.727273,0.766667,0.74645


In [83]:
selected_cnns = cnn_selection[cnn_selection.TRAIN_VAL == 'val'].sort_values('RECALL', ascending=False).\
    groupby('CNN', as_index=False).first()
selected_cnns

Unnamed: 0,CNN,FT,WEIGHTS,TRAIN_VAL,AUC,ACCURACY,RECALL,PRECISION,F1
0,DenseNet_probs,2FT,imagenet,val,0.876474,0.808081,0.826087,0.803846,0.814815
1,InceptionV3_probs,4FT,imagenet,val,0.859161,0.779798,0.814229,0.768657,0.790787
2,ResNet50_probs,4FT,imagenet,val,0.874236,0.80202,0.833992,0.790262,0.811538
3,VGG16_probs,3FT,imagenet,val,0.536504,0.511111,1.0,0.511111,0.676471


In [84]:
selected_cnns = cnn_selection[cnn_selection.TRAIN_VAL == 'val'].sort_values('ACCURACY', ascending=False).\
    groupby('CNN', as_index=False).first()
selected_cnns

Unnamed: 0,CNN,FT,WEIGHTS,TRAIN_VAL,AUC,ACCURACY,RECALL,PRECISION,F1
0,DenseNet_probs,3FT,imagenet,val,0.884951,0.816162,0.731225,0.889423,0.802603
1,InceptionV3_probs,2FT,imagenet,val,0.841326,0.781818,0.766798,0.798354,0.782258
2,ResNet50_probs,2FT,imagenet,val,0.880868,0.806061,0.770751,0.83691,0.802469
3,VGG16_probs,1FT,imagenet,val,0.818272,0.747475,0.727273,0.766667,0.74645


In [85]:
selected_cnns = cnn_selection[cnn_selection.TRAIN_VAL == 'val'].sort_values('PRECISION', ascending=False).\
    groupby('CNN', as_index=False).first()
selected_cnns

Unnamed: 0,CNN,FT,WEIGHTS,TRAIN_VAL,AUC,ACCURACY,RECALL,PRECISION,F1
0,DenseNet_probs,3FT,imagenet,val,0.884951,0.816162,0.731225,0.889423,0.802603
1,InceptionV3_probs,2FT,imagenet,val,0.841326,0.781818,0.766798,0.798354,0.782258
2,ResNet50_probs,1FT,imagenet,val,0.848822,0.785859,0.711462,0.84507,0.772532
3,VGG16_probs,1FT,imagenet,val,0.818272,0.747475,0.727273,0.766667,0.74645


#### Observando los resultados, la métrica a optimizar será AUC y despues se decide el threshold

In [86]:
selected_cnns = cnn_selection[cnn_selection.TRAIN_VAL == 'val'].sort_values('AUC', ascending=False).\
    groupby('CNN', as_index=False).first()

final_list = []
for _, row in selected_cnns.iterrows():
    final_list.append(
        all_data[(all_data.CNN == row.CNN) & (all_data.FT == row.FT) & (all_data.WEIGHTS == row.WEIGHTS)]
    )

final_df = pd.concat(final_list, ignore_index=True).\
    set_index(['PROCESSED_IMG', 'LABEL', 'TRAIN_VAL', *XGB_COLS[XGB_CONFIG], 'CNN'])['PROBABILTY'].unstack()\
    .reset_index()

final_df

CNN,PROCESSED_IMG,LABEL,TRAIN_VAL,DenseNet_probs,InceptionV3_probs,ResNet50_probs,VGG16_probs
0,..\data\02_PROCESED\CONF1\CBIS_DDSM\CROP\CONF0...,1,train,0.719723,0.778808,0.708339,0.596965
1,..\data\02_PROCESED\CONF1\CBIS_DDSM\CROP\CONF0...,1,val,0.881153,0.968379,0.938749,0.997996
2,..\data\02_PROCESED\CONF1\CBIS_DDSM\CROP\CONF0...,1,train,0.845210,0.645296,0.551822,0.890132
3,..\data\02_PROCESED\CONF1\CBIS_DDSM\CROP\CONF0...,1,train,0.704956,0.554972,0.431385,0.421509
4,..\data\02_PROCESED\CONF1\CBIS_DDSM\CROP\CONF0...,0,train,0.314429,0.794002,0.841224,0.806608
...,...,...,...,...,...,...,...
1642,..\data\02_PROCESED\CONF1\INBreast\CROP\CONF0\...,0,train,0.069483,0.127799,0.049580,0.184604
1643,..\data\02_PROCESED\CONF1\INBreast\CROP\CONF0\...,0,val,0.273734,0.123208,0.054974,0.226513
1644,..\data\02_PROCESED\CONF1\INBreast\CROP\CONF0\...,0,val,0.670547,0.959106,0.898885,0.999411
1645,..\data\02_PROCESED\CONF1\INBreast\CROP\CONF0\...,0,train,0.632090,0.860807,0.938336,0.981641


In [87]:
# generación del conjunto de datos de train para gradient boosting
cols = [*XGB_COLS[XGB_CONFIG], *all_data.CNN.unique().tolist()]
train_x = final_df.loc[final_df.TRAIN_VAL == 'train', cols].values
train_y = final_df.loc[final_df.TRAIN_VAL == 'train', 'LABEL']

In [88]:
#### Empiezan los algoritmos de model ensambling
import warnings
warnings.filterwarnings('ignore')

In [93]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(RandomForestClassifier(random_state=81), 
                   param_grid={'n_estimators': [100, 150, 200, 250, 300], 'max_depth': np.arange(0, 10)}, 
                   scoring='roc_auc', 
                   cv=10) 
clf.fit(train_x, train_y)
data_csv = final_df[['PROCESSED_IMG', 'TRAIN_VAL', 'LABEL']].assign(PREDICTION=clf.predict(final_df[cols]))
data_csv.groupby('TRAIN_VAL').apply(lambda x: pd.Series({
        'AUC': roc_auc_score(x.LABEL, x.PREDICTION),
        'ACCURACY': accuracy_score(x.LABEL, round(x.PREDICTION)),
        'RECALL': recall_score(x.LABEL, round(x.PREDICTION)),
        'PRECISION': precision_score(x.LABEL, round(x.PREDICTION)),
        'F1': f1_score(x.LABEL, round(x.PREDICTION))
    }))

Unnamed: 0_level_0,AUC,ACCURACY,RECALL,PRECISION,F1
TRAIN_VAL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
train,0.861589,0.861111,0.840407,0.882353,0.86087
val,0.838753,0.838384,0.822134,0.855967,0.83871


In [94]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GridSearchCV(GradientBoostingClassifier(random_state=81), 
                   param_grid={'n_estimators': [100, 150, 200, 250, 300], 'max_depth': np.arange(0, 10), 'criterion': ['squared_error', 'friedman_mse']}, 
                   scoring='roc_auc', 
                   cv=10) 
clf.fit(train_x, train_y)
data_csv = final_df[['PROCESSED_IMG', 'TRAIN_VAL', 'LABEL']].assign(PREDICTION=clf.predict(final_df[cols]))
data_csv.groupby('TRAIN_VAL').apply(lambda x: pd.Series({
        'AUC': roc_auc_score(x.LABEL, x.PREDICTION),
        'ACCURACY': accuracy_score(x.LABEL, round(x.PREDICTION)),
        'RECALL': recall_score(x.LABEL, round(x.PREDICTION)),
        'PRECISION': precision_score(x.LABEL, round(x.PREDICTION)),
        'F1': f1_score(x.LABEL, round(x.PREDICTION))
    }))

Unnamed: 0_level_0,AUC,ACCURACY,RECALL,PRECISION,F1
TRAIN_VAL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
train,0.864789,0.864583,0.855688,0.876522,0.865979
val,0.84055,0.840404,0.833992,0.850806,0.842315


In [98]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(RandomForestRegressor(random_state=81), 
                   param_grid={'n_estimators': [100, 150, 200, 250, 300], 'max_depth': np.arange(0, 10)}, 
                   scoring='roc_auc', 
                   cv=10) 
clf.fit(train_x, train_y)
data_csv = final_df[['PROCESSED_IMG', 'TRAIN_VAL', 'LABEL']].assign(PREDICTION=clf.predict(final_df[cols]))
data_csv.groupby('TRAIN_VAL').apply(lambda x: pd.Series({
        'AUC': roc_auc_score(x.LABEL, x.PREDICTION),
        'ACCURACY': accuracy_score(x.LABEL, round(x.PREDICTION)),
        'RECALL': recall_score(x.LABEL, round(x.PREDICTION)),
        'PRECISION': precision_score(x.LABEL, round(x.PREDICTION)),
        'F1': f1_score(x.LABEL, round(x.PREDICTION))
    }))

Unnamed: 0_level_0,AUC,ACCURACY,RECALL,PRECISION,F1
TRAIN_VAL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
train,0.938557,0.861111,0.850594,0.874346,0.862306
val,0.891394,0.834343,0.837945,0.837945,0.837945


In [96]:
from sklearn.ensemble import GradientBoostingRegressor

clf = GridSearchCV(GradientBoostingRegressor(random_state=81), 
                   param_grid={'n_estimators': [100, 150, 200, 250, 300], 'max_depth': np.arange(0, 10), 'criterion': ['squared_error', 'friedman_mse']}, 
                   scoring='roc_auc', 
                   cv=10) 
clf.fit(train_x, train_y)
data_csv = final_df[['PROCESSED_IMG', 'TRAIN_VAL', 'LABEL']].assign(PREDICTION=clf.predict(final_df[cols]))
data_csv.groupby('TRAIN_VAL').apply(lambda x: pd.Series({
        'AUC': roc_auc_score(x.LABEL, x.PREDICTION),
        'ACCURACY': accuracy_score(x.LABEL, round(x.PREDICTION)),
        'RECALL': recall_score(x.LABEL, round(x.PREDICTION)),
        'PRECISION': precision_score(x.LABEL, round(x.PREDICTION)),
        'F1': f1_score(x.LABEL, round(x.PREDICTION))
    }))

Unnamed: 0_level_0,AUC,ACCURACY,RECALL,PRECISION,F1
TRAIN_VAL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
train,0.934658,0.861979,0.845501,0.879859,0.862338
val,0.891941,0.838384,0.83004,0.850202,0.84


RandomForestRegressor(max_depth=3, n_estimators=250, random_state=81)