In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
from tensorflow import keras

tf.keras.backend.clear_session()
import os
from src.models import base_model
from src import custom_losses, custom_metrics, optimizers
from src.data import data

Using TensorFlow backend.


In [2]:
from sklearn.metrics import accuracy_score

In [3]:
from src.visualization import visualize as viz

In [4]:
def load_model(path, dataset='vision_based_dataset'):
    """Restore a saved model from `path` and build its evaluation generators.

    Args:
        path: Experiment directory. It is handed to
            `base_model.BaseModel.load_model` and also scanned for files whose
            name starts with `weights.`.
        dataset: Name of the dataset folder located under
            `data.PATH().PROCESSED_DATA_PATH`.

    Returns:
        Tuple `(model, test_generator, validation_generator, test_dataset_path)`.

    Raises:
        IndexError: if `path` contains no file starting with `weights.`.
    """
    model = base_model.BaseModel.load_model(path)

    processed_root = f'{data.PATH().PROCESSED_DATA_PATH}/'
    test_dataset_path = f'{processed_root}/{dataset}/'

    validation_generator, test_generator = model.get_image_data_generator(
        test_dataset_path,
        train=False, validation=True, test=True,
        class_mode_validation='categorical',
        class_mode_test='categorical')

    # The training split is only needed for its class ids, which feed the
    # class weights of the loss below.
    train_generator = model.get_image_data_generator(
        test_dataset_path, train=True, validation=False)
    class_weights = model.get_class_weights(train_generator.classes, model)

    # Pick the checkpoint whose name sorts last lexicographically.
    # NOTE(review): this is reported as the "best" weights file -- confirm the
    # checkpoint naming scheme guarantees that.
    weight_suffixes = [name.replace('weights.', '')
                       for name in os.listdir(path)
                       if name.startswith('weights.')]
    weight_suffixes.sort()
    best_suffix = weight_suffixes[-1]
    print(f'model best weights file >> weights.{best_suffix}')
    model.model.load_weights(f'{path}/weights.{best_suffix}')

    model.compile(
        loss=custom_losses.weighted_categorical_crossentropy(class_weights),
        metrics=['categorical_accuracy'],)

    return model, test_generator, validation_generator, test_dataset_path

In [5]:
import numpy as np
import pandas as pd


def get_label_prediction(pred_proba):
    """Return, for every row of `pred_proba`, the column index of its largest value."""
    return np.argmax(pred_proba, axis=1)

def get_prediction_confidence(pred_proba):
    """Return, for every row of `pred_proba`, its largest value (max over axis 1)."""
    return np.max(pred_proba, axis=1)

def get_y_true_from_generator(generator, return_filename=False):
    """Extract the ground-truth class names encoded in a generator's file paths.

    Every entry of `generator.filenames` is split on '/'. The first component
    is taken as the class label and the second one as the file name, i.e. the
    paths are expected to look like `<class>/<file>`.
    NOTE(review): presumably the layout produced by a directory-based Keras
    generator -- confirm for any other source of `filenames`.

    Args:
        generator: Object exposing a `filenames` sequence of relative paths.
        return_filename: When True, also return the file names.

    Returns:
        The list of class labels when `return_filename` is False. Otherwise
        the tuple `(filenames, y_true)` -- file names FIRST, which is the
        order in which `get_df_pred` unpacks the result.

    Raises:
        IndexError: if `return_filename` is True and a path contains no '/'.
    """
    split_paths = [f.split('/') for f in generator.filenames]

    # The class folder is the FIRST path component. Previously the second
    # component (the file name) was returned as `y_true` when no file names
    # were requested.
    y_true = [parts[0] for parts in split_paths]
    if not return_filename:
        return y_true

    filenames = [parts[1] for parts in split_paths]
    return filenames, y_true

def get_pred_label_and_confidence(pred_proba, generator):
    """Pair each prediction's class name with the probability assigned to it.

    Args:
        pred_proba: Per-sample class scores, one row per sample.
        generator: Object exposing a `class_indices` mapping; its keys, in
            iteration order, are used as the class names.
            NOTE(review): assumes that key order matches the column order of
            `pred_proba` -- confirm.

    Returns:
        List of `(class_name, confidence)` tuples, one per row of `pred_proba`.
    """
    class_names = list(generator.class_indices.keys())
    predicted_indices = get_label_prediction(pred_proba)
    confidences = get_prediction_confidence(pred_proba)

    return [(class_names[class_idx], confidence)
            for class_idx, confidence in zip(predicted_indices, confidences)]

def get_df_pred(model, generator, dataset_path):
    """Run the model over `generator` and tabulate predictions against ground truth.

    Args:
        model: Object providing `predict_from_generator` and `batch_size`.
        generator: Data generator to predict on; also the source of the class
            names and of the ground-truth labels / file names.
        dataset_path: Forwarded as first argument to `predict_from_generator`.

    Returns:
        DataFrame with columns `y_true`, `y_pred`, `y_pred_proba` (the
        probability of the predicted class) and `filename`.
    """
    pred_proba = model.predict_from_generator(
        dataset_path,
        test_generator=generator,
        return_pred_proba=True,
        batch_size=model.batch_size)

    labelled_predictions = get_pred_label_and_confidence(pred_proba, generator)
    # With return_filename=True the helper yields the file names first.
    filenames, y_true = get_y_true_from_generator(generator,
                                                  return_filename=True)

    predicted_labels = [label for label, _ in labelled_predictions]
    predicted_probas = [confidence for _, confidence in labelled_predictions]

    columns = {
        'y_true': y_true,
        'y_pred': predicted_labels,
        'y_pred_proba': predicted_probas,
        'filename': filenames,
    }
    return pd.DataFrame(columns)

In [6]:
def print_error_by_category(df_pred_test):
    """Count misclassified rows per true label, print them as LaTeX and return them.

    Args:
        df_pred_test: DataFrame with `y_true` and `y_pred` columns.

    Returns:
        DataFrame sorted ascending by error count with columns `y_true`,
        `0` (number of misclassified rows) and `pct` (that count divided by
        the total number of misclassified rows).
    """
    is_wrong = df_pred_test.y_true != df_pred_test.y_pred
    misclassified = df_pred_test.loc[is_wrong]

    # `size()` yields an unnamed Series, hence the integer column name 0.
    d = misclassified.groupby('y_true').size().reset_index().sort_values(0)
    d['pct'] = d[0] / misclassified.shape[0]

    print(d.to_latex(index=False))
    return d

In [7]:
def print_pred_proba_error_by_category(df_pred_test):
    """Summarise, per true label, the predicted probability of misclassified rows.

    The table (rounded to 2 decimals) is printed as LaTeX; the unrounded
    table is returned.

    Args:
        df_pred_test: DataFrame with `y_true`, `y_pred` and `y_pred_proba`.

    Returns:
        DataFrame with one row per true label and the mean, median, std, min
        and max of `y_pred_proba` over the misclassified rows.
    """
    is_wrong = df_pred_test.y_true != df_pred_test.y_pred
    summary_stats = ['mean', 'median', 'std', 'min', 'max']

    d = (df_pred_test.loc[is_wrong]
         .groupby('y_true')
         .agg({'y_pred_proba': summary_stats})
         .reset_index())

    print(d.round(2).to_latex(index=False))

    return d

In [8]:
def get_error_by_threshold(df_pred_test):
    """Measure how errors and accuracy evolve as a confidence threshold rises.

    For each threshold in `np.arange(0, 1, .05)` only the rows whose
    `y_pred_proba` is at least the threshold are kept.

    Args:
        df_pred_test: DataFrame with `y_true`, `y_pred` and `y_pred_proba`.

    Returns:
        Tuple of four lists, one entry per threshold:
        `errors` (misclassified rows kept), `errors_pct` (that number divided
        by the total number of misclassified rows; 0.0 when there are none),
        `rights` (accuracy over the kept rows; NaN when no row is kept) and
        `counts` (number of rows kept).
    """
    # Threshold-independent, so computed once outside the loop.
    is_error = df_pred_test.y_true != df_pred_test.y_pred
    total_errors = int(is_error.sum())

    errors = []
    errors_pct = []
    rights = []
    counts = []
    for th in np.arange(0, 1, .05):
        kept = df_pred_test.y_pred_proba >= th
        d = df_pred_test[kept]
        err = int((is_error & kept).sum())

        counts.append(d.shape[0])
        errors.append(err)
        # A model without any error would otherwise divide by zero here.
        errors_pct.append(err / total_errors if total_errors else 0.0)
        # Accuracy is undefined when no prediction reaches the threshold.
        rights.append(accuracy_score(d['y_true'], d['y_pred'])
                      if d.shape[0] else float('nan'))

    return errors, errors_pct, rights, counts

In [9]:
def plot_errors_and_pct_series(errors, pct, counts, experiment=''):
    """Plot three per-threshold series on a figure with a secondary y axis.

    The x axis is `np.arange(0, 1, .05)`, so each series is expected to hold
    one value per such threshold.

    Args:
        errors: Series drawn on the secondary axis under the legend
            'Porcentaje sobre el total de errores'.
        pct: Series drawn on the secondary axis under the legend
            'Exactitud categórica'.
        counts: Series drawn on the primary axis under the legend
            'Cantidad de casos que se predicen'.
        experiment: Experiment identifier interpolated into the figure title.
    """
    x = np.arange(0, 1, .05)
    colors = viz.get_random_colors(3, 4)
    fig = viz.make_subplots(specs=[[{"secondary_y": True}]])

    counts_label = 'Cantidad de casos que se predicen'
    # (values, legend name, extra `add_trace` arguments), in drawing order.
    series_specs = [
        (counts, counts_label, {}),
        (pct, 'Exactitud categórica', {'secondary_y': True}),
        (errors, 'Porcentaje sobre el total de errores', {'secondary_y': True}),
    ]
    for position, (values, label, placement) in enumerate(series_specs):
        trace = viz.get_series_trace(
            x, values, line_color=colors[position],
            name=label,
            show_legend=True,
            mode=None,
            opacity=1,
            line_width=2.5)
        fig.add_trace(trace, **placement)

    # Figure title
    fig.update_layout(
        title_text=f"Cantidad de casos, porcentaje de errores y exactitud categórica por umbral - modelo del exp. {experiment}"
    )

    # Axis titles: shared x axis, then primary and secondary y axes.
    fig.update_xaxes(title_text="Umbral")
    fig.update_yaxes(title_text=counts_label, secondary_y=False)
    fig.update_yaxes(title_text='Porcentaje', secondary_y=True)

    fig.show()

In [13]:
def plot_distribution_of_features_by_label(df, features=[],
                                            show_legend=False,
                                            exp=''):
    '''
    Draws, for every feature, one box plot per label for the correctly
    predicted rows (green) and one for the incorrectly predicted rows (red).

    Labels are taken from `df.y_true.unique()`; the rows of each box are the
    ones whose `y_pred` equals that label. The input frame is not modified.

    Args:
        df: DataFrame with at least `y_true` and `y_pred` columns. All its
            columns are exposed in the hover text of the points.
        features: Columns to plot, one subplot each. When empty, every column
            except `y_true`, `y_pred`, `right_predicted`, `probs` and `label`
            is plotted.
        show_legend: Whether the figure legend is displayed.
        exp: Experiment identifier interpolated into the subplot titles.

    Raises:
        AssertionError: if `df` lacks the `y_true` or `y_pred` column.
    '''
    import math
    import warnings

    # NOTE(review): this silences warnings for the whole process, not only
    # for this call; kept as in the original.
    warnings.filterwarnings('ignore')

    from plotly.offline import init_notebook_mode
    from plotly.subplots import make_subplots

    try:
        init_notebook_mode(connected=True)
    except Exception:
        # Best effort: failing to initialise notebook mode must not prevent
        # the figure from being built.
        pass

    assert 'y_true' in df.columns, (
        'The provided dataframe must have `y_true` column')
    assert 'y_pred' in df.columns, (
        'The provided dataframe must have `y_pred` column')

    # `assign` returns a new frame, so the caller's DataFrame does not gain
    # a `right_predicted` column as a side effect.
    df = df.assign(right_predicted=df['y_pred'] == df['y_true'])

    if not features:
        features = [c for c in df.columns
                    if c not in
                    ['y_true', 'y_pred', 'right_predicted', 'probs',
                     'label']
                    ]

    # Two subplots per row as soon as there is more than one feature. The
    # grid used to be created with a single column while traces were placed
    # in column 2, which fails for two or more features.
    n_cols = 2 if len(features) > 1 else 1
    n_rows = math.ceil(len(features) / n_cols)

    fig = make_subplots(rows=n_rows, cols=n_cols,
                        subplot_titles=[f'Modelo del experimento {exp} - {f}'
                                        for f in features])

    df_right_pred = df.loc[df.right_predicted]
    df_wrong_pred = df.loc[~df.right_predicted]

    # One `column=%{customdata[i]}` line per column of `df`; the index i
    # matches the column order of the `.values` arrays passed as custom data.
    hover_template = '<br>'.join([f'{col}'+'=%{customdata['+f'{idx}'+']}'
                                  for idx, col in enumerate(df.columns)])

    labels = df.y_true.unique()
    for idx, feature in enumerate(features):
        # Fill the grid row by row.
        row_n, col_n = (pos + 1 for pos in divmod(idx, n_cols))

        for label in labels:
            df_right_label = df_right_pred.loc[df_right_pred.y_pred == label]
            fig.add_trace(viz.get_box_plot_trace(
                y=df_right_label[feature],
                name=f'|label: {label} - predicción correcta|',
                marker_color='green',
                box_mean=True,
                box_points='all', # can also be outliers/suspectedoutliers/False
                jitter=0.3,
                point_pos=-1.8,
                custom_data=df_right_label.values,
                hover_template=hover_template
            ), row=row_n, col=col_n)

            df_wrong_label = df_wrong_pred.loc[df_wrong_pred.y_pred == label]
            fig.add_trace(viz.get_box_plot_trace(
                y=df_wrong_label[feature],
                name=f'|label: {label} - predicción incorrecta|',
                marker_color='red',
                box_mean=True,
                box_points='all', # can also be outliers/suspectedoutliers/False
                jitter=0.3,
                point_pos=-1.8,
                custom_data=df_wrong_label.values,
                hover_template=hover_template
            ), row=row_n, col=col_n)
        fig.update_xaxes(title_text="y_true", row=row_n, col=col_n)
        fig.update_yaxes(title_text=feature, row=row_n, col=col_n)

    fig.update_layout(
        title=('Distribución de probabilidades asignadas a la categoría elegida '
              + 'para casos predichos correcta e incorrectamente'),
        yaxis=dict(
            autorange=True,
            showgrid=True,
            zeroline=True,
            gridcolor='rgb(255, 255, 255)',
            gridwidth=1,
            zerolinecolor='rgb(255, 255, 255)',
            zerolinewidth=2,
        ),
        margin=dict(
            l=40,
            r=30,
            b=80,
            t=100,
        ),
        showlegend=show_legend
    )
    fig.show()


In [14]:
# Model directory and dataset name of the run evaluated below as experiment '2'.
# NOTE(review): absolute, machine-specific path -- the notebook only runs where
# this directory exists.
path1 = '/home/ifranco/Documents/facultad/tesis/tf_real_estate_images_classification//models/CNN_Model/exp_2__vision_based_dataset__2019-11-28__00_45/'
dataset1 = 'vision_based_dataset'

# Model directory and dataset name of the run evaluated below as experiment '6'.
path2 = '/home/ifranco/Documents/facultad/tesis/tf_real_estate_images_classification//models/PlacesOntop_Model/2020-01-16__19_13__exp6/'
dataset2 = 'clahe_vision_based_dataset'

In [15]:

# Error analysis of the first configured experiment only (`[:1]`).
for path, dataset, exp in [(path1, dataset1, '2'), (path2, dataset2, '6')][:1]:
    (model, test_generator,
     validation_generator, test_dataset_path) = load_model(path, dataset)
    df_pred_test = get_df_pred(model, test_generator, test_dataset_path)

    # LaTeX tables: confidence statistics and error counts per true label.
    d = print_pred_proba_error_by_category(df_pred_test)
    d = print_error_by_category(df_pred_test)

    # Threshold curves. The call is positional: the share of errors (`pct`)
    # lands in the plot's `errors` parameter and the accuracies (`rights`)
    # in its `pct` parameter.
    errors, pct, rights, counts = get_error_by_threshold(df_pred_test)
    plot_errors_and_pct_series(pct, rights, counts, exp)

    plot_distribution_of_features_by_label(
        df_pred_test, features=['y_pred_proba'], exp=exp)

    # Four blank lines separate the output of consecutive experiments.
    print('\n' * 3)

W0118 01:46:35.008199 139803828860736 deprecation.py:506] From /home/ifranco/Documents/facultad/tesis/tesis_env/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0118 01:46:35.009297 139803828860736 deprecation.py:506] From /home/ifranco/Documents/facultad/tesis/tesis_env/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Constant.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0118 01:46:35.033697 139803828860736 deprecation.py:506] From /home/ifranco/Documents/facultad/tesis/tesis_env/lib/python3.6/site-packages/t

Found 6305 images belonging to 6 classes.
Found 6328 images belonging to 6 classes.
Found 114361 images belonging to 6 classes.
model best weights file >> weights.04-1.00.hdf5
Found 6305 images belonging to 6 classes.
99.0
\begin{tabular}{lrrrrr}
\toprule
      y\_true & \multicolumn{5}{l}{y\_pred\_proba} \\
             &         mean & median &   std &   min &   max \\
\midrule
    bathroom &         0.63 &   0.61 &  0.20 &  0.26 &  1.00 \\
     bedroom &         0.64 &   0.62 &  0.18 &  0.26 &  1.00 \\
 dining\_room &         0.67 &   0.64 &  0.19 &  0.26 &  1.00 \\
   frontyard &         0.63 &   0.62 &  0.19 &  0.26 &  0.99 \\
     kitchen &         0.66 &   0.63 &  0.19 &  0.31 &  1.00 \\
  livingRoom &         0.69 &   0.70 &  0.19 &  0.26 &  1.00 \\
\bottomrule
\end{tabular}

\begin{tabular}{lrr}
\toprule
      y\_true &    0 &       pct \\
\midrule
   frontyard &  100 &  0.060533 \\
    bathroom &  182 &  0.110169 \\
 dining\_room &  220 &  0.133172 \\
     kitchen &  269 &  0







In [16]:
# Error analysis of every configured experiment after the first one (`[1:]`).
for path, dataset, exp in [(path1, dataset1, '2'), (path2, dataset2, '6')][1:]:
    (model, test_generator,
     validation_generator, test_dataset_path) = load_model(path, dataset)
    df_pred_test = get_df_pred(model, test_generator, test_dataset_path)

    # LaTeX tables: confidence statistics and error counts per true label.
    d = print_pred_proba_error_by_category(df_pred_test)
    d = print_error_by_category(df_pred_test)

    # Threshold curves. The call is positional: the share of errors (`pct`)
    # lands in the plot's `errors` parameter and the accuracies (`rights`)
    # in its `pct` parameter.
    errors, pct, rights, counts = get_error_by_threshold(df_pred_test)
    plot_errors_and_pct_series(pct, rights, counts, exp)

    plot_distribution_of_features_by_label(
        df_pred_test, features=['y_pred_proba'], exp=exp)

    # Four blank lines separate the output of consecutive experiments.
    print('\n' * 3)

Found 6305 images belonging to 6 classes.
Found 6328 images belonging to 6 classes.
Found 114355 images belonging to 6 classes.
model best weights file >> weights.33-1.03.hdf5
Found 6305 images belonging to 6 classes.
50.0
\begin{tabular}{lrrrrr}
\toprule
      y\_true & \multicolumn{5}{l}{y\_pred\_proba} \\
             &         mean & median &   std &   min &   max \\
\midrule
    bathroom &         0.56 &   0.53 &  0.19 &  0.19 &  1.00 \\
     bedroom &         0.59 &   0.56 &  0.19 &  0.25 &  1.00 \\
 dining\_room &         0.60 &   0.57 &  0.18 &  0.20 &  1.00 \\
   frontyard &         0.55 &   0.50 &  0.20 &  0.19 &  0.98 \\
     kitchen &         0.61 &   0.59 &  0.19 &  0.23 &  1.00 \\
  livingRoom &         0.61 &   0.59 &  0.19 &  0.22 &  1.00 \\
\bottomrule
\end{tabular}

\begin{tabular}{lrr}
\toprule
      y\_true &    0 &       pct \\
\midrule
   frontyard &  145 &  0.064046 \\
    bathroom &  148 &  0.065371 \\
 dining\_room &  356 &  0.157244 \\
     kitchen &  504 &  0





