# Loading

In [None]:
import sys
import os

# Put the parent of the current working directory first on the import path
# (presumably so the project modules imported below resolve — verify layout).
_notebook_dir = os.path.abspath(os.curdir)
sys.path.insert(0, os.path.join(_notebook_dir, os.pardir))

In [None]:
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import cv2
import numpy as np
import ipywidgets as widgets
import pandas as pd
import seaborn as sns
import json
import tqdm
import matplotlib


from ast import literal_eval
from ipywidgets import interact, interact_manual
from functools import lru_cache
from itertools import chain

from pathlib import Path
from process_files import process_single_image_path
from common import get_train_test_split_from_paths

In [None]:
# Notebook-wide plot styling. A bold font weight and LaTeX text rendering are
# deliberately left disabled.
font = dict(size=22, family='sans-serif')
matplotlib.rc('font', **font)

# seaborn theme: white grid background, "talk" scaling.
sns.set_style('whitegrid')
sns.set_context('talk')

In [None]:
data_path = Path('/data/Datasets/usg')

# Every *.tif below the dataset root, skipping dot-prefixed files.
files = [
    tif_path
    for tif_path in data_path.rglob('*.tif')
    if not tif_path.name.startswith('.')
]
# Order by the name of the grandparent directory; the sort is stable, so files
# sharing a grandparent keep their discovery order.
files.sort(key=lambda tif_path: tif_path.parent.parent.name)

# Dropdown shared by the interactive cells below; starts on the first file.
files_widget = widgets.Dropdown(options=files, index=0, description="File:")

# Data Extraction

In [None]:
def show_pair_of_images(img_1, img_2):
    """Show two images side by side on one 16x12 figure with the axes hidden.

    Args:
        img_1: Image drawn in the left panel.
        img_2: Image drawn in the right panel.
    """
    fig, axes = plt.subplots(1, 2, figsize=(16, 12))
    fig.tight_layout()
    for axis in axes.ravel():
        axis.set_axis_off()
    for axis, image in zip(axes, (img_1, img_2)):
        axis.imshow(image)
    plt.show()

In [None]:
def get_img_data(a_file, output_dir):
    """Run the extraction pipeline on one image and gather its results.

    Args:
        a_file: Path of the image handed to ``process_single_image_path``.
        output_dir: Directory passed to the pipeline; created (with parents)
            when it does not exist yet.

    Returns:
        Tuple ``(lower, upper, radial_polar_area, masked_circle, frame)`` where
        the first four items are returned by the pipeline unchanged and
        ``frame`` is a DataFrame with the columns text, x, y, w, h built from
        the three annotation results.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # The pipeline yields exactly eight values; the coordinates are not used here.
    (
        lower,
        upper,
        radial_polar_area,
        masked_circle,
        _coordinates,
        left_side_annotation,
        legend_bar_annotation,
        lower_right_annotation,
    ) = process_single_image_path(a_file, target_dir)

    # Each annotation is a Python-literal string; parse all three and concatenate.
    annotation_rows = (
        literal_eval(left_side_annotation)
        + literal_eval(legend_bar_annotation)
        + literal_eval(lower_right_annotation)
    )
    frame = pd.DataFrame(data=annotation_rows)[["text", "x", "y", "w", "h"]]

    return lower, upper, radial_polar_area, masked_circle, frame

In [None]:
def display_img_data(a_file):
    """Extract the data for ``a_file`` and render it in the notebook.

    Shows the lower/upper pair, then the radial-polar-area/masked-circle pair,
    then the annotation table. Extraction results are written under "output".
    """
    lower_img, upper_img, polar_img, circle_img, annotations = get_img_data(a_file, "output")
    for left_img, right_img in ((lower_img, upper_img), (polar_img, circle_img)):
        show_pair_of_images(left_img, right_img)
    display(annotations)

In [None]:
# Bind the file dropdown to display_img_data. interact_manual is used so the
# extraction only runs when the generated button is pressed.
interact_manual(display_img_data, a_file=files_widget)

# Additional Data Preprocessing

In [None]:
# Integer slider (1..41, starting at 1) that supplies the denoising strength;
# continuous updates are off, so dragging does not trigger intermediate values.
smoothing_value_widget = widgets.IntSlider(
    description="Smoothing: ",
    min=1,
    max=41,
    value=1,
    continuous_update=False,
)

In [None]:
def display_single_image(img):
    """Render ``img`` alone on a 16x12 figure with the gray colormap and no axes."""
    fig, ax = plt.subplots(figsize=(16, 12))
    ax.imshow(img, cmap='gray')
    ax.axis('off')
    plt.show()

In [None]:
# Module-level slots filled by load() and read by the smoothing preview.
lower = radial = circle = None


def load(a_file):
    """Extract ``a_file`` and store its lower, radial and circle images globally.

    The upper image and the annotation frame returned by get_img_data are
    discarded.
    """
    global lower, radial, circle
    extracted = get_img_data(a_file, "output")
    lower, radial, circle = extracted[0], extracted[2], extracted[3]
    print("Loaded!")


interact(load, a_file=files_widget)

In [None]:
def visualise_smoothing(smoothing):
    """Denoise the currently loaded images with strength ``smoothing`` and show them.

    Works on copies of the module-level ``lower``, ``radial`` and ``circle``
    images, so the loaded originals are never passed to OpenCV directly. The
    radial/circle pair is shown first, then the lower image on its own.
    """
    cur_lower, cur_radial, cur_circle = (
        cv2.fastNlMeansDenoising(image.copy(), h=smoothing)
        for image in (lower, radial, circle)
    )

    show_pair_of_images(cur_radial, cur_circle)
    display_single_image(cur_lower)

In [None]:
# Re-render the denoised preview whenever the smoothing slider changes value.
# Relies on load() having populated the global images first.
interact(visualise_smoothing, smoothing=smoothing_value_widget)

# Data Description

In [None]:
data_folder = Path('/data/Datasets/usg-kaggle')

# Entries of both training class folders ("0" and "1"), ordered by their
# numeric name.
data_paths = sorted(
    chain.from_iterable(
        (data_folder / "train" / class_dir).glob("*") for class_dir in ("0", "1")
    ),
    key=lambda sample_path: int(sample_path.name),
)

# The class label of a sample is the name of its parent folder.
classes = np.array([int(sample_path.parent.name) for sample_path in data_paths])

# Only the second part of the split is used below; the first is discarded.
_, valid_paths = get_train_test_split_from_paths(data_paths, classes)

# Ground-truth "mean" values of the validation samples, grouped by class:
# parent folder "1" goes to the F4 list, anything else to the F0 list.
f0_regression_values = []
f4_regression_values = []
for path in tqdm.tqdm_notebook(valid_paths):
    is_f4 = int(path.parent.name) == 1
    ground_truth = json.loads((path / "regression_ground_truth.json").read_text())
    if is_f4:
        f4_regression_values.append(ground_truth["mean"])
    else:
        f0_regression_values.append(ground_truth["mean"])


# Number of non-hidden entries in the raw F0 and F4 dataset folders.
classification_f0 = sum(
    1
    for entry in Path('/data/Datasets/usg/F0/').glob("*")
    if not entry.name.startswith('.')
)
classification_f4 = sum(
    1
    for entry in Path('/data/Datasets/usg/F4/').glob("*")
    if not entry.name.startswith('.')
)

# Two-row table consumed by the bar plot of the raw class distribution.
classification_real_dist = pd.DataFrame({
    "count": [classification_f0, classification_f4],
    "name": ["F0", "F4"],
})

# Class counts of the prepared dataset. Class 0 is F0 and class 1 is F4 (the same
# mapping the regression-value loop uses), so the counts must be listed in that
# order to line up with the labels — previously they were swapped.
# NOTE(review): `classes` covers every path in `data_paths`, not only
# `valid_paths`, while the plot title says "used for validation" — confirm
# which population is intended.
classification_used_dist = pd.DataFrame(data={
    "count": [len(classes[classes == 0]), len(classes[classes == 1])],
    "name": ["F0", "F4"]
})

In [None]:
# Three panels: raw class counts, prepared-dataset class counts, and the
# per-class density of the ground-truth QBox mean values.
fig, ax = plt.subplots(1, 3, figsize=(24, 6))
sns.barplot(x="name", y="count", data=classification_real_dist, ax=ax[0])
sns.barplot(x="name", y="count", data=classification_used_dist, ax=ax[1])
# kdeplot draws the same density curve as the deprecated
# distplot(..., hist=False), which is scheduled for removal from seaborn.
sns.kdeplot(f0_regression_values, ax=ax[2], label="F0")
sns.kdeplot(f4_regression_values, ax=ax[2], label="F4")
ax[0].set_title('Classes of all available data')
ax[1].set_title('Classes of data used for validation')
ax[2].set_title('Distribution of QBox mean values for each class')
# Attach the legend explicitly to the panel that holds the labelled curves.
ax[2].legend(loc='best')
plt.show()
print()

In [None]:
# Hard-coded sample counts; nothing else in the visible notebook reads them.
# NOTE(review): presumably recorded from a preprocessing run — confirm source.
f0_processed_count, f4_processed_count, unknown_processed_count = 2265, 2340, 599
f0_raw_count, f4_raw_count = 2861, 4692

# Model evaluation - statistics

In [None]:
# Model predictions, read from the working directory; the column index is shown
# as the cell output.
results_frame = pd.read_csv('predictions.csv')
results_frame.columns

In [None]:
# Mean absolute error of the regression head, printed next to the mean of the
# predicted standard deviations.
mae = (results_frame['reg_true'] - results_frame['reg_mean_pred']).abs().mean()
mean_pred_std = results_frame['reg_std_pred'].mean()
print('MAE: {:.4f}, std: {:.4f}'.format(mae, mean_pred_std))

In [None]:
# Predicted class = column index of the larger score (0 -> f0, 1 -> f4);
# argmax resolves ties in favour of class 0.
pred_classes = results_frame[["f0_mean_pred", "f4_mean_pred"]].values.argmax(axis=1)
acc = (results_frame["cls_true"] == pred_classes).mean()
print("Accuracy: {:.4f}".format(acc))

In [None]:
def show_roc(fpr, tpr, roc_auc):
    """Plot a ROC curve with the chance diagonal on an 8x6 figure.

    Args:
        fpr: False positive rates (x values).
        tpr: True positive rates (y values).
        roc_auc: Area under the curve, shown in the legend with two decimals.
    """
    line_width = 2
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(fpr, tpr, color='darkorange', lw=line_width,
            label='ROC curve (area = %0.2f)' % roc_auc)
    # Dashed diagonal from (0, 0) to (1, 1) as the reference line.
    ax.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver operating characteristic')
    ax.legend(loc="lower right")
    plt.show()
    
def show_specifity_sensitivity(specifity, sensitivity, tresholds):
    """Plot specificity and sensitivity against the decision threshold.

    Args:
        specifity: Specificity values, one per threshold.
        sensitivity: Sensitivity values, one per threshold.
        tresholds: Probability thresholds used as x values. The misspelt name
            is kept so existing keyword callers keep working.
    """
    # The x values come from the argument. The previous body read a global
    # named `thresholds`, which ignored the argument and failed on a fresh
    # kernel unless that global happened to exist.
    plt.figure(figsize=(8, 6))
    lw = 2
    plt.plot(tresholds, specifity, color='darkorange', lw=lw, label='Speciftity')
    plt.plot(tresholds, sensitivity, color='darkblue', lw=lw, label='Sensitivity')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Probability threshold')
    plt.ylabel('Specifity or Sensitivity')
    plt.title('Specifity / Sensitivity Curve')
    plt.legend(loc="lower right")
    plt.show()
    
    
def show_prec_recall_curve(precision, recall):
    """Plot precision (y) against recall (x) on an 8x6 figure.

    Args:
        precision: Precision values.
        recall: Recall values matching ``precision`` element-wise.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(recall, precision, color='darkorange', lw=2,
            label='Precision-Recall curve')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title('Precision recall curve')
    ax.legend(loc="lower right")
    plt.show()

In [None]:
from sklearn.metrics import roc_curve, auc, precision_recall_curve, recall_score, precision_score

# ROC of the F4 score against the true class; the thresholds are not needed here.
fpr, tpr, _ = roc_curve(
    results_frame["cls_true"],
    results_frame["f4_mean_pred"],
)
roc_auc = auc(fpr, tpr)
show_roc(fpr=fpr, tpr=tpr, roc_auc=roc_auc)

In [None]:
# Precision/recall pairs of the F4 score over all thresholds (thresholds unused).
prec, recall, _ = precision_recall_curve(
    results_frame["cls_true"],
    results_frame["f4_mean_pred"],
)
show_prec_recall_curve(precision=prec, recall=recall)

In [None]:
def specifity_sensitivity_curve(y_true, y_pred_proba):
    """Derive specificity and sensitivity per threshold from the ROC curve.

    Args:
        y_true: True binary labels.
        y_pred_proba: Scores for the positive class.

    Returns:
        Tuple ``(specificity, sensitivity, thresholds)`` where specificity is
        ``1 - fpr`` and sensitivity is the ``tpr`` reported by ``roc_curve``.
    """
    false_positive_rate, true_positive_rate, cut_offs = roc_curve(y_true, y_pred_proba)
    specificity = 1 - false_positive_rate
    return specificity, true_positive_rate, cut_offs

In [None]:
# Specificity / sensitivity of the F4 score at every ROC threshold, then plotted.
specifity, sensitivity, thresholds = specifity_sensitivity_curve(
    y_true=results_frame["cls_true"],
    y_pred_proba=results_frame["f4_mean_pred"],
)
show_specifity_sensitivity(specifity, sensitivity, thresholds)

In [None]:
# Point metrics for the argmax class decision. `recall` now holds a scalar and
# replaces the curve array assigned earlier under the same name.
cls_true_labels = results_frame["cls_true"]
precision = precision_score(cls_true_labels, pred_classes)
recall = recall_score(cls_true_labels, pred_classes)
print('Precision: {:.4f} | Recall: {:.4f}'.format(precision, recall))

In [None]:
# Short aliases for the ground truth and predictions used by the plots below.
true_reg, true_cls = results_frame['reg_true'], results_frame['cls_true']

pred_reg = results_frame['reg_mean_pred']
# Class decision: index of the larger of the two class scores (ties -> 0).
pred_cls = results_frame[["f0_mean_pred", "f4_mean_pred"]].values.argmax(axis=1)

In [None]:
# Left: densities of the true QBox means split by true class.
# Right: densities of the predicted QBox means split by predicted class.
fig, ax = plt.subplots(1, 2, figsize=(24, 6), sharey=True)
# kdeplot draws the same density curve as the deprecated
# distplot(..., hist=False), which is scheduled for removal from seaborn.
sns.kdeplot(true_reg[true_cls == 0], ax=ax[0], label="F0")
sns.kdeplot(true_reg[true_cls == 1], ax=ax[0], label="F4")
sns.kdeplot(pred_reg[pred_cls == 0], ax=ax[1], label="F0")
sns.kdeplot(pred_reg[pred_cls == 1], ax=ax[1], label="F4")
ax[0].set_title('True distribution of QBox mean values'); ax[0].set_xlabel('QBox mean value')
ax[1].set_title('Predicted distribution of QBox mean values'); ax[1].set_xlabel('QBox mean value')
# plt.legend() only reaches the current (last) axes, which left the first panel
# without a legend; request one on each panel explicitly.
ax[0].legend(loc='best')
ax[1].legend(loc='best')
plt.show()
print()

In [None]:
# One panel per class (left F0, right F4), each comparing the density of the
# true QBox means with the density of the predicted ones.
fig, ax = plt.subplots(1, 2, figsize=(24, 6), sharey=True)
# kdeplot draws the same density curve as the deprecated
# distplot(..., hist=False), which is scheduled for removal from seaborn.
sns.kdeplot(true_reg[true_cls == 0], ax=ax[0], label="True")
sns.kdeplot(pred_reg[pred_cls == 0], ax=ax[0], label="Predicted")
sns.kdeplot(true_reg[true_cls == 1], ax=ax[1], label="True")
sns.kdeplot(pred_reg[pred_cls == 1], ax=ax[1], label="Predicted")
ax[0].set_title('QBox mean values distribution comparison of true and predicted for F0'); ax[0].set_xlabel('QBox mean value')
ax[1].set_title('QBox mean values distribution comparison of true and predicted for F4'); ax[1].set_xlabel('QBox mean value')
# plt.legend() only reaches the current (last) axes, which left the first panel
# without a legend; request one on each panel explicitly.
ax[0].legend(loc='best')
ax[1].legend(loc='best')
plt.show()
print()

In [None]:
# Grouped bars: number of true vs. predicted samples for each class.
fig, ax = plt.subplots(figsize=(12, 6))
data = pd.DataFrame({
    'Class': ['F0', 'F0', 'F4', 'F4'],
    # Order: true F0, predicted F0, true F4, predicted F4.
    'Count': [
        (labels == class_id).sum()
        for class_id in (0, 1)
        for labels in (true_cls, pred_cls)
    ],
    'Type': ['True', 'Predicted', 'True', 'Predicted'],
})
sns.barplot(data=data, x='Class', y='Count', hue='Type', ax=ax)
ax.legend(loc='best')

# Analysis of selected samples


In [None]:
# Score band used by the queries below: a class score strictly between min_mean
# and max_mean counts as low confidence. The std bounds are not referenced in
# the visible cells.
min_mean, max_mean = 0.4, 0.6
min_std, max_std = 0.05, 0.3

In [None]:
def get_image_description(folder_path: str):
    """Show the images of one sample together with its expected and predicted values.

    Reads ``lower.png``, ``radial_polar_area.png`` and ``circle.png`` from the
    sample folder as grayscale images, draws them, and prints next to them the
    ground-truth mean (from ``regression_ground_truth.json``), the expected
    class (name of the folder's parent directory) and the model outputs taken
    from the first ``results_frame`` row whose ``path`` equals ``folder_path``.

    Args:
        folder_path: Sample folder, exactly as stored in ``results_frame["path"]``.

    Raises:
        IndexError: If no row of ``results_frame`` has this path.
    """
    a_row = results_frame[results_frame["path"] == folder_path].iloc[0]
    folder_path = Path(folder_path)
    low_img = cv2.imread((folder_path / "lower.png").as_posix(), cv2.IMREAD_GRAYSCALE)
    rectangle_area = cv2.imread((folder_path / "radial_polar_area.png").as_posix(), cv2.IMREAD_GRAYSCALE)
    circle = cv2.imread((folder_path / "circle.png").as_posix(), cv2.IMREAD_GRAYSCALE)

    # Full-width view of the lower image.
    plt.figure(figsize=(24, 6))
    plt.imshow(low_img, cmap='gray')
    plt.axis('off')
    plt.show()

    # Second row: the two crops plus a text panel (labels left, values right).
    fig, ax = plt.subplots(1, 3, figsize=(24, 4))
    ax[0].imshow(rectangle_area, cmap='gray'); ax[0].set_axis_off()
    ax[1].imshow(circle, cmap='gray'); ax[1].set_axis_off()
    ax[2].text(0, 1,
               'Expected mean value:\n'
               'Expected class:\n\n'
               'Predicted mean value | std:\n'
               'Confidence | std:\n'
               'Predicted class:\n',
                verticalalignment='center', fontsize=13
    ); ax[2].set_axis_off()

    # `>=` makes a tie resolve to class 0, matching the np.argmax rule used for
    # `pred_classes`; with `>` a tie was reported here as class 1.
    predicted_class = 0 if a_row['f0_mean_pred'] >= a_row['f4_mean_pred'] else 1
    predicted_class_mean = a_row['f0_mean_pred'] if predicted_class == 0 else a_row['f4_mean_pred']
    predicted_class_std = a_row['f0_std_pred'] if predicted_class == 0 else a_row['f4_std_pred']

    # Same text as the former triple-quoted template, written with explicit
    # escapes so the whitespace-only line cannot be lost to editor trimming.
    result_template = (
        "\n"
        "    {:.4f}\n"
        "    {}\n"
        "    \n"
        "    {:.4f} | {:.4f}\n"
        "    {:.4f} | {:.4f}\n"
        "    {}\n"
        "    "
    )
    result_string = result_template.format(
        json.loads((folder_path / "regression_ground_truth.json").read_text())["mean"],
        int(folder_path.parent.name),
        a_row["reg_mean_pred"], a_row["reg_std_pred"],
        predicted_class_mean, predicted_class_std,
        predicted_class
    )

    ax[2].text(0.9, 1,
              result_string,
              verticalalignment='center',
              horizontalalignment='right',
              fontsize=13)
    plt.show()

#### Wrong predictions with a high confidence

In [None]:
# Misclassified rows where both class scores lie outside the
# (min_mean, max_mean) band, i.e. the model was confidently wrong.
partial_frame_bad_predictions_high_confidence = (
    results_frame[results_frame["cls_true"] != pred_classes]
    .query(
        f"(f4_mean_pred > {max_mean} | f4_mean_pred < {min_mean})"
        f" & (f0_mean_pred > {max_mean} | f0_mean_pred < {min_mean})"
    )
)
# Cell output: (confidently wrong, all wrong).
(
    len(partial_frame_bad_predictions_high_confidence),
    (results_frame["cls_true"] != pred_classes).sum(),
)

In [None]:
# Browser for the confidently wrong samples. Dropdown(index=0) raises on an
# empty option list — and this selection can well be empty — so the widget is
# only built when at least one sample matched.
_bad_high_confidence_paths = partial_frame_bad_predictions_high_confidence["path"].tolist()
if _bad_high_confidence_paths:
    bad_predictions_high_confidence_paths_widget = widgets.Dropdown(
        options=_bad_high_confidence_paths,
        index=0
    )
    interact(get_image_description, folder_path=bad_predictions_high_confidence_paths_widget)
else:
    bad_predictions_high_confidence_paths_widget = None
    print("No wrong predictions with a high confidence to display.")

#### Wrong predictions with a low confidence

In [None]:
# Misclassified rows where at least one class score falls strictly inside the
# (min_mean, max_mean) band.
partial_frame_bad_predictions_low_confidence = (
    results_frame[results_frame["cls_true"] != pred_classes]
    .query(
        f"(f4_mean_pred < {max_mean} & f4_mean_pred > {min_mean} )"
        f" | (f0_mean_pred < {max_mean} & f0_mean_pred > {min_mean})"
    )
)

In [None]:
# Browser for the wrong, low-confidence samples. Dropdown(index=0) raises on an
# empty option list, so the widget is only built when at least one sample matched.
_bad_low_confidence_paths = partial_frame_bad_predictions_low_confidence["path"].tolist()
if _bad_low_confidence_paths:
    bad_predictions_low_confidence_paths_widget = widgets.Dropdown(
        options=_bad_low_confidence_paths,
        index=0
    )
    interact(get_image_description, folder_path=bad_predictions_low_confidence_paths_widget)
else:
    bad_predictions_low_confidence_paths_widget = None
    print("No wrong predictions with a low confidence to display.")

#### Good predictions with a low confidence

In [None]:
# Correctly classified rows where at least one class score falls strictly
# inside the (min_mean, max_mean) band.
partial_frame_good_predictions_low_confidence = (
    results_frame[results_frame["cls_true"] == pred_classes]
    .query(
        f"(f4_mean_pred < {max_mean} & f4_mean_pred > {min_mean})"
        f" | (f0_mean_pred < {max_mean} & f0_mean_pred > {min_mean})"
    )
)

In [None]:
# Browser for the correct, low-confidence samples. Dropdown(index=0) raises on an
# empty option list, so the widget is only built when at least one sample matched.
_good_low_confidence_paths = partial_frame_good_predictions_low_confidence["path"].tolist()
if _good_low_confidence_paths:
    good_predictions_low_confidence_paths_widget = widgets.Dropdown(
        options=_good_low_confidence_paths,
        index=0
    )
    interact(get_image_description, folder_path=good_predictions_low_confidence_paths_widget)
else:
    good_predictions_low_confidence_paths_widget = None
    print("No good predictions with a low confidence to display.")