### Initial Data exploration and wrangling

##### Module import

In [1]:
import pandas as pd
import umap
from sklearn import preprocessing
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
import ast
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, matthews_corrcoef, f1_score, log_loss
from sklearn import metrics
from imblearn.metrics import geometric_mean_score
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')
import os
from PIL import Image
import math
import hdbscan
from sklearn.preprocessing import minmax_scale
import shutil

# define random seed
np.random.seed(42)

# define plot size
width=1100
height=1100


#### Dataset import and wrangling

In [2]:
path = 'D:/github/2dv50e/Data_train_test_split/1. Heart Disease/'
df_prob = pd.read_csv(path + 'topModelsProbabilities.csv')
df_mod = pd.read_csv(path + 'topModels.csv')

df_target_train = pd.read_csv(path + 'targetTrain.csv')
y_train = df_target_train.copy()
df_target_test = pd.read_csv(path + 'targetTest.csv')
y_test = df_target_test.copy()
df_source_train = pd.read_csv(path + 'datasetTrain.csv')
df_source_test = pd.read_csv(path + 'datasetTest.csv')

# Apply scaler to df_source
scaler = preprocessing.StandardScaler()

df_source_train_scaled = pd.DataFrame(scaler.fit_transform(df_source_train), columns=df_source_train.columns)
x_train = df_source_train_scaled.copy()
df_source_test_scaled = pd.DataFrame(scaler.fit_transform(df_source_test), columns=df_source_test.columns)
x_test = df_source_test_scaled.copy()

algo_nr = df_mod['algorithm_id']


In [3]:
# Print number of missing values per column in df_probabilities
for col in df_prob.columns:
    if df_prob[col].isnull().sum() > 0:
        print(f'{col} has {df_prob[col].isnull().sum()} missing values.')

In [4]:
algos = {1:'K-Nearest Neighbor', 2:'Support Vector Machine', 3:'Gaussian Naive Bayes', 4:'Multilayer Perceptron', 5:'Logistic Regression',
        6:'Linear Discriminant Analysis', 7:'Quadratic Discriminant Analysis', 8:'Random Forest', 9:'Extra Trees', 10:'Adaptive Boosting',
        11:'Gradient Boosting'}

symbols = ['circle', 'square', 'x', 'cross', 'diamond', 'star', 'hexagram', 'triangle-right', 'triangle-left', 'triangle-down', 'triangle-up']

In [5]:
df_model = df_mod.copy()
# rename columns in df_model
df_model.rename(columns={'params': 'hyperparameters', 'mean_test_accuracy': 'accuracy', 'mean_test_precision_weighted': 'precision', 
                        'mean_test_recall_weighted': 'recall', 'mean_test_roc_auc_ovo_weighted': 'roc_auc_score',
                        'geometric_mean_score_weighted': 'geometric_mean_score'}, inplace=True)
# remove overall perfromance column
df_model.drop(columns=['overall_performance'], inplace=True)
df_model['algorithm_name'] = df_model['algorithm_id'].map(algos)
df_model['overall_performance'] = round((df_model['accuracy'] + df_model['precision'] + df_model['recall'] + df_model['roc_auc_score'] + \
                                df_model['geometric_mean_score'] + df_model['matthews_corrcoef'] + df_model['f1_weighted']) / 7, 2)
# Sort columns
df_model = df_model[['model_id', 'algorithm_id', 'algorithm_name', 'accuracy', 'precision', 'recall', 'roc_auc_score',
                        'geometric_mean_score', 'matthews_corrcoef', 'f1_weighted', 'log_loss', 'overall_performance', 'hyperparameters']]


# create new pandas series with total number of models
prob_base = []

for i in range(df_prob.shape[0]):
    prob = []
    # calculate the confidence interval for each algorithm
    for n in range(df_prob.shape[1]):
            prob.append(df_prob.iloc[i, n])
    prob_average = np.mean(prob).round(2)
    prob_base.append(prob_average)
    # add prob_meta to df_model_meta as new column "average_probability"
    df_model['average_probability'] = pd.Series(prob_base)

df_model['rank'] = df_model['overall_performance'] * df_model['average_probability'] / 10000
# convert mathhews_corrcoef to absolute values
df_model['matthews_corrcoef'] = df_model['matthews_corrcoef'].abs()

Function to extract hyperparameters from the best performing model per algorithm

In [6]:
def best_params(df):
      # Select hyperparameters for best model in each algorithm
      df_best = df.groupby('algorithm_id').apply(lambda x: x.sort_values('overall_performance', ascending=False).iloc[0])
      # reset algorithm_nr as  index
      df_best = df_best.reset_index(drop=True)
      # keep only algorithm number, name, performance and hyperparameters
      df_best = df_best[['algorithm_id', 'algorithm_name', 'overall_performance', 'hyperparameters']]
      # rename overall_performance as performance
      df_best.rename(columns={'overall_performance': 'performance'}, inplace=True)
      return df_best

Supporting functions to extract hyperparameters values

In [7]:
# Return hyperparameters based on algorithm name
def get_hyperparameters(df, algorithm_name):
    return df[df['algorithm_name'] == algorithm_name]['hyperparameters']

# return value of key in dictionary
def get_value(dictionary, key):
    return dictionary[key]

# convert string to dictionary
def string_to_dict(string):
    return ast.literal_eval(string)

##### Metamodel estimators (top 1 per algorithm)
Hyperparameters will be extracted from the best performing model per algorithm. <br> The plan is to extract hyperparameters from the best performing model per algorithm and then apply these hyperparameters to the metamodels. Currently no functuonality to be added to allow end user to tune the hyperparameters for metamodels, instead they will rely on top performing ones from base layer models.

In [8]:
# Extract the top models from the dataframe with the respective hyperparameters
meta_params = best_params(df_model)

# return unique values of algorithm name
algo = ['knn', 'svm', 'gnb', 'mlp', 'lr', 'lda', 'qda', 'rf', 'et', 'ab', 'gb']
# convert all values in algo to capital letters
algo_cap = [i.upper() for i in algo]

algo_names = meta_params['algorithm_name'].copy()

# iterate through key and value pairs in algo
i = 0
for k,v in zip(algo, algo_names):
    # Define hyperparameters for each  final estimator, based on top performing model from base layer
    temp = get_hyperparameters(meta_params, v)
    # convert to dictionary for easy access
    temp_dict = {int(k):v for k,v in temp.items()}
    # return first value
    temp_dict = temp_dict[i]
    # convert to dictionary for easy access, assign varibale name per algorithm
    globals()[k + '_best_params'] = ast.literal_eval(temp_dict)
    i += 1

# Add final estimators
final_estimators = [
                ('knn', KNeighborsClassifier(algorithm=get_value(knn_best_params, 'algorithm'), metric=get_value(knn_best_params, 'metric'), 
                n_neighbors= get_value(knn_best_params, 'n_neighbors'), weights=get_value(knn_best_params, 'weights'), n_jobs=-1)),
                ('svm', SVC(C=get_value(svm_best_params, 'C'), kernel=get_value(svm_best_params, 'kernel'),probability=True, random_state=42)),
                ('gnb', GaussianNB(var_smoothing=get_value(gnb_best_params, 'var_smoothing'))),
                ('mlp', MLPClassifier(activation=get_value(mlp_best_params, 'activation'), alpha=get_value(mlp_best_params, 'alpha'), 
                max_iter=get_value(mlp_best_params, 'max_iter'), solver=get_value(mlp_best_params, 'solver'), tol=get_value(mlp_best_params, 'tol'), 
                random_state=42)),
                ('lr', LogisticRegression(C=get_value(lr_best_params, 'C'), max_iter=get_value(lr_best_params, 'max_iter'),
                penalty=get_value(lr_best_params, 'penalty'), solver=get_value(lr_best_params, 'solver'), random_state=42, n_jobs=-1)),
                ('lda', LinearDiscriminantAnalysis(shrinkage=get_value(lda_best_params, 'shrinkage'), solver=get_value(lda_best_params, 'solver'))),
                ('qda', QuadraticDiscriminantAnalysis(reg_param=get_value(qda_best_params, 'reg_param'), tol=get_value(qda_best_params, 'tol'))),
                ('rf', RandomForestClassifier(criterion=get_value(rf_best_params, 'criterion'), n_estimators=get_value(rf_best_params, 'n_estimators'), 
                random_state=42, n_jobs=-1)),
                ('et', ExtraTreesClassifier(criterion=get_value(et_best_params, 'criterion'), n_estimators=get_value(et_best_params, 'n_estimators'), 
                random_state=42, n_jobs=-1)),
                ('ab', AdaBoostClassifier(algorithm=get_value(ab_best_params, 'algorithm'), learning_rate=get_value(ab_best_params, 'learning_rate'), 
                n_estimators=get_value(ab_best_params, 'n_estimators'), random_state=42)),
                ('gb', GradientBoostingClassifier(learning_rate=get_value(gb_best_params, 'learning_rate'), 
                n_estimators=get_value(gb_best_params, 'n_estimators'), random_state=42))
                ]

### HDBSCAN

Testing on probabilities dataset

In [9]:
# create new dataframe with with columns min_cluster_size, min_samples and number_outliers
df_cluster = pd.DataFrame(columns=['min_cluster_size', 'min_samples', 'metric', 'n_clusters', 'DBVC', 'Coverage'])

row = 0

# Apply hdbscan to df_prob
for i in [3,4,5,6, 7, 8]:
    for j in [5, 10, 15, 20, 25, 30, 40, 50]:
        for metric in ['euclidean', 'manhattan', 'chebyshev']:
            clusterer = hdbscan.HDBSCAN(min_cluster_size=i, min_samples=j, metric=metric, gen_min_span_tree=True)
            clusterer.fit(df_prob)
            labels = clusterer.labels_
            cnts = pd.DataFrame(labels)[0].value_counts()
            cnts = cnts.reset_index()
            cnts.columns = ['cluster','count']
            n_cluster = cnts.cluster.nunique()
            # get DBVC
            DBVC = clusterer.relative_validity_

            clustered = (labels >= 0)

            coverage = np.sum(clustered) / df_prob.shape[0]
            # add values to dataframe
            df_cluster.loc[row] = [i, j, metric, n_cluster, DBVC, coverage]

            row += 1

# add column with multiplication of DBVC and coverage
df_cluster['DBVC_Coverage'] = df_cluster['DBVC'] * df_cluster['Coverage']
# sort dataframe by DBVC_coverage
df_cluster = df_cluster.sort_values('DBVC_Coverage', ascending=False)
# group by DBVC_coverage
df_cluster = df_cluster.groupby('DBVC_Coverage').apply(lambda x: x.sort_values('DBVC', ascending=False).iloc[0])
# drop index and 
df_cluster = df_cluster.reset_index(drop=True)

# # keep only rows with number of clusters 4 or 5
# df_cluster = df_cluster[df_cluster['n_clusters'].isin([4,5])]
# sort by DBVC_coverage
df_cluster = df_cluster.sort_values('DBVC_Coverage', ascending=False)


# get min_cluster_size and min_samples and from top row
min_cluster_size = int(df_cluster.iloc[0]['min_cluster_size'])
min_samples = int(df_cluster.iloc[0]['min_samples'])
# get metric from top row
metric = df_cluster.iloc[0]['metric']

clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples, metric=metric, gen_min_span_tree=True)
clusterer.fit(df_prob)
labels = clusterer.labels_

cnts = pd.DataFrame(labels)[0].value_counts()
cnts = cnts.reset_index()
cnts.columns = ['cluster','count']
# sort by cluster
cnts = cnts.sort_values('cluster')

# add labels to df_model
df_model['labels'] = labels
df_prob['labels'] = labels


In [10]:
test = df_cluster.head().style.set_properties(**{'text-align': 'center'}).hide_index()
test

min_cluster_size,min_samples,metric,n_clusters,DBVC,Coverage,DBVC_Coverage
3,5,manhattan,5,0.2066,0.636364,0.131473
4,5,manhattan,4,0.171827,0.581818,0.099972
3,5,chebyshev,4,0.196133,0.472727,0.092718
8,5,chebyshev,3,0.155424,0.4,0.06217
5,5,euclidean,3,0.09238,0.418182,0.038632


In [11]:
df_cluster.head()

Unnamed: 0,min_cluster_size,min_samples,metric,n_clusters,DBVC,Coverage,DBVC_Coverage
9,3,5,manhattan,5,0.2066,0.636364,0.131473
8,4,5,manhattan,4,0.171827,0.581818,0.099972
7,3,5,chebyshev,4,0.196133,0.472727,0.092718
6,8,5,chebyshev,3,0.155424,0.4,0.06217
5,5,5,euclidean,3,0.09238,0.418182,0.038632


In [12]:
for i in range(0, len(cnts)):
    if cnts['cluster'][i] == -1:
        cnts['cluster'][i] = '-1 (outliers)'

cnts
# grey out row with outliers
cnts_style = cnts.style.apply(lambda x: ['background: lightgrey' if x.cluster == '-1 (outliers)' else '' for i in x], axis=1)
# do not show index
cnts_style = cnts_style.set_properties(**{'text-align': 'center'}).hide_index()

cnts_style

cluster,count
-1 (outliers),20
0,3
1,10
2,12
3,10


In [13]:
# sort dataframes by labels
df_model = df_model.sort_values('labels')
df_prob = df_prob.reindex(df_model.index)

# reset index for df_model and df_prob
df_model = df_model.reset_index(drop=True)
df_prob = df_prob.reset_index(drop=True)


In [14]:
# create empty dictionaries to store results fro all clusters of basemodels and metamodels results
df_model_dict = {}
df_prob_dict = {}
df_pred_dict = {}

df_model_dict_meta = {}
df_prob_dict_meta = {}
df_pred_dict_meta = {}

algo_dict = {}
algo_names_dict = {}

# create new dictionary with df_model name and content
df_model_dict = {'df_model_all': df_model}
df_model_dict_meta = {'df_model_all_meta': None}

# save df_model subsets based on labels
for label in df_model.labels.unique():
    if label == -1:
        # append to dictionary
        df_model_dict['df_model_outliers'] = df_model[df_model['labels'] == -1]
        df_model_dict_meta['df_model_outliers_meta'] = None
    else:
        # append to dictionary
        df_model_dict[f'df_model_cluster_{label}'] = df_model[df_model['labels'] == label]
        df_model_dict_meta[f'df_model_cluster_{label}_meta'] = None


# create new dictionary with df_prob name and content
df_prob_dict = {'df_prob_all': df_prob}
df_prob_dict_meta = {'df_prob_all_meta': None}

# save df_prob subsets based on labels
for label in df_prob.labels.unique():
    if label == -1:
        # append to dictionary
        df_prob_dict['df_prob_outliers'] = df_prob[df_prob['labels'] == -1]
        df_prob_dict_meta['df_prob_outliers_meta'] = None
    else:
        # append to dictionary
        df_prob_dict[f'df_prob_cluster_{label}'] = df_prob[df_prob['labels'] == label]
        df_prob_dict_meta[f'df_prob_cluster_{label}_meta'] = None

# drop label column from all dataframes in df_prob_dict
for key in df_prob_dict.keys():
    df_prob_dict[key] = df_prob_dict[key].drop(columns=['labels'])


# create new dictionary with predicitons
df_pred_dict = {'df_pred_all': None}
df_pred_dict_meta = {'df_pred_all_meta': None}
# save df_prob subsets based on labels
for label in df_prob.labels.unique():
    if label == -1:
        # append to dictionary
        df_pred_dict['df_pred_outliers'] = None
        df_pred_dict_meta['df_pred_outliers_meta'] = None
    else:
        # append to dictionary
        df_pred_dict[f'df_pred_cluster_{label}'] = None
        df_pred_dict_meta[f'df_pred_cluster_{label}_meta'] = None

# create new dictionary with predicitons
algo_dict = {'algo_all': None}
# save df_prob subsets based on labels
for label in df_prob.labels.unique():
    if label == -1:
        # append to dictionary
        algo_dict['algo_outliers'] = None
    else:
        # append to dictionary
        algo_dict[f'algo_cluster_{label}'] = None

# create new dictionary with predicitons
algo_names_dict = {'algo_all': None}
# save df_prob subsets based on labels
for label in df_prob.labels.unique():
    if label == -1:
        # append to dictionary
        algo_names_dict['algo_outliers'] = None
    else:
        # append to dictionary
        algo_names_dict[f'algo_cluster_{label}'] = None

In [15]:
for key_model, key_prob, key_pred, key_algo, key_name_algo in zip (df_model_dict.keys(), df_prob_dict.keys(), 
                                                                df_pred_dict.keys(), algo_dict.keys(), algo_names_dict.keys()):
    print(key_model)
    print(df_model_dict[key_model].shape)

    # dictionaries with algorithm names and their hyperparameters for all models
    knn_params = {int(k):v for k,v in get_hyperparameters(df_model_dict[key_model], 'K-Nearest Neighbor').items()}
    svm_params = {int(k):v for k,v in get_hyperparameters(df_model_dict[key_model], 'Support Vector Machine').items()}
    gnb_params = {int(k):v for k,v in get_hyperparameters(df_model_dict[key_model], 'Gaussian Naive Bayes').items()}
    mlp_params = {int(k):v for k,v in get_hyperparameters(df_model_dict[key_model], 'Multilayer Perceptron').items()}
    lr_params = {int(k):v for k,v in get_hyperparameters(df_model_dict[key_model], 'Logistic Regression').items()}
    lda_params = {int(k):v for k,v in get_hyperparameters(df_model_dict[key_model], 'Linear Discriminant Analysis').items()}
    qda_params = {int(k):v for k,v in get_hyperparameters(df_model_dict[key_model], 'Quadratic Discriminant Analysis').items()}
    rf_params = {int(k):v for k,v in get_hyperparameters(df_model_dict[key_model], 'Random Forest').items()}
    et_params = {int(k):v for k,v in get_hyperparameters(df_model_dict[key_model], 'Extra Trees').items()}
    ab_params = {int(k):v for k,v in get_hyperparameters(df_model_dict[key_model], 'Adaptive Boosting').items()}
    gb_params = {int(k):v for k,v in get_hyperparameters(df_model_dict[key_model], 'Gradient Boosting').items()}

    # Estimators and hyperparameters for each algorithm per df_model_cluster

    estimators = []
    # populate list of estimators with all 55 top models
    count = 0
    for i in list(knn_params.keys()):
                    temp_dict = ast.literal_eval(knn_params[i])
                    estimators.append((f'knn_{count}', KNeighborsClassifier(algorithm=get_value(temp_dict, 'algorithm'), 
                                    metric=get_value(temp_dict, 'metric'), n_neighbors= get_value(temp_dict, 'n_neighbors'), 
                                    weights=get_value(temp_dict, 'weights'), n_jobs=-1)))
                    count += 1
    count = 0
    for i in list(svm_params.keys()):
                    temp_dict = ast.literal_eval(svm_params[i])
                    estimators.append((f'svm_{count}', SVC(C=get_value(temp_dict, 'C'), kernel=get_value(temp_dict, 'kernel'),probability=True, random_state=42)))
                    count += 1
    count = 0
    for i in list(gnb_params.keys()):
                    temp_dict = ast.literal_eval(gnb_params[i])
                    estimators.append((f'gnb_{count}', GaussianNB(var_smoothing=get_value(temp_dict, 'var_smoothing'))))
                    count += 1
    count = 0
    for i in list(mlp_params.keys()):
                    temp_dict = ast.literal_eval(mlp_params[i])
                    estimators.append((f'mlp_{count}', MLPClassifier(activation=get_value(temp_dict, 'activation'), 
                                    alpha=get_value(temp_dict, 'alpha'),
                                    max_iter=get_value(temp_dict, 'max_iter'), solver=get_value(temp_dict, 'solver'), 
                                    tol=get_value(temp_dict, 'tol'), random_state=42)))
                    count += 1
    count = 0
    for i in list(lr_params.keys()):
                    temp_dict = ast.literal_eval(lr_params[i])
                    estimators.append((f'lr_{count}', LogisticRegression(C=get_value(temp_dict, 'C'), max_iter=get_value(temp_dict, 'max_iter'), penalty=get_value(temp_dict, 'penalty'), solver=get_value(temp_dict, 'solver'), random_state=42, n_jobs=-1)))
                    count += 1
    count = 0
    for i in list(lda_params.keys()):
                    temp_dict = ast.literal_eval(lda_params[i])
                    estimators.append((f'lda_{count}', LinearDiscriminantAnalysis(shrinkage=get_value(temp_dict, 'shrinkage'), 
                                    solver=get_value(temp_dict, 'solver'))))
                    count += 1
    count = 0
    for i in list(qda_params.keys()):
                    temp_dict = ast.literal_eval(qda_params[i])
                    estimators.append((f'qda_{count}', QuadraticDiscriminantAnalysis(reg_param=get_value(temp_dict, 'reg_param'), tol=get_value(temp_dict, 'tol'))))
                    count += 1
    count = 0
    for i in list(rf_params.keys()):
                    temp_dict = ast.literal_eval(rf_params[i])
                    estimators.append((f'rf_{count}', RandomForestClassifier(criterion=get_value(temp_dict, 'criterion'), 
                                    n_estimators=get_value(temp_dict, 'n_estimators'), random_state=42, n_jobs=-1)))
                    count += 1
    count = 0
    for i in list(et_params.keys()):
                    temp_dict = ast.literal_eval(et_params[i])
                    estimators.append((f'et_{count}', ExtraTreesClassifier(criterion=get_value(temp_dict, 'criterion'),
                                    n_estimators=get_value(temp_dict, 'n_estimators'), random_state=42, n_jobs=-1)))
                    count += 1
    count = 0
    for i in list(ab_params.keys()):
                    temp_dict = ast.literal_eval(ab_params[i])
                    estimators.append((f'ab_{count}', AdaBoostClassifier(algorithm=get_value(temp_dict, 'algorithm'), 
                                    learning_rate=get_value(temp_dict, 'learning_rate'), n_estimators=get_value(temp_dict, 'n_estimators'), random_state=42)))
                    count += 1
    count = 0
    for i in list(gb_params.keys()):
                    temp_dict = ast.literal_eval(gb_params[i])
                    # update criterion as mae is deprecated
                    if get_value(temp_dict, 'criterion') == 'mae':
                        estimators.append((f'gb_{count}', GradientBoostingClassifier(criterion='squared_error', 
                                    learning_rate=get_value(temp_dict, 'learning_rate'), n_estimators=get_value(temp_dict, 'n_estimators'), random_state=42)))
                    else:
                        estimators.append((f'gb_{count}', GradientBoostingClassifier(criterion=get_value(temp_dict, 'criterion'), 
                                    learning_rate=get_value(temp_dict, 'learning_rate'), n_estimators=get_value(temp_dict, 'n_estimators'), random_state=42)))
                    count += 1

    
    # Please note that for the average metric we did not include the logloss as using normalized version of logloss introduces bias to dataset

    # create dataframe for df_model_meta with columns names from df_model
    df_model_meta = pd.DataFrame(columns=df_model.columns)
    # drop average probability and rank columns from df_model_meta
    df_model_meta = df_model_meta.drop(['average_probability', 'rank'], axis=1)

    # create dataframes for meta models probabilities and predicted values
    df_pred_meta = pd.DataFrame()
    df_prob_meta = pd.DataFrame()

    for x in range(0, len(final_estimators)):
            final_estimator = final_estimators[x][1]
            clf = StackingClassifier(estimators=estimators, final_estimator=final_estimator, n_jobs=-1, cv=None)
            clf.fit(x_train, y_train)
            y_pred = clf.predict(x_test)
            y_pred = pd.Series(y_pred)

            # probabilities dataframe
            y_pred_prob = clf.predict_proba(x_test)
            y_pred_prob_df = pd.DataFrame(y_pred_prob, columns=['prob_0', 'prob_1'])
            y_pred_prob_df['target'] = y_test
            y_pred_prob_df['predicted'] = y_pred
            # create new column, set value to prob_0 if target is 0 and prob_1 if target is 1
            y_pred_prob_df['pred_prob'] = np.where(y_pred_prob_df['target'] == 0, y_pred_prob_df['prob_0'], y_pred_prob_df['prob_1'])
            # remove prob_0 and prob_1 columns, target and predicted columns
            y_pred_prob_df = y_pred_prob_df.drop(['prob_0', 'prob_1', 'target', 'predicted'], axis=1)
            # transpose the data frame and convert values to %
            y_pred_prob_df = y_pred_prob_df.T
            y_pred_prob_df = y_pred_prob_df.apply(lambda x: x * 100).round(2)
            # set index to x
            y_pred_prob_df.index = [x]
            # add row to df_prob_meta using pd.concat
            df_prob_meta = pd.concat([df_prob_meta, y_pred_prob_df], axis=0)
            
            # prediction dataframe
            # transpose the data frame
            y_pred_df = pd.DataFrame(y_pred, columns=['Pred_class'])
            y_pred_df = y_pred_df.T
            # set index to x
            y_pred_df.index = [x]
            # add row to df_prob_meta using pd.concat
            df_pred_meta = pd.concat([df_pred_meta, y_pred_df], axis=0)

            # performance metrics dataframe
            accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
            precision = round(precision_score(y_test, y_pred, average='weighted')*100, 2)
            recall = round(recall_score(y_test, y_pred, average='weighted')*100, 2)
            roc_auc = round(roc_auc_score(y_test, y_pred, average='weighted')*100, 2)
            gmean = round(geometric_mean_score(y_test, y_pred, average='weighted')*100, 2)
            mcc = round(matthews_corrcoef(y_test, y_pred)*100, 2)
            f1_weighted = round(f1_score(y_test, y_pred, average='weighted')*100, 2)
            log_loss = round(metrics.log_loss(y_test, y_pred, normalize=True)*100, 2)
            average_metrics = (accuracy + precision + recall + roc_auc + gmean + mcc + f1_weighted) / 7
            average_metrics = round(average_metrics, 2)
            # add performance metrics to df_model_meta using pd.concat with index
            df_model_meta = pd.concat([df_model_meta, pd.DataFrame([['meta', x+1, algos[x+1], accuracy, precision, recall, roc_auc, gmean, \
                            mcc, f1_weighted, log_loss, average_metrics, f'{final_estimator.get_params()}', 'meta']], 
                            columns=df_model_meta.columns, index=[x])], axis=0)
    df_model_meta['matthews_corrcoef'] = df_model_meta['matthews_corrcoef'].abs()

    # Adding average probability for each model

    df_prob_meta_t = df_prob_meta.transpose()

    # total number of predictions
    n_total = df_prob_meta_t.shape[0]

    # create new pandas series with total number of models
    prob_meta = []

    for i in range(11):
        prob = []
        # calculate the confidence interval for each algorithm
        for n in range(n_total):
                prob.append(df_prob_meta_t.iloc[n, i])
        prob_average = np.mean(prob).round(2)
        # add prob_average to prob_meta
        prob_meta.append(prob_average)

    # add prob_meta to df_model_meta as new column "average_probability"
    df_model_meta['average_probability'] = pd.Series(prob_meta)

    df_model_meta['rank'] = df_model_meta['overall_performance'] * df_model_meta['average_probability'] / 10000
    # df_model_meta sort by overall performance and average probability in descending order
    df_model_meta = df_model_meta.sort_values(by=['rank'], ascending=False)
    # sort values in df_prob_meta by df_temp index
    df_prob_meta = df_prob_meta.reindex(df_model_meta.index)
    # sort values in df_pred_meta by df_temp index
    df_pred_meta = df_pred_meta.reindex(df_model_meta.index)
    # sort values in algo by df_temp index
    algo_meta = [algo[i] for i in df_model_meta.index].copy()
    # sort values in algo_names by df_temp index
    algo_names_meta = [algo_names[i] for i in df_model_meta.index].copy()


    # reset indexes for all dataframes
    df_model_meta = df_model_meta.reset_index(drop=True)
    df_prob_meta = df_prob_meta.reset_index(drop=True)
    df_pred_meta = df_pred_meta.reset_index(drop=True)

    for x in range(11):
        df_model_meta.model_id[x] = f'meta_{x+1}'

    df_model_dict_meta[f'{key_model}_meta'] = df_model_meta
    df_prob_dict_meta[f'{key_prob}_meta'] = df_prob_meta
    df_pred_dict_meta[f'{key_pred}_meta'] = df_pred_meta
    algo_dict[f'{key_algo}'] = algo_meta
    algo_names_dict[f'{key_name_algo}'] = algo_names_meta
    

df_model_all
(55, 16)
df_model_outliers
(20, 16)
df_model_cluster_0
(3, 16)
df_model_cluster_1
(10, 16)
df_model_cluster_2
(12, 16)
df_model_cluster_3
(10, 16)


In [16]:
for key in df_model_dict.keys():
    # print number of models
    print(f'{key} has {df_model_dict[key].shape[0]} models')
    

df_model_all has 55 models
df_model_outliers has 20 models
df_model_cluster_0 has 3 models
df_model_cluster_1 has 10 models
df_model_cluster_2 has 12 models
df_model_cluster_3 has 10 models


In [17]:
# create empty dataframe
df_top_rows = pd.DataFrame(columns=df_model_dict_meta['df_model_all_meta'].columns)
df_top_rows['cluster'] = None

for key in df_model_dict_meta.keys():

    string = key.replace('df_model_', '')
    string = string.replace('_meta', '')

    # return row from f_model_dict_meta[key] with highest rank
    highest_rank = df_model_dict_meta[key].sort_values(by=['rank'], ascending=False)['rank'].iloc[0]
    rank_df = df_model_dict_meta[key][df_model_dict_meta[key]['rank'] == highest_rank]

    key_base = key.replace('_meta', '')

    rank_df['cluster'] = f'{string} ({df_model_dict[key_base].shape[0]} models)'

    df_top_rows = pd.concat([df_top_rows, rank_df], axis=0)

# sort by rank
df_top_rows = df_top_rows.sort_values(by=['rank'], ascending=False)
# reset index
df_top_rows = df_top_rows.reset_index(drop=True)
# add 'models' if cluster is 'all'
df_top_rows['cluster'] = df_top_rows['cluster'].apply(lambda x: x + '_models' if x == 'all' else x)

df_top_rows

Unnamed: 0,model_id,algorithm_id,algorithm_name,accuracy,precision,recall,roc_auc_score,geometric_mean_score,matthews_corrcoef,f1_weighted,log_loss,overall_performance,hyperparameters,labels,average_probability,rank,cluster
0,meta_1,3,Gaussian Naive Bayes,86.89,86.89,86.89,86.8,86.8,73.59,86.89,452.97,84.96,"{'priors': None, 'var_smoothing': 0.0}",meta,86.75,0.737028,all (55 models)
1,meta_1,3,Gaussian Naive Bayes,86.89,86.89,86.89,86.8,86.8,73.59,86.89,452.97,84.96,"{'priors': None, 'var_smoothing': 0.0}",meta,85.0,0.72216,cluster_0 (3 models)
2,meta_1,3,Gaussian Naive Bayes,85.25,85.48,85.25,84.74,84.74,70.4,85.15,509.59,83.0,"{'priors': None, 'var_smoothing': 0.0}",meta,85.08,0.706164,cluster_2 (12 models)
3,meta_1,3,Gaussian Naive Bayes,83.61,83.66,83.61,83.23,83.22,66.96,83.54,566.21,81.12,"{'priors': None, 'var_smoothing': 0.0}",meta,84.31,0.683923,outliers (20 models)
4,meta_1,3,Gaussian Naive Bayes,81.97,82.14,81.97,81.44,81.44,63.71,81.85,622.84,79.22,"{'priors': None, 'var_smoothing': 0.0}",meta,82.57,0.65412,cluster_3 (10 models)
5,meta_1,3,Gaussian Naive Bayes,81.97,81.95,81.97,81.71,81.71,63.63,81.94,622.84,79.27,"{'priors': None, 'var_smoothing': 0.0}",meta,81.99,0.649935,cluster_1 (10 models)


In [18]:
fig = go.Figure()

# https://plotly.com/python/horizontal-bar-charts/

metrics = ['accuracy', 'precision', 'recall', 'roc_auc_score', 'geometric_mean_score', 'matthews_corrcoef', 'f1_weighted', 'average_probability']
metrics_legend = ['Accuracy', 'Precision', 'Recall', 'ROC AUC', 'Geometric Mean', 'Matthews CorrCoeff', 'F1 Score', 'Confidence']
colors = ['#c6dfcd', '#f5f2d3', '#e6d5c3', '#c9a8a0', '#737d89', '#869f9f', '#a3a0b8', '#a7bed3']

# add cluster all row
for i in range (8):
    value = df_top_rows[f'{metrics[i]}']
    if metrics[i] != 'average_probability':
        fig.add_trace(go.Bar(y=df_top_rows.cluster, x=value, name=metrics_legend[i], orientation='h', 
                            text= np.round(value/100, 4), marker=dict(color=colors[i], line=dict(color=colors[i], width=3))))
    else:
        fig.add_trace(go.Bar(y=df_top_rows.cluster, x=value*7, name=metrics_legend[i], orientation='h', 
                            text= np.round(value/100, 4), marker=dict(color=colors[i], line=dict(color=colors[i], width=3)))) 


# update x values size
fig.update_layout(yaxis_tickfont_size=12, xaxis_tickfont_size=12, title_text='Model Performance', title_x=0.5)

fig.update_layout(barmode='stack', paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)',
    margin=dict(l=0, r=0, t=0, b=0), showlegend=True, yaxis=dict(categoryorder = 'total ascending'))


# add title for x axes and y axes
fig.update_xaxes(title_text='Overall Performance', title_font=dict(size=14))
fig.update_yaxes(title_text='Cluster', title_font=dict(size=14))

# update legend names
fig.update_layout(legend_title_text='Metrics')
fig.update_traces(textposition='inside')
fig.update_traces(insidetextanchor='middle')
# add % to text
fig.update_traces(texttemplate='%{text:.2%}')

# get total length of bars
total_length = 0
for i in fig.data:
    total_length += i.x[0]

fig.add_annotation(xref='x', yref='y', x=total_length, y=df_top_rows.shape[0], text='Best Performing Metamodel', font = dict(size = 14), showarrow=False)

for i in range (df_top_rows.shape[0]):
    fig.add_annotation(xref='x', yref='y', x=total_length, y=df_top_rows.cluster[i], text=df_top_rows.algorithm_name[i], font = dict(size = 12), showarrow=False)


# update figure size



fig.update_layout(width=1200, height=250)

fig.show()

UMAP function

In [19]:
def umap_model(df, parameter_umap_n_neighbors = 5, parameter_umap_min_dist =  0.5, parameter_umap_metric = 'euclidean'):
        umap_model = umap.UMAP(n_neighbors=parameter_umap_n_neighbors, metric=parameter_umap_metric, min_dist=parameter_umap_min_dist)
        #fit transform and convert to dataframe
        df_umap = pd.DataFrame(umap_model.fit_transform(df), columns=['UMAP_1', 'UMAP_2'])
        # add index to df_umap
        df_umap.index = df.index
        return df_umap

def create_layout_button(n, neighb = 8):
        list = [False] * (neighb*11 + 1)
        list[-1] = True
        if n == 0:
                list[n] = True
                for i in range(1, 11):
                        list[i] = True
        else:
                list[11*n] = True
                for i in range(11*n, 11*n + 11):
                        list[i] = True
        return dict(label = f'number of neighbours {n+3}',
                        method = 'restyle',
                        args = [{'visible': list}])

Function to calculate the model comparison for 2 metamodels and plot it

In [20]:
'''
Function compares 2 metamodels abd plots the overall coverage, using following predefined colors:
White box: both models predict the correct class
Red box: 1st model predicts the correct class, 2nd model predicts the wrong class and combination predicts correct class
Light red box: 1st model predicts the correct class, 2nd model predicts the wrong class and combination predicts wrong class
Blue box: 1st model predicts the wrong class, while the 2nd model predicts the correct class and combination predicts correct class
Light blue box: 1st model predicts the wrong class, while the 2nd model predicts the correct class and combination predicts correct class
Black box: both models predict the wrong class
'''
def coverage(df, meta_1, meta_2):

    # save meta_1, meta_2 and target in a new dataframe
    df_temp = df[[f'{meta_1}', f'{meta_2}']].copy()
    # rename columns to meta_1 and meta_2
    df_temp.columns.values[0:2] = [f'meta_{i}' for i in range(1, 3)]
    df_temp['mean'] = np.round((df_temp.meta_1 + df_temp.meta_2) / 2, 2)

    # get square root of count length and round to integer. That is to define the number of rows and columns in the plot
    n = int(np.ceil(np.sqrt(df_temp.shape[0])))

    df_n = pd.DataFrame(np.zeros((n**2 - df_temp.shape[0], 3)))
    # replace all values with nan
    df_n.iloc[:, :] = np.nan

    df_n.columns = ['meta_1', 'meta_2', 'mean']

    # concatinate df_temp and df_n
    df_temp = pd.concat([df_temp, df_n], axis=0)
    # reset index
    df_temp = df_temp.reset_index(drop=True)

    # concate values form meta_1, meta_2 and mean as a list in a new column
    df_temp['combination'] = df_temp[['meta_1', 'meta_2', 'mean']].values.tolist()

    # create new dataframe with  with 8 rows and 8 columns
    df_count = pd.DataFrame(df_temp.combination.values.reshape(n, n))

    fig = go.Figure()
    fig = make_subplots(rows=n, cols=n, vertical_spacing=0.02, horizontal_spacing=0.02, shared_xaxes='all', shared_yaxes='all')

    # define subplot size
    fig.update_layout(autosize=False, margin={'l': 0, 'r': 0, 't': 0, 'b': 50}, width=500, height=500)

    # Set axes ranges
    fig.update_xaxes(range=[0, 1])
    fig.update_yaxes(range=[0, 1])

    for i in range(n):
        for j in range(n):
            fig.add_shape(type='circle', x0=0, y0=0, x1=1, y1=1, line=dict(width=2), row=i+1, col=j+1)
            fig.update_shapes(
                fillcolor='#ffffff' if (df_count.iloc[i, j][0] >= 50 and df_count.iloc[i, j][1] >= 50) # white
                else '#df9797' if (df_count.iloc[i, j][0] >= 50 and df_count.iloc[i, j][1] < 50 and  df_count.iloc[i, j][2] >= 50) # light red
                else '#cd5c5c' if (df_count.iloc[i, j][0] >= 50 and df_count.iloc[i, j][1] < 50 and  df_count.iloc[i, j][2] < 50) # dark red
                else '#91bad6' if (df_count.iloc[i, j][0] < 50 and df_count.iloc[i, j][1] >= 50 and  df_count.iloc[i, j][2] >= 50) # light blue
                else '#2e5984' if (df_count.iloc[i, j][0] < 50 and df_count.iloc[i, j][1] >= 50 and  df_count.iloc[i, j][2] < 50) # dark blue
                else '#ffd700' if (df_count.iloc[i, j][0] < 50 and df_count.iloc[i, j][1] < 50) # yellow
                else '#ffffff', row=i+1, col=j+1) # white
            fig.update_shapes(
                line=dict(color='#675c57' if (df_count.iloc[i, j][0] >= 50 and df_count.iloc[i, j][1] >= 50)
                else '#df9797' if (df_count.iloc[i, j][0] >= 50 and df_count.iloc[i, j][1] < 50 and  df_count.iloc[i, j][2] >= 50)
                else '#cd5c5c' if (df_count.iloc[i, j][0] >= 50 and df_count.iloc[i, j][1] < 50 and  df_count.iloc[i, j][2] < 50)
                else '#91bad6' if (df_count.iloc[i, j][0] < 50 and df_count.iloc[i, j][1] >= 50 and  df_count.iloc[i, j][2] >= 50) 
                else '#2e5984' if (df_count.iloc[i, j][0] < 50 and df_count.iloc[i, j][1] >= 50 and  df_count.iloc[i, j][2] < 50)
                else '#ffd700' if (df_count.iloc[i, j][0] < 50 and df_count.iloc[i, j][1] < 50) 
                else '#ffffff'), row=i+1, col=j+1)

    # remove background color
    fig.update_layout(plot_bgcolor='white')

    # remove legend and x axes labels
    fig.update_layout(showlegend=False)
    fig.update_xaxes(showticklabels=False)
    fig.update_yaxes(showticklabels=False)
        
    return fig

In [21]:
min_performance_all = []
max_performance_all = []

for key_model in df_model_dict:
    # retrun min axn max overall perfromance
    min = df_model_dict[key_model]['overall_performance'].min()
    max = df_model_dict[key_model]['overall_performance'].max()
    # add to list
    min_performance_all.append(min)
    max_performance_all.append(max)
# convert to numpy array
min_performance_all = np.array(min_performance_all)
max_performance_all = np.array(max_performance_all)
# return min value from min performance list and max value from max performance list
min_performance = min_performance_all.min()
# round down min value to closest ten
min_performance= math.floor(min_performance/10)*10
max_performance = max_performance_all.max()
# round up max value to closest ten
max_performance= math.ceil(max_performance/10)*10

In [89]:

def plottingUMAP(df_model, df_model_meta, df_prob, df_prob_meta):
    # concatinate df_model and df_model_meta
    df_model_all = pd.concat([df_model, df_model_meta], axis=0)
    df_model_all = df_model_all.astype({'model_id': 'str', 'algorithm_id': 'int64', 'accuracy': 'float64', 'precision': 'float64', 
                                'recall': 'float64', 'roc_auc_score': 'float64', 'geometric_mean_score': 'float64', 
                                'matthews_corrcoef': 'float64', 'f1_weighted': 'float64', 'log_loss': 'float64', 
                                'overall_performance': 'float64', 'average_probability': 'float64'})
    # scale avarage probability in scale 0 to 1
    df_model_all['average_probability_norm'] = np.round(minmax_scale(df_model_all['average_probability'], feature_range=(0.2, 1)), 1)

    # create new column "size", set to 2 for rows with "meta" in "model_id", else 1
    df_model_all['size'] = np.where(df_model_all['model_id'].str.contains('meta'), 2, 1)
    # create new column for text of points
    df_model_all['text'] = df_model_all['algorithm_name'] + '<br>' + 'Performance: ' + df_model_all['overall_performance'].astype(str) + '%' + '<br>' + 'Model ID: ' + df_model_all['model_id'].astype(str) + '<br>' + 'Accuracy: ' + df_model_all['accuracy'].astype(str) + '%' + '<br>' + 'Precision: ' + df_model_all['precision'].astype(str) + '%' + '<br>' + 'Recall: ' + df_model_all['recall'].astype(str) + '%' + '<br>' + 'ROC AUC: ' + df_model_all['roc_auc_score'].astype(str) + '<br>' + 'Geometric Mean: ' + df_model_all['geometric_mean_score'].astype(str) + '<br>' + 'Matthews Correlation: ' + df_model_all['matthews_corrcoef'].astype(str) + '<br>' + 'F1: ' + df_model_all['f1_weighted'].astype(str) + '<br>' + 'Average Probability: ' + df_model_all['average_probability'].astype(str)
    # add metamodel  to text for metamodels
    df_model_all['text'] = np.where(df_model_all['model_id'].str.contains('meta'), 'MetaModel' + '<br>' + df_model_all['text'], 'Base Model' + '<br>' + df_model_all['text'])

    # set df_prob_meta columns as df_prob columns
    df_prob_meta.columns = df_prob.columns

    # concatinate df_prob and df_prob_meta
    df_prob_all = pd.concat([df_prob, df_prob_meta], axis=0)

    ######################################################################################################################

    ### UMAP dimension reduction algorithm

    # Add umap_prob to df_model_all and rename columns to 'UMAP_1_prob' and 'UMAP_2_prob' for different hyperparameter of number of neighbors

    # define number of neighbors
    neighb = 4

    for i in range(3, 3 + neighb):
        umap_prob = umap_model(df_prob_all, parameter_umap_n_neighbors = i)
        df_model_all = pd.concat([df_model_all, umap_prob], axis=1)
        df_model_all.rename(columns={'UMAP_1': f'UMAP_1_prob_{i}', 'UMAP_2': f'UMAP_2_prob_{i}'}, inplace=True)

    fig = go.Figure()

    # define subplot size
    fig.update_layout(width=1000, height=800)

    # convert symbols to dictionary with keys from 1 to 11
    symbols_dict = dict(zip(range(1, 12), symbols))

    # Plot UMAP, add hovertext and symbols, define colorscale by performance, add title
    for i in range(3, 3 + neighb):
        for key in algos.keys():
            df_model_red = df_model_all[df_model_all['algorithm_id'] == key]
            fig.add_trace(go.Scatter(x=df_model_red[f'UMAP_1_prob_{i}'], y=df_model_red[f'UMAP_2_prob_{i}'], mode='markers', hovertext=df_model_red['text'], 
                        marker=dict(size=df_model_red['size']*20, symbol = df_model_red['algorithm_id'].map(symbols_dict),
                        opacity = df_model_red['average_probability_norm'], line=dict(width=df_model_red['size'], color='Black'),
                        color=df_model_red['overall_performance'], coloraxis='coloraxis'), name = algos[key]))

    # add new trace with algorithm name and corresponfing symbol, place it inside the plot
    for key in algos.keys():
        fig.add_trace(go.Scatter(x=[0], y=[0], mode='markers', marker=dict(size=0, symbol = key, opacity = 0, line=dict(width=0, color='Black'), color='Black'), name = algos[key]))

    # show symbols for each algorithm in the legend

    
    fig.update_layout(paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)', margin=dict(l=0, r=0, t=0, b=0))

    # remove axes labels
    fig.update_xaxes(showticklabels=False)
    fig.update_yaxes(showticklabels=False)

    fig.update_traces(selector=dict(mode='markers'))
    # define symbols for markers based on symbols defined
    fig.update_layout(hoverlabel=dict(bgcolor="white", font_size=14, font_family="Rockwell"), hovermode='closest')
    # add tooltip
    fig.update_layout(coloraxis=dict(colorscale='Viridis'), showlegend=False)
    # add title to colorbar
    fig.update_layout(coloraxis_colorbar=dict(title='Metric-Based Performance', titleside='right'))

    min_perf = math.floor(df_model_all['overall_performance'].min()/10)*10
    max_perf = math.ceil(df_model_all['overall_performance'].max()/10)*10

    # set min and max value for legend as min and max value of overall_performance
    fig.update_layout(coloraxis=dict(cmin=min_perf, cmax=max_perf))

    return fig

In [90]:
fig = plottingUMAP(df_model_dict['df_model_cluster_1'], df_model_dict_meta['df_model_cluster_1_meta'], df_prob_dict['df_prob_cluster_1'], df_prob_dict_meta['df_prob_cluster_1_meta'])
fig.update_layout(width=1000, height=500)
# add title to the figure
fig.update_layout(title_text=f'{key_model}')

# save fig to json file
fig.write_json('test_umap.json')


In [91]:
import plotly
# read figure from json file
fig = go.Figure(plotly.io.read_json('test_umap.json'))

fig.show()

In [23]:
for key_model, key_model_meta, key_prob, key_prob_meta in zip(df_model_dict, df_model_dict_meta, df_prob_dict, df_prob_dict_meta):
    fig = plottingUMAP(df_model_dict[key_model], df_model_dict_meta[key_model_meta], df_prob_dict[key_prob], df_prob_dict_meta[key_prob_meta])
    fig.update_layout(width=1000, height=800)
    # add title to the figure
    fig.update_layout(title_text=f'{key_model}')
    fig.show()

### Scatter plot matrix with Correlation

Probability average (deviation difference) for model probabilities for all models – confidence levels <br>
Following approach presented: <br>
- get probabilities for all 11 metamodels
- then comparing 2 metamodels, find the one with better probability of correct class for each instance
- add this value as top probability for each instance and calculate the mean value for all instances for both models

We will operate further only with prediction results, which are not the same for the all 11 meta models; the main idea is to investigate the differencies and combinations of different metamodels. That can be cosidered as feature engineering to df_pred_meta dataframe (dataframe with all 11 meta models predictions per instance) to keep only the columns, which are not the same for all 11 meta models.

In [24]:
def plotting_comparison(df_model_meta, df_prob_meta, algo):

    algo_cap = [i.upper() for i in algo].copy()


    df_prob_meta_t = df_prob_meta.transpose()
    # rename columns to correspond to algorithm names based on perfromance metrics
    df_prob_meta_t.columns = [i for i in df_model_meta.model_id.unique()]

    # total number of predictions
    n_total = df_prob_meta_t.shape[0]

    # create empty list
    prob = []

    df_prob_meta_cor = pd.DataFrame(index=df_prob_meta_t.columns, columns=df_prob_meta_t.columns)

    for i in range(11):
        for j in range(11):
            if i != j:
                prob = []
                # calculate the probability of two models predicting the correct result
                for n in range(n_total):
                    if df_prob_meta_t.iloc[n, i] > df_prob_meta_t.iloc[n, j]:
                        # append df_prob_meta_t.iloc[n, i] to prob
                        prob.append(df_prob_meta_t.iloc[n, i])
                    else:
                        prob.append(df_prob_meta_t.iloc[n, j])
                prob_average = np.mean(prob).round(2)
                # add 2 models average probability to df_prob_meta_cor
                df_prob_meta_cor.loc[f'meta_{i+1}', f'meta_{j+1}'] = prob_average

    df_prob_meta_red = df_prob_meta_t[df_prob_meta_t.apply(lambda x: x.min() < 50, axis=1)]
    # reset index for df_pred_meta_red and df_prob_meta_red
    df_prob_meta_red.reset_index(drop=True, inplace=True)
    # add new column to store the average probability for each instance
    df_prob_meta_red['average_probability'] = round(df_prob_meta_red.iloc[:,1:].mean(axis=1), 2) 
    # sort by average_probability_norm
    df_prob_meta_red.sort_values(by='average_probability', ascending=False, inplace=True)
    # drop avergae_probability_norm column
    df_prob_meta_red.drop(columns=['average_probability'], inplace=True)

    # create new dataframe with rows and columns like columns in df_pred_meta_t
    df_pred_meta_cor = pd.DataFrame(index=df_prob_meta_t.columns, columns=df_prob_meta_t.columns)
  
    for i in range(11):
        for j in range(11):
            if i != j:
                # calculate how good two models can contribute to the prediction result
                result = df_prob_meta_t.apply(lambda x: (x[f'meta_{i+1}'] >= 50) or (x[f'meta_{j+1}'] >= 50), axis=1)
                # caclucate the number of wrong predictions for both models
                n_wrong = result[result == False].shape[0]
                # calculate the percentage of wrong predictions in total number of predictions
                perc_correct = round((n_total - n_wrong) / n_total, 4) *100
                # save the percentage in the dataframe
                df_pred_meta_cor.loc[f'meta_{i+1}', f'meta_{j+1}'] = perc_correct
    
    # As Plotly does not support the subplot plotting inside another subplot, we have to save each subplot as a 
    # picture and then load it in the overall plot.

    # create subfolder pictures under path
    path_pictures_prob = path + './pictures/prob/'
    if not os.path.exists(path_pictures_prob):
        os.makedirs(path_pictures_prob)
    else:
        shutil.rmtree(path_pictures_prob)
        os.makedirs(path_pictures_prob)

    for i in range(1, 12):
        for j in range(1, 12):
            if i > j:
                meta_1 = f'meta_{i}'
                meta_2 = f'meta_{j}'
                fig = coverage(df_prob_meta_red, meta_1, meta_2)
                # save figure
                # save fig to json file
                fig.write_json(file = path_pictures_prob + f'prob_meta_{i}_meta_{j}.json')
                # fig.write_image(file = path_pictures_prob + f'prob_meta_{i}_meta_{j}.webp')

    fig = go.Figure()
    fig = make_subplots(rows=12, cols=12, vertical_spacing=0.01, horizontal_spacing=0.01)

    # define subplot size
    fig.update_layout(width=width, height=height)

    df_model_meta_metr = df_model_meta[['accuracy', 'precision', 'recall', 'roc_auc_score', 'geometric_mean_score', 'matthews_corrcoef', 'f1_weighted']]

    # define colors for average probabilities
    df_model_meta_color = pd.DataFrame()
    df_model_meta_color['prob'] = df_model_meta.average_probability
    # standarize df_test.prob from 0 to 1
    df_model_meta_color['prob_norm'] = minmax_scale(df_model_meta_color['prob'], feature_range=(0,1))
    # convert df_test.prob_norm to colors
    df_model_meta_color['color'] = df_model_meta_color['prob_norm'] * (-225) +225
    # convert to int
    df_model_meta_color['color'] = df_model_meta_color['color'].astype(int)
    # convert to hex
    df_model_meta_color['color_hex'] = df_model_meta_color['color'].apply(lambda x: '#%02x%02x%02x' % (x, x, x))

    ############################################################

    # add subplots for algorithm contribution to end result

    for i in range(11):
        for j in range(11):
            if i < j:

                if df_pred_meta_cor.loc[f'meta_{i+1}', f'meta_{j+1}'] > df_prob_meta_cor.loc[f'meta_{i+1}', f'meta_{j+1}']:
                    color='#9970ab'
                else:
                    color='#2ca25f'

                fig.add_trace(go.Indicator(
                    mode = 'gauge+number+delta',
                    value =  df_pred_meta_cor.loc[f'meta_{i+1}', f'meta_{j+1}']/100,
                    # add percentage sign to number and decimals to number
                    number = {'valueformat':'.1%', 'font': {'size': 9}},
                    delta = {'reference': (df_pred_meta_cor.loc[f'meta_{i+1}', f'meta_{j+1}'] - df_prob_meta_cor.loc[f'meta_{i+1}', f'meta_{j+1}'])/100, 
                            'increasing': {'color': color, 'symbol': ''}, 'decreasing': {'color': color, 'symbol': ''}, 'font': {'size': 9},
                            'relative': False, 'valueformat':'.1%'},
                    gauge = {
                        'axis': {'range': [0.5, 1], 'tickwidth': 1, 'tickfont': {'size': 6}, 'ticklen' : 1, 'tickvals': [0.6, 0.7, 0.8, 0.9]},
                        'bar': {'color': "#fdbf6f"},
                        'steps' : [{'range': [0.5, df_prob_meta_cor.loc[f'meta_{i+1}', f'meta_{j+1}']/100], 'color': "gray"}],
                        'threshold' : {'line': {'color': color, 'width': 4}, 
                        'thickness': 1, 
                        'value': df_prob_meta_cor.loc[f'meta_{i+1}', f'meta_{j+1}']/100}},
                    domain = {'row': i+1, 'column': j+1}))

    fig.update_layout(
        grid = {'rows': 12, 'columns': 12, 'pattern': "independent"})
                            
    ############################################################
    # add rows and columns names
    for i in range(11):
        fig.add_annotation(text=algo_cap[i], showarrow=False, font={"size":20, 'color':'#2E5984'}, row=1, col=i+2)
    for i in range(11):
        fig.add_annotation(text=algo_cap[i], showarrow=False, font={"size":20, 'color':'#cd5c5c'}, row=i+2, col=1)

    ############################################################

    # add subplots for metamodel comparison with color coding
    # define sub_plot size
    img_width = 500
    img_height = 500

    for i in range(11):
        for j in range(11):
            if i > j:
                # img = Image.open(path_pictures_prob + f'prob_meta_{i+1}_meta_{j+1}.webp')

                # # # Add invisible scatter trace.
                # # # This trace is added to help the autoresize logic work.
                # fig.add_trace(go.Scatter(x=[0, img_width], y=[0, img_height], mode="markers", marker_opacity=0), row=i+2, col=j+2)

                # # Add image
                # fig.add_layout_image(dict(source=img, x=0, y = img_height-100, sizex=img_width, sizey=img_height, xref='paper', yref='paper', 
                #                         opacity=1.0), row=i+2, col=j+2)

                fig.add_trace(go.Figure(plotly.io.read_json(path_pictures_prob + f'prob_meta_{i+1}_meta_{j+1}.json'), row=i+2, col=j+2))
                
                fig.update_layout(showlegend=False)
                fig.update_xaxes(showticklabels=False)
                fig.update_yaxes(showticklabels=False)

    ############################################################

    # add subplots for metrics in each algorithm

    # return min value from df_model_meta_metr
    y_min = df_model_meta_metr.min().min()
    # return max value from df_model_meta_metr
    y_max = df_model_meta_metr.max().max()

    # round down min_value to closest ten
    y_min = math.floor(y_min/10)*10
    # round up max_value to closest ten
    y_max = math.ceil(y_max/10)*10
    limit = y_max - y_min

    for i in range(11):

        fig.add_trace(go.Bar(x = df_model_meta_metr.columns, y = df_model_meta_metr.iloc[i], marker_color=df_model_meta_color.color_hex.iloc[i],
                    name = algo_cap[i]), row=i+2, col=i+2)
                    # reduce text size
        fig.add_annotation(text=f'Conf.: {df_model_meta.average_probability[i]}%', showarrow=False,
                    xref="x domain",yref="y domain", yshift =40, 
                    font={"size":10, 'color':'#000000'}, row=i+2, col=i+2)
        # update y axis range
        fig.update_yaxes(range=[y_min, y_max], row=i+2, col=i+2)
        # remove legend and x axes labels
        fig.update_layout(showlegend=False)

        # define ticklabels for x axis with only first letter uppercase and rotate x axis labels
        metrics_capital = df_model_meta_metr.columns.str.capitalize().str[0:1]
        # convert to list
        metrics_capital = metrics_capital.tolist()
        metrics_capital[3] = 'C' # convert to C for ROC_AUC_SCORE
        fig.update_xaxes(tickvals=df_model_meta_metr.columns, ticktext=metrics_capital, tickfont= {"size":10, 'color':'#000000'}, tickangle=0, row=i+2, col=i+2)
        # define ticklabels for y axis with values 70 to 90
        if limit <= 10:
            fig.update_yaxes(tickvals=[y_min], ticktext=[y_min], row=i+2, col=i+2)
        else:
            fig.update_yaxes(tickvals=[y_min, y_max-10], ticktext=[y_min, y_max-10], row=i+2, col=i+2)

    ############################################################

    fig.update_layout(showlegend=False)
    fig.update_xaxes(showticklabels=False)
    fig.update_yaxes(showticklabels=False)

    # remove background color
    fig.update_layout(plot_bgcolor='rgba(0,0,0,0)')


    # show xaxis and yaxis labels
    fig.update_layout(xaxis105_showticklabels=True, xaxis118_showticklabels=True, xaxis131_showticklabels=True, xaxis14_showticklabels=True,
    xaxis144_showticklabels=True, xaxis27_showticklabels=True, xaxis40_showticklabels=True, xaxis53_showticklabels=True, xaxis66_showticklabels=True,
    xaxis79_showticklabels=True, xaxis92_showticklabels=True)

    fig.update_layout(yaxis105_showticklabels=True, yaxis118_showticklabels=True, yaxis131_showticklabels=True, yaxis14_showticklabels=True,
    yaxis144_showticklabels=True, yaxis27_showticklabels=True, yaxis40_showticklabels=True, yaxis53_showticklabels=True, yaxis66_showticklabels=True,
    yaxis79_showticklabels=True, yaxis92_showticklabels=True)

    return fig

In [25]:
df_model_meta, df_prob_meta, algo_meta = df_model_dict_meta['df_model_all_meta'], df_prob_dict_meta['df_prob_all_meta'], algo_dict['algo_all']

algo_cap = [i.upper() for i in algo].copy()


df_prob_meta_t = df_prob_meta.transpose()
# rename columns to correspond to algorithm names based on perfromance metrics
df_prob_meta_t.columns = [i for i in df_model_meta.model_id.unique()]

# total number of predictions
n_total = df_prob_meta_t.shape[0]

# create empty list
prob = []

df_prob_meta_cor = pd.DataFrame(index=df_prob_meta_t.columns, columns=df_prob_meta_t.columns)

for i in range(11):
    for j in range(11):
        if i != j:
            prob = []
            # calculate the probability of two models predicting the correct result
            for n in range(n_total):
                if df_prob_meta_t.iloc[n, i] > df_prob_meta_t.iloc[n, j]:
                    # append df_prob_meta_t.iloc[n, i] to prob
                    prob.append(df_prob_meta_t.iloc[n, i])
                else:
                    prob.append(df_prob_meta_t.iloc[n, j])
            prob_average = np.mean(prob).round(2)
            # add 2 models average probability to df_prob_meta_cor
            df_prob_meta_cor.loc[f'meta_{i+1}', f'meta_{j+1}'] = prob_average

df_prob_meta_red = df_prob_meta_t[df_prob_meta_t.apply(lambda x: x.min() < 50, axis=1)]
# reset index for df_pred_meta_red and df_prob_meta_red
df_prob_meta_red.reset_index(drop=True, inplace=True)
# add new column to store the average probability for each instance
df_prob_meta_red['average_probability'] = round(df_prob_meta_red.iloc[:,1:].mean(axis=1), 2) 
# sort by average_probability_norm
df_prob_meta_red.sort_values(by='average_probability', ascending=False, inplace=True)
# drop avergae_probability_norm column
df_prob_meta_red.drop(columns=['average_probability'], inplace=True)

# create new dataframe with rows and columns like columns in df_pred_meta_t
df_pred_meta_cor = pd.DataFrame(index=df_prob_meta_t.columns, columns=df_prob_meta_t.columns)

for i in range(11):
    for j in range(11):
        if i != j:
            # calculate how good two models can contribute to the prediction result
            result = df_prob_meta_t.apply(lambda x: (x[f'meta_{i+1}'] >= 50) or (x[f'meta_{j+1}'] >= 50), axis=1)
            # caclucate the number of wrong predictions for both models
            n_wrong = result[result == False].shape[0]
            # calculate the percentage of wrong predictions in total number of predictions
            perc_correct = round((n_total - n_wrong) / n_total, 4) *100
            # save the percentage in the dataframe
            df_pred_meta_cor.loc[f'meta_{i+1}', f'meta_{j+1}'] = perc_correct

# As Plotly does not support the subplot plotting inside another subplot, we have to save each subplot as a 
# picture and then load it in the overall plot.

# create subfolder pictures under path
path_pictures_prob = path + './pictures/prob/'
if not os.path.exists(path_pictures_prob):
    os.makedirs(path_pictures_prob)
else:
    shutil.rmtree(path_pictures_prob)
    os.makedirs(path_pictures_prob)

for i in range(1, 12):
    for j in range(1, 12):
        if i > j:
            meta_1 = f'meta_{i}'
            meta_2 = f'meta_{j}'
            fig = coverage(df_prob_meta_red, meta_1, meta_2)
            # save figure
            # save fig to json file
            fig.write_json(file = path_pictures_prob + f'prob_meta_{i}_meta_{j}.json')
            # fig.write_image(file = path_pictures_prob + f'prob_meta_{i}_meta_{j}.webp')

In [101]:
fig = go.Figure()
fig = make_subplots(rows=12, cols=12, vertical_spacing=0.01, horizontal_spacing=0.01)

df_model_meta_metr = df_model_meta[['accuracy', 'precision', 'recall', 'roc_auc_score', 'geometric_mean_score', 'matthews_corrcoef', 'f1_weighted']]

# define colors for average probabilities
df_model_meta_color = pd.DataFrame()
df_model_meta_color['prob'] = df_model_meta.average_probability
# standarize df_test.prob from 0 to 1
df_model_meta_color['prob_norm'] = minmax_scale(df_model_meta_color['prob'], feature_range=(0,1))
# convert df_test.prob_norm to colors
df_model_meta_color['color'] = df_model_meta_color['prob_norm'] * (-225) +225
# convert to int
df_model_meta_color['color'] = df_model_meta_color['color'].astype(int)
# convert to hex
df_model_meta_color['color_hex'] = df_model_meta_color['color'].apply(lambda x: '#%02x%02x%02x' % (x, x, x))

############################################################

# add subplots for algorithm contribution to end result

for i in range(11):
    for j in range(11):
        if i < j:

            if df_pred_meta_cor.loc[f'meta_{i+1}', f'meta_{j+1}'] > df_prob_meta_cor.loc[f'meta_{i+1}', f'meta_{j+1}']:
                color='#9970ab'
            else:
                color='#2ca25f'

            fig.add_trace(go.Indicator(
                mode = 'gauge+number+delta',
                value =  df_pred_meta_cor.loc[f'meta_{i+1}', f'meta_{j+1}']/100,
                # add percentage sign to number and decimals to number
                number = {'valueformat':'.1%', 'font': {'size': 9}},
                delta = {'reference': (df_pred_meta_cor.loc[f'meta_{i+1}', f'meta_{j+1}'] - df_prob_meta_cor.loc[f'meta_{i+1}', f'meta_{j+1}'])/100, 
                        'increasing': {'color': color, 'symbol': ''}, 'decreasing': {'color': color, 'symbol': ''}, 'font': {'size': 9},
                        'relative': False, 'valueformat':'.1%'},
                gauge = {
                    'axis': {'range': [0.5, 1], 'tickwidth': 1, 'tickfont': {'size': 6}, 'ticklen' : 1, 'tickvals': [0.6, 0.7, 0.8, 0.9]},
                    'bar': {'color': "#fdbf6f"},
                    'steps' : [{'range': [0.5, df_prob_meta_cor.loc[f'meta_{i+1}', f'meta_{j+1}']/100], 'color': "gray"}],
                    'threshold' : {'line': {'color': color, 'width': 4}, 
                    'thickness': 1, 
                    'value': df_prob_meta_cor.loc[f'meta_{i+1}', f'meta_{j+1}']/100}},
                domain = {'row': i+1, 'column': j+1}))

fig.update_layout(
    grid = {'rows': 12, 'columns': 12, 'pattern': "independent"})
                        
############################################################
# add rows and columns names
for i in range(11):
    fig.add_annotation(text=algo_cap[i], showarrow=False, font={"size":20, 'color':'#2E5984'}, row=1, col=i+2)
for i in range(11):
    fig.add_annotation(text=algo_cap[i], showarrow=False, font={"size":20, 'color':'#cd5c5c'}, row=i+2, col=1)

############################################################

# add subplots for metamodel comparison with color coding
# define sub_plot size
img_width = 500
img_height = 500

for i in range(11):
    for j in range(11):
        if i > j:
            # img = Image.open(path_pictures_prob + f'prob_meta_{i+1}_meta_{j+1}.webp')

            # # # Add invisible scatter trace.
            # # # This trace is added to help the autoresize logic work.
            # fig.add_trace(go.Scatter(x=[0, img_width], y=[0, img_height], mode="markers", marker_opacity=0), row=i+2, col=j+2)

            # # Add image
            # fig.add_layout_image(dict(source=img, x=0, y = img_height-100, sizex=img_width, sizey=img_height, xref='paper', yref='paper', 
            #                         opacity=1.0), row=i+2, col=j+2)

            fig.add_trace(go.Figure(plotly.io.read_json(path_pictures_prob + f'prob_meta_{i+1}_meta_{j+1}.json')), row=i+2, col=j+2)
            
            fig.update_layout(showlegend=False)
            fig.update_xaxes(showticklabels=False)
            fig.update_yaxes(showticklabels=False)

############################################################

# add subplots for metrics in each algorithm

# return min value from df_model_meta_metr
y_min = df_model_meta_metr.min().min()
# return max value from df_model_meta_metr
y_max = df_model_meta_metr.max().max()

# round down min_value to closest ten
y_min = math.floor(y_min/10)*10
# round up max_value to closest ten
y_max = math.ceil(y_max/10)*10
limit = y_max - y_min

for i in range(11):

    fig.add_trace(go.Bar(x = df_model_meta_metr.columns, y = df_model_meta_metr.iloc[i], marker_color=df_model_meta_color.color_hex.iloc[i],
                name = algo_cap[i]), row=i+2, col=i+2)
                # reduce text size
    fig.add_annotation(text=f'Conf.: {df_model_meta.average_probability[i]}%', showarrow=False,
                xref="x domain",yref="y domain", yshift =40, 
                font={"size":10, 'color':'#000000'}, row=i+2, col=i+2)
    # update y axis range
    fig.update_yaxes(range=[y_min, y_max], row=i+2, col=i+2)
    # remove legend and x axes labels
    fig.update_layout(showlegend=False)

    # define ticklabels for x axis with only first letter uppercase and rotate x axis labels
    metrics_capital = df_model_meta_metr.columns.str.capitalize().str[0:1]
    # convert to list
    metrics_capital = metrics_capital.tolist()
    metrics_capital[3] = 'C' # convert to C for ROC_AUC_SCORE
    fig.update_xaxes(tickvals=df_model_meta_metr.columns, ticktext=metrics_capital, tickfont= {"size":10, 'color':'#000000'}, tickangle=0, row=i+2, col=i+2)
    # define ticklabels for y axis with values 70 to 90
    if limit <= 10:
        fig.update_yaxes(tickvals=[y_min], ticktext=[y_min], row=i+2, col=i+2)
    else:
        fig.update_yaxes(tickvals=[y_min, y_max-10], ticktext=[y_min, y_max-10], row=i+2, col=i+2)

############################################################

fig.update_layout(showlegend=False)
fig.update_xaxes(showticklabels=False)
fig.update_yaxes(showticklabels=False)

# remove background color
fig.update_layout(plot_bgcolor='rgba(0,0,0,0)')


# show xaxis and yaxis labels
fig.update_layout(xaxis105_showticklabels=True, xaxis118_showticklabels=True, xaxis131_showticklabels=True, xaxis14_showticklabels=True,
xaxis144_showticklabels=True, xaxis27_showticklabels=True, xaxis40_showticklabels=True, xaxis53_showticklabels=True, xaxis66_showticklabels=True,
xaxis79_showticklabels=True, xaxis92_showticklabels=True)

fig.update_layout(yaxis105_showticklabels=True, yaxis118_showticklabels=True, yaxis131_showticklabels=True, yaxis14_showticklabels=True,
yaxis144_showticklabels=True, yaxis27_showticklabels=True, yaxis40_showticklabels=True, yaxis53_showticklabels=True, yaxis66_showticklabels=True,
yaxis79_showticklabels=True, yaxis92_showticklabels=True)

ValueError: 
    Invalid element(s) received for the 'data' property of 
        Invalid elements include: [Figure({
    'data': [],
    'layout': {'autosize': False,
               'height': 500,
               'margin': {'b': 50, 'l': 0, 'r': 0, 't': 0},
               'plot_bgcolor': 'white',
               'shapes': [{'fillcolor': '#ffffff',
                           'line': {'color': '#675c57', 'width': 2},
                           'type': 'circle',
                           'x0': 0,
                           'x1': 1,
                           'xref': 'x',
                           'y0': 0,
                           'y1': 1,
                           'yref': 'y'},
                          {'fillcolor': '#ffffff',
                           'line': {'color': '#675c57', 'width': 2},
                           'type': 'circle',
                           'x0': 0,
                           'x1': 1,
                           'xref': 'x2',
                           'y0': 0,
                           'y1': 1,
                           'yref': 'y2'},
                          {'fillcolor': '#ffffff',
                           'line': {'color': '#675c57', 'width': 2},
                           'type': 'circle',
                           'x0': 0,
                           'x1': 1,
                           'xref': 'x3',
                           'y0': 0,
                           'y1': 1,
                           'yref': 'y3'},
                          {'fillcolor': '#ffffff',
                           'line': {'color': '#675c57', 'width': 2},
                           'type': 'circle',
                           'x0': 0,
                           'x1': 1,
                           'xref': 'x4',
                           'y0': 0,
                           'y1': 1,
                           'yref': 'y4'},
                          {'fillcolor': '#ffffff',
                           'line': {'color': '#675c57', 'width': 2},
                           'type': 'circle',
                           'x0': 0,
                           'x1': 1,
                           'xref': 'x5',
                           'y0': 0,
                           'y1': 1,
                           'yref': 'y5'},
                          {'fillcolor': '#ffffff',
                           'line': {'color': '#675c57', 'width': 2},
                           'type': 'circle',
                           'x0': 0,
                           'x1': 1,
                           'xref': 'x6',
                           'y0': 0,
                           'y1': 1,
                           'yref': 'y6'},
                          {'fillcolor': '#ffffff',
                           'line': {'color': '#675c57', 'width': 2},
                           'type': 'circle',
                           'x0': 0,
                           'x1': 1,
                           'xref': 'x7',
                           'y0': 0,
                           'y1': 1,
                           'yref': 'y7'},
                          {'fillcolor': '#ffffff',
                           'line': {'color': '#675c57', 'width': 2},
                           'type': 'circle',
                           'x0': 0,
                           'x1': 1,
                           'xref': 'x8',
                           'y0': 0,
                           'y1': 1,
                           'yref': 'y8'},
                          {'fillcolor': '#ffffff',
                           'line': {'color': '#675c57', 'width': 2},
                           'type': 'circle',
                           'x0': 0,
                           'x1': 1,
                           'xref': 'x9',
                           'y0': 0,
                           'y1': 1,
                           'yref': 'y9'},
                          {'fillcolor': '#ffffff',
                           'line': {'color': '#675c57', 'width': 2},
                           'type': 'circle',
                           'x0': 0,
                           'x1': 1,
                           'xref': 'x10',
                           'y0': 0,
                           'y1': 1,
                           'yref': 'y10'},
                          {'fillcolor': '#ffffff',
                           'line': {'color': '#675c57', 'width': 2},
                           'type': 'circle',
                           'x0': 0,
                           'x1': 1,
                           'xref': 'x11',
                           'y0': 0,
                           'y1': 1,
                           'yref': 'y11'},
                          {'fillcolor': '#ffd700',
                           'line': {'color': '#ffd700', 'width': 2},
                           'type': 'circle',
                           'x0': 0,
                           'x1': 1,
                           'xref': 'x12',
                           'y0': 0,
                           'y1': 1,
                           'yref': 'y12'},
                          {'fillcolor': '#ffd700',
                           'line': {'color': '#ffd700', 'width': 2},
                           'type': 'circle',
                           'x0': 0,
                           'x1': 1,
                           'xref': 'x13',
                           'y0': 0,
                           'y1': 1,
                           'yref': 'y13'},
                          {'fillcolor': '#ffd700',
                           'line': {'color': '#ffd700', 'width': 2},
                           'type': 'circle',
                           'x0': 0,
                           'x1': 1,
                           'xref': 'x14',
                           'y0': 0,
                           'y1': 1,
                           'yref': 'y14'},
                          {'fillcolor': '#ffd700',
                           'line': {'color': '#ffd700', 'width': 2},
                           'type': 'circle',
                           'x0': 0,
                           'x1': 1,
                           'xref': 'x15',
                           'y0': 0,
                           'y1': 1,
                           'yref': 'y15'},
                          {'fillcolor': '#ffd700',
                           'line': {'color': '#ffd700', 'width': 2},
                           'type': 'circle',
                           'x0': 0,
                           'x1': 1,
                           'xref': 'x16',
                           'y0': 0,
                           'y1': 1,
                           'yref': 'y16'},
                          {'fillcolor': '#ffd700',
                           'line': {'color': '#ffd700', 'width': 2},
                           'type': 'circle',
                           'x0': 0,
                           'x1': 1,
                           'xref': 'x17',
                           'y0': 0,
                           'y1': 1,
                           'yref': 'y17'},
                          {'fillcolor': '#ffd700',
                           'line': {'color': '#ffd700', 'width': 2},
                           'type': 'circle',
                           'x0': 0,
                           'x1': 1,
                           'xref': 'x18',
                           'y0': 0,
                           'y1': 1,
                           'yref': 'y18'},
                          {'fillcolor': '#ffd700',
                           'line': {'color': '#ffd700', 'width': 2},
                           'type': 'circle',
                           'x0': 0,
                           'x1': 1,
                           'xref': 'x19',
                           'y0': 0,
                           'y1': 1,
                           'yref': 'y19'},
                          {'fillcolor': '#ffffff',
                           'line': {'color': '#ffffff', 'width': 2},
                           'type': 'circle',
                           'x0': 0,
                           'x1': 1,
                           'xref': 'x20',
                           'y0': 0,
                           'y1': 1,
                           'yref': 'y20'},
                          {'fillcolor': '#ffffff',
                           'line': {'color': '#ffffff', 'width': 2},
                           'type': 'circle',
                           'x0': 0,
                           'x1': 1,
                           'xref': 'x21',
                           'y0': 0,
                           'y1': 1,
                           'yref': 'y21'},
                          {'fillcolor': '#ffffff',
                           'line': {'color': '#ffffff', 'width': 2},
                           'type': 'circle',
                           'x0': 0,
                           'x1': 1,
                           'xref': 'x22',
                           'y0': 0,
                           'y1': 1,
                           'yref': 'y22'},
                          {'fillcolor': '#ffffff',
                           'line': {'color': '#ffffff', 'width': 2},
                           'type': 'circle',
                           'x0': 0,
                           'x1': 1,
                           'xref': 'x23',
                           'y0': 0,
                           'y1': 1,
                           'yref': 'y23'},
                          {'fillcolor': '#ffffff',
                           'line': {'color': '#ffffff', 'width': 2},
                           'type': 'circle',
                           'x0': 0,
                           'x1': 1,
                           'xref': 'x24',
                           'y0': 0,
                           'y1': 1,
                           'yref': 'y24'},
                          {'fillcolor': '#ffffff',
                           'line': {'color': '#ffffff', 'width': 2},
                           'type': 'circle',
                           'x0': 0,
                           'x1': 1,
                           'xref': 'x25',
                           'y0': 0,
                           'y1': 1,
                           'yref': 'y25'}],
               'showlegend': False,
               'template': '...',
               'width': 500,
               'xaxis': {'anchor': 'y',
                         'domain': [0.0, 0.184],
                         'matches': 'x21',
                         'range': [0, 1],
                         'showticklabels': False},
               'xaxis10': {'anchor': 'y10',
                           'domain': [0.816, 1.0],
                           'matches': 'x21',
                           'range': [0, 1],
                           'showticklabels': False},
               'xaxis11': {'anchor': 'y11',
                           'domain': [0.0, 0.184],
                           'matches': 'x21',
                           'range': [0, 1],
                           'showticklabels': False},
               'xaxis12': {'anchor': 'y12',
                           'domain': [0.204, 0.388],
                           'matches': 'x21',
                           'range': [0, 1],
                           'showticklabels': False},
               'xaxis13': {'anchor': 'y13',
                           'domain': [0.408, 0.592],
                           'matches': 'x21',
                           'range': [0, 1],
                           'showticklabels': False},
               'xaxis14': {'anchor': 'y14',
                           'domain': [0.6120000000000001, 0.796],
                           'matches': 'x21',
                           'range': [0, 1],
                           'showticklabels': False},
               'xaxis15': {'anchor': 'y15',
                           'domain': [0.816, 1.0],
                           'matches': 'x21',
                           'range': [0, 1],
                           'showticklabels': False},
               'xaxis16': {'anchor': 'y16',
                           'domain': [0.0, 0.184],
                           'matches': 'x21',
                           'range': [0, 1],
                           'showticklabels': False},
               'xaxis17': {'anchor': 'y17',
                           'domain': [0.204, 0.388],
                           'matches': 'x21',
                           'range': [0, 1],
                           'showticklabels': False},
               'xaxis18': {'anchor': 'y18',
                           'domain': [0.408, 0.592],
                           'matches': 'x21',
                           'range': [0, 1],
                           'showticklabels': False},
               'xaxis19': {'anchor': 'y19',
                           'domain': [0.6120000000000001, 0.796],
                           'matches': 'x21',
                           'range': [0, 1],
                           'showticklabels': False},
               'xaxis2': {'anchor': 'y2',
                          'domain': [0.204, 0.388],
                          'matches': 'x21',
                          'range': [0, 1],
                          'showticklabels': False},
               'xaxis20': {'anchor': 'y20',
                           'domain': [0.816, 1.0],
                           'matches': 'x21',
                           'range': [0, 1],
                           'showticklabels': False},
               'xaxis21': {'anchor': 'y21', 'domain': [0.0, 0.184], 'range': [0, 1], 'showticklabels': False},
               'xaxis22': {'anchor': 'y22',
                           'domain': [0.204, 0.388],
                           'matches': 'x21',
                           'range': [0, 1],
                           'showticklabels': False},
               'xaxis23': {'anchor': 'y23',
                           'domain': [0.408, 0.592],
                           'matches': 'x21',
                           'range': [0, 1],
                           'showticklabels': False},
               'xaxis24': {'anchor': 'y24',
                           'domain': [0.6120000000000001, 0.796],
                           'matches': 'x21',
                           'range': [0, 1],
                           'showticklabels': False},
               'xaxis25': {'anchor': 'y25',
                           'domain': [0.816, 1.0],
                           'matches': 'x21',
                           'range': [0, 1],
                           'showticklabels': False},
               'xaxis3': {'anchor': 'y3',
                          'domain': [0.408, 0.592],
                          'matches': 'x21',
                          'range': [0, 1],
                          'showticklabels': False},
               'xaxis4': {'anchor': 'y4',
                          'domain': [0.6120000000000001, 0.796],
                          'matches': 'x21',
                          'range': [0, 1],
                          'showticklabels': False},
               'xaxis5': {'anchor': 'y5',
                          'domain': [0.816, 1.0],
                          'matches': 'x21',
                          'range': [0, 1],
                          'showticklabels': False},
               'xaxis6': {'anchor': 'y6',
                          'domain': [0.0, 0.184],
                          'matches': 'x21',
                          'range': [0, 1],
                          'showticklabels': False},
               'xaxis7': {'anchor': 'y7',
                          'domain': [0.204, 0.388],
                          'matches': 'x21',
                          'range': [0, 1],
                          'showticklabels': False},
               'xaxis8': {'anchor': 'y8',
                          'domain': [0.408, 0.592],
                          'matches': 'x21',
                          'range': [0, 1],
                          'showticklabels': False},
               'xaxis9': {'anchor': 'y9',
                          'domain': [0.6120000000000001, 0.796],
                          'matches': 'x21',
                          'range': [0, 1],
                          'showticklabels': False},
               'yaxis': {'anchor': 'x',
                         'domain': [0.816, 1.0],
                         'matches': 'y21',
                         'range': [0, 1],
                         'showticklabels': False},
               'yaxis10': {'anchor': 'x10',
                           'domain': [0.6120000000000001, 0.796],
                           'matches': 'y21',
                           'range': [0, 1],
                           'showticklabels': False},
               'yaxis11': {'anchor': 'x11',
                           'domain': [0.408, 0.592],
                           'matches': 'y21',
                           'range': [0, 1],
                           'showticklabels': False},
               'yaxis12': {'anchor': 'x12',
                           'domain': [0.408, 0.592],
                           'matches': 'y21',
                           'range': [0, 1],
                           'showticklabels': False},
               'yaxis13': {'anchor': 'x13',
                           'domain': [0.408, 0.592],
                           'matches': 'y21',
                           'range': [0, 1],
                           'showticklabels': False},
               'yaxis14': {'anchor': 'x14',
                           'domain': [0.408, 0.592],
                           'matches': 'y21',
                           'range': [0, 1],
                           'showticklabels': False},
               'yaxis15': {'anchor': 'x15',
                           'domain': [0.408, 0.592],
                           'matches': 'y21',
                           'range': [0, 1],
                           'showticklabels': False},
               'yaxis16': {'anchor': 'x16',
                           'domain': [0.204, 0.388],
                           'matches': 'y21',
                           'range': [0, 1],
                           'showticklabels': False},
               'yaxis17': {'anchor': 'x17',
                           'domain': [0.204, 0.388],
                           'matches': 'y21',
                           'range': [0, 1],
                           'showticklabels': False},
               'yaxis18': {'anchor': 'x18',
                           'domain': [0.204, 0.388],
                           'matches': 'y21',
                           'range': [0, 1],
                           'showticklabels': False},
               'yaxis19': {'anchor': 'x19',
                           'domain': [0.204, 0.388],
                           'matches': 'y21',
                           'range': [0, 1],
                           'showticklabels': False},
               'yaxis2': {'anchor': 'x2',
                          'domain': [0.816, 1.0],
                          'matches': 'y21',
                          'range': [0, 1],
                          'showticklabels': False},
               'yaxis20': {'anchor': 'x20',
                           'domain': [0.204, 0.388],
                           'matches': 'y21',
                           'range': [0, 1],
                           'showticklabels': False},
               'yaxis21': {'anchor': 'x21', 'domain': [0.0, 0.184], 'range': [0, 1], 'showticklabels': False},
               'yaxis22': {'anchor': 'x22',
                           'domain': [0.0, 0.184],
                           'matches': 'y21',
                           'range': [0, 1],
                           'showticklabels': False},
               'yaxis23': {'anchor': 'x23',
                           'domain': [0.0, 0.184],
                           'matches': 'y21',
                           'range': [0, 1],
                           'showticklabels': False},
               'yaxis24': {'anchor': 'x24',
                           'domain': [0.0, 0.184],
                           'matches': 'y21',
                           'range': [0, 1],
                           'showticklabels': False},
               'yaxis25': {'anchor': 'x25',
                           'domain': [0.0, 0.184],
                           'matches': 'y21',
                           'range': [0, 1],
                           'showticklabels': False},
               'yaxis3': {'anchor': 'x3',
                          'domain': [0.816, 1.0],
                          'matches': 'y21',
                          'range': [0, 1],
                          'showticklabels': False},
               'yaxis4': {'anchor': 'x4',
                          'domain': [0.816, 1.0],
                          'matches': 'y21',
                          'range': [0, 1],
                          'showticklabels': False},
               'yaxis5': {'anchor': 'x5',
                          'domain': [0.816, 1.0],
                          'matches': 'y21',
                          'range': [0, 1],
                          'showticklabels': False},
               'yaxis6': {'anchor': 'x6',
                          'domain': [0.6120000000000001, 0.796],
                          'matches': 'y21',
                          'range': [0, 1],
                          'showticklabels': False},
               'yaxis7': {'anchor': 'x7',
                          'domain': [0.6120000000000001, 0.796],
                          'matches': 'y21',
                          'range': [0, 1],
                          'showticklabels': False},
               'yaxis8': {'anchor': 'x8',
                          'domain': [0.6120000000000001, 0.796],
                          'matches': 'y21',
                          'range': [0, 1],
                          'showticklabels': False},
               'yaxis9': {'anchor': 'x9',
                          'domain': [0.6120000000000001, 0.796],
                          'matches': 'y21',
                          'range': [0, 1],
                          'showticklabels': False}}
})]

    The 'data' property is a tuple of trace instances
    that may be specified as:
      - A list or tuple of trace instances
        (e.g. [Scatter(...), Bar(...)])
      - A single trace instance
        (e.g. Scatter(...), Bar(...), etc.)
      - A list or tuple of dicts of string/value properties where:
        - The 'type' property specifies the trace type
            One of: ['bar', 'barpolar', 'box', 'candlestick',
                     'carpet', 'choropleth', 'choroplethmapbox',
                     'cone', 'contour', 'contourcarpet',
                     'densitymapbox', 'funnel', 'funnelarea',
                     'heatmap', 'heatmapgl', 'histogram',
                     'histogram2d', 'histogram2dcontour', 'icicle',
                     'image', 'indicator', 'isosurface', 'mesh3d',
                     'ohlc', 'parcats', 'parcoords', 'pie',
                     'pointcloud', 'sankey', 'scatter',
                     'scatter3d', 'scattercarpet', 'scattergeo',
                     'scattergl', 'scattermapbox', 'scatterpolar',
                     'scatterpolargl', 'scattersmith',
                     'scatterternary', 'splom', 'streamtube',
                     'sunburst', 'surface', 'table', 'treemap',
                     'violin', 'volume', 'waterfall']

        - All remaining properties are passed to the constructor of
          the specified trace type

        (e.g. [{'type': 'scatter', ...}, {'type': 'bar, ...}])

In [102]:
for key_model_meta, key_prob_meta, key_dict in zip(df_model_dict_meta, df_prob_dict_meta, algo_dict):
    fig = plotting_comparison(df_model_dict_meta[key_model_meta], df_prob_dict_meta[key_prob_meta], algo_dict[key_dict])
    # add title to the figure
    fig.update_layout(title_text=f'{key_model_meta}')
    fig.show()


TypeError: 'numpy.float64' object is not callable