### Initial Data exploration and wrangling

##### Modul import

In [20]:
import pandas as pd
import umap
from sklearn import preprocessing
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
import ast
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, matthews_corrcoef, f1_score, log_loss
from sklearn import metrics
from imblearn.metrics import geometric_mean_score
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# define random seed
np.random.seed(42)


#### Dataset import and wrangling

In [21]:
path = r'D:\github\2dv50e\Data\1. Heart Disease'
df_prob = pd.read_csv(path + r'\topModelsProbabilities.csv')
df_mod = pd.read_csv(path + r'\topModels.csv')
# add column target to df model from target.csv
df_target = pd.read_csv(path + r'\target.csv')
y_train = df_target['class']
target = df_target['class'].tolist()
df_source = pd.read_csv(path + r'\dataset.csv')

algo_nr = df_mod['algorithm_id']

In [22]:
# Print number of missing values per column in df_probabilities
for col in df_prob.columns:
    if df_prob[col].isnull().sum() > 0:
        print(f'{col} has {df_prob[col].isnull().sum()} missing values.')

In [23]:
# Apply scaler to df_source
scaler = preprocessing.StandardScaler()
df_source_scaled = pd.DataFrame(scaler.fit_transform(df_source), columns=df_source.columns)
x_train = df_source_scaled.copy()

In [24]:
algos = {1:'K-Nearest Neighbor', 2:'Support Vector Machine', 3:'Gaussian Naive Bayes', 4:'Multilayer Perceptron', 5:'Logistic Regression',
        6:'Linear Discriminant Analysis', 7:'Quadratic Discriminant Analysis', 8:'Random Forest', 9:'Extra Trees', 10:'Adaptive Boosting',
        11:'Gradient Boosting'}

In [25]:
df_model = df_mod.copy()
# rename columns in df_model
df_model.rename(columns={'params': 'hyperparameters', 'mean_test_accuracy': 'accuracy', 'mean_test_precision_weighted': 'precision', 
                        'mean_test_recall_weighted': 'recall', 'mean_test_roc_auc_ovo_weighted': 'roc_auc_score',
                        'geometric_mean_score_weighted': 'geometric_mean_score'}, inplace=True)
# remove overall perfromance column
df_model.drop(columns=['overall_performance'], inplace=True)
df_model['algorithm_name'] = df_model['algorithm_id'].map(algos)
df_model['overall_performance'] = round((df_model['accuracy'] + df_model['precision'] + df_model['recall'] + df_model['roc_auc_score'] + \
                                df_model['geometric_mean_score'] + df_model['matthews_corrcoef'] + df_model['f1_weighted']) / 7, 2)
# Sort columns
df_model = df_model[['model_id', 'algorithm_id', 'algorithm_name', 'accuracy', 'precision', 'recall', 'roc_auc_score',
                        'geometric_mean_score', 'matthews_corrcoef', 'f1_weighted', 'log_loss', 'overall_performance', 'hyperparameters']]



Function to extract hyperparameters from the best performing model per algorithm

In [26]:
def best_params(df):
      # Select hyperparameters for best model in each algorithm
      df_best = df.groupby('algorithm_id').apply(lambda x: x.sort_values('overall_performance', ascending=False).iloc[0])
      # reset algorithm_nr as  index
      df_best = df_best.reset_index(drop=True)
      # keep only algorithm number, name, performance and hyperparameters
      df_best = df_best[['algorithm_id', 'algorithm_name', 'overall_performance', 'hyperparameters']]
      # rename overall_performance as performance
      df_best.rename(columns={'overall_performance': 'performance'}, inplace=True)
      return df_best

Supporting functions to extract hyperparameters values

In [27]:
# Return hyperparameters based on algorithm name
def get_hyperparameters(df, algorithm_name):
    return df[df['algorithm_name'] == algorithm_name]['hyperparameters']

# return value of key in dictionary
def get_value(dictionary, key):
    return dictionary[key]

# convert string to dictionary
def string_to_dict(string):
    return ast.literal_eval(string)


Base layer estimators (top 5 per algorithm)

In [28]:
# dictionaries with algorithm names and their hyperparameters

knn_params = {int(k):v for k,v in get_hyperparameters(df_model, 'K-Nearest Neighbor').items()}
svm_params = {int(k):v for k,v in get_hyperparameters(df_model, 'Support Vector Machine').items()}
gnb_params = {int(k):v for k,v in get_hyperparameters(df_model, 'Gaussian Naive Bayes').items()}
mlp_params = {int(k):v for k,v in get_hyperparameters(df_model, 'Multilayer Perceptron').items()}
lr_params = {int(k):v for k,v in get_hyperparameters(df_model, 'Logistic Regression').items()}
lda_params = {int(k):v for k,v in get_hyperparameters(df_model, 'Linear Discriminant Analysis').items()}
qda_params = {int(k):v for k,v in get_hyperparameters(df_model, 'Quadratic Discriminant Analysis').items()}
rf_params = {int(k):v for k,v in get_hyperparameters(df_model, 'Random Forest').items()}
et_params = {int(k):v for k,v in get_hyperparameters(df_model, 'Extra Trees').items()}
ab_params = {int(k):v for k,v in get_hyperparameters(df_model, 'Adaptive Boosting').items()}
gb_params = {int(k):v for k,v in get_hyperparameters(df_model, 'Gradient Boosting').items()}

In [29]:
# Estimators and hyperparameters for each algorithm
estimators = []
# populate list of estimators with all 55 top models
count = 0
for i in list(knn_params.keys()):
                temp_dict = ast.literal_eval(knn_params[i])
                estimators.append((f'knn_{count}', KNeighborsClassifier(algorithm=get_value(temp_dict, 'algorithm'), 
                                metric=get_value(temp_dict, 'metric'), n_neighbors= get_value(temp_dict, 'n_neighbors'), 
                                weights=get_value(temp_dict, 'weights'), n_jobs=-1)))
                count += 1
count = 0
for i in list(svm_params.keys()):
                temp_dict = ast.literal_eval(svm_params[i])
                estimators.append((f'svm_{count}', SVC(C=get_value(temp_dict, 'C'), kernel=get_value(temp_dict, 'kernel'),probability=True, random_state=42)))
                count += 1
count = 0
for i in list(gnb_params.keys()):
                temp_dict = ast.literal_eval(gnb_params[i])
                estimators.append((f'gnb_{count}', GaussianNB(var_smoothing=get_value(temp_dict, 'var_smoothing'))))
                count += 1
count = 0
for i in list(mlp_params.keys()):
                temp_dict = ast.literal_eval(mlp_params[i])
                estimators.append((f'mlp_{count}', MLPClassifier(activation=get_value(temp_dict, 'activation'), 
                                alpha=get_value(temp_dict, 'alpha'),
                                max_iter=get_value(temp_dict, 'max_iter'), solver=get_value(temp_dict, 'solver'), 
                                tol=get_value(temp_dict, 'tol'), random_state=42)))
                count += 1
count = 0
for i in list(lr_params.keys()):
                temp_dict = ast.literal_eval(lr_params[i])
                estimators.append((f'lr_{count}', LogisticRegression(C=get_value(temp_dict, 'C'), max_iter=get_value(temp_dict, 'max_iter'), penalty=get_value(temp_dict, 'penalty'), solver=get_value(temp_dict, 'solver'), random_state=42, n_jobs=-1)))
                count += 1
count = 0
for i in list(lda_params.keys()):
                temp_dict = ast.literal_eval(lda_params[i])
                estimators.append((f'lda_{count}', LinearDiscriminantAnalysis(shrinkage=get_value(temp_dict, 'shrinkage'), 
                                solver=get_value(temp_dict, 'solver'))))
                count += 1
count = 0
for i in list(qda_params.keys()):
                temp_dict = ast.literal_eval(qda_params[i])
                estimators.append((f'qda_{count}', QuadraticDiscriminantAnalysis(reg_param=get_value(temp_dict, 'reg_param'), tol=get_value(temp_dict, 'tol'))))
                count += 1
count = 0
for i in list(rf_params.keys()):
                temp_dict = ast.literal_eval(rf_params[i])
                estimators.append((f'rf_{count}', RandomForestClassifier(criterion=get_value(temp_dict, 'criterion'), 
                                n_estimators=get_value(temp_dict, 'n_estimators'), random_state=42, n_jobs=-1)))
                count += 1
count = 0
for i in list(et_params.keys()):
                temp_dict = ast.literal_eval(et_params[i])
                estimators.append((f'et_{count}', ExtraTreesClassifier(criterion=get_value(temp_dict, 'criterion'),
                                n_estimators=get_value(temp_dict, 'n_estimators'), random_state=42, n_jobs=-1)))
                count += 1
count = 0
for i in list(ab_params.keys()):
                temp_dict = ast.literal_eval(ab_params[i])
                estimators.append((f'ab_{count}', AdaBoostClassifier(algorithm=get_value(temp_dict, 'algorithm'), 
                                learning_rate=get_value(temp_dict, 'learning_rate'), n_estimators=get_value(temp_dict, 'n_estimators'), random_state=42)))
                count += 1
count = 0
for i in list(gb_params.keys()):
                temp_dict = ast.literal_eval(gb_params[i])
                estimators.append((f'gb_{count}', GradientBoostingClassifier(criterion=get_value(temp_dict, 'criterion'), 
                                learning_rate=get_value(temp_dict, 'learning_rate'), n_estimators=get_value(temp_dict, 'n_estimators'), random_state=42)))
                count += 1

##### Metamodel estimators (top 1 per algorithm)
Hyperparameters will be extracted from the best performing model per algorithm. <br> The plan is to extract hyperparameters from the best performing model per algorithm and then apply these hyperparameters to the metamodels. Currently no functuonality to be added to allow end user to tune the hyperparameters for metamodels, instead they will rely on top performing ones from base layer models.

In [30]:
# Extract the top models from the dataframe with the respective hyperparameters
meta_params = best_params(df_model)

# return unique values of algorithm name
algo = ['knn', 'svm', 'gnb', 'mlp', 'lr', 'lda', 'qda', 'rf', 'et', 'ab', 'gb']
algo_names = meta_params['algorithm_name']

# iterate through key and value pairs in algo
i = 0
for k,v in zip(algo, algo_names):
    # Define hyperparameters for each  final estimator, based on top performing model from base layer
    temp = get_hyperparameters(meta_params, v)
    # convert to dictionary for easy access
    temp_dict = {int(k):v for k,v in temp.items()}
    # return first value
    temp_dict = temp_dict[i]
    # convert to dictionary for easy access, assign varibale name per algorithm
    globals()[k + '_params'] = ast.literal_eval(temp_dict)
    i += 1

In [31]:
# Add final estimators
final_estimators = [('knn', KNeighborsClassifier(algorithm=get_value(knn_params, 'algorithm'), metric=get_value(knn_params, 'metric'), 
                n_neighbors= get_value(knn_params, 'n_neighbors'), weights=get_value(knn_params, 'weights'), n_jobs=-1)),
                ('svm', SVC(C=get_value(svm_params, 'C'), kernel=get_value(svm_params, 'kernel'),probability=True, random_state=42)),
                ('gnb', GaussianNB(var_smoothing=get_value(gnb_params, 'var_smoothing'))),
                ('mlp', MLPClassifier(activation=get_value(mlp_params, 'activation'), alpha=get_value(mlp_params, 'alpha'), 
                max_iter=get_value(mlp_params, 'max_iter'), solver=get_value(mlp_params, 'solver'), tol=get_value(mlp_params, 'tol'), 
                random_state=42)),
                ('lr', LogisticRegression(C=get_value(lr_params, 'C'), max_iter=get_value(lr_params, 'max_iter'),
                penalty=get_value(lr_params, 'penalty'), solver=get_value(lr_params, 'solver'), random_state=42, n_jobs=-1)),
                ('lda', LinearDiscriminantAnalysis(shrinkage=get_value(lda_params, 'shrinkage'), solver=get_value(lda_params, 'solver'))),
                ('qda', QuadraticDiscriminantAnalysis(reg_param=get_value(qda_params, 'reg_param'), tol=get_value(qda_params, 'tol'))),
                ('rf', RandomForestClassifier(criterion=get_value(rf_params, 'criterion'), n_estimators=get_value(rf_params, 'n_estimators'), 
                random_state=42, n_jobs=-1)),
                ('et', ExtraTreesClassifier(criterion=get_value(et_params, 'criterion'), n_estimators=get_value(et_params, 'n_estimators'), 
                random_state=42, n_jobs=-1)),
                ('ab', AdaBoostClassifier(algorithm=get_value(ab_params, 'algorithm'), learning_rate=get_value(ab_params, 'learning_rate'), 
                n_estimators=get_value(ab_params, 'n_estimators'), random_state=42)),
                ('gb', GradientBoostingClassifier(criterion=get_value(gb_params, 'criterion'), 
                learning_rate=get_value(gb_params, 'learning_rate'), n_estimators=get_value(gb_params, 'n_estimators'), random_state=42))
                ]

Please note that for the average metric I did not include the logloss as using normalized version of logloss introduces bias to dataset

In [32]:
# create dataframe for df_model_meta with columns names from df_model
df_model_meta = pd.DataFrame(columns=df_model.columns)

# create dataframes for meta models probabilities and predicted values
df_pred_meta = pd.DataFrame()
df_prob_meta = pd.DataFrame()

for x in range(0, len(final_estimators)):
        final_estimator = final_estimators[x][1]
        clf = StackingClassifier(estimators=estimators, final_estimator=final_estimator, n_jobs=-1, cv=5)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_train)
        y_pred = pd.Series(y_pred)

        # probabilities dataframe
        y_pred_prob = clf.predict_proba(x_train)
        y_pred_prob_df = pd.DataFrame(y_pred_prob, columns=['prob_0', 'prob_1'])
        y_pred_prob_df['target'] = y_train
        y_pred_prob_df['predicted'] = y_pred
        # create new column, set value to prob_0 if target is 0 and prob_1 if target is 1
        y_pred_prob_df['pred_prob'] = np.where(y_pred_prob_df['target'] == 0, y_pred_prob_df['prob_0'], y_pred_prob_df['prob_1'])
        # remove prob_0 and prob_1 columns, target and predicted columns
        y_pred_prob_df = y_pred_prob_df.drop(['prob_0', 'prob_1', 'target', 'predicted'], axis=1)
        # transpose the data frame and convert values to %
        y_pred_prob_df = y_pred_prob_df.T
        y_pred_prob_df = y_pred_prob_df.apply(lambda x: x * 100).round(2)
        # set index to x
        y_pred_prob_df.index = [x]
        # add row to df_prob_meta using pd.concat
        df_prob_meta = pd.concat([df_prob_meta, y_pred_prob_df], axis=0)
        
        # prediction dataframe
        # transpose the data frame
        y_pred_df = pd.DataFrame(y_pred, columns=['Pred_class'])
        y_pred_df = y_pred_df.T
        # set index to x
        y_pred_df.index = [x]
        # add row to df_prob_meta using pd.concat
        df_pred_meta = pd.concat([df_pred_meta, y_pred_df], axis=0)

        # performance metrics dataframe
        accuracy = round(accuracy_score(y_train, y_pred)*100, 2)
        precision = round(precision_score(y_train, y_pred, average='weighted')*100, 2)
        recall = round(recall_score(y_train, y_pred, average='weighted')*100, 2)
        roc_auc = round(roc_auc_score(y_train, y_pred, average='weighted')*100, 2)
        gmean = round(geometric_mean_score(y_train, y_pred, average='weighted')*100, 2)
        mcc = round(matthews_corrcoef(y_train, y_pred)*100, 2)
        f1_weighted = round(f1_score(y_train, y_pred, average='weighted')*100, 2)
        log_loss = round(metrics.log_loss(y_train, y_pred, normalize=True)*100, 2)
        average_metrics = (accuracy + precision + recall + roc_auc + gmean + mcc + f1_weighted) / 7
        average_metrics = round(average_metrics, 2)
        # add performance metrics to df_model_meta using pd.concat with index
        df_model_meta = pd.concat([df_model_meta, pd.DataFrame([[f'meta_{x+1}', x+1, algos[x+1], accuracy, precision, recall, roc_auc, gmean, \
                        mcc, f1_weighted, log_loss, average_metrics, f'{final_estimator.get_params()}']], 
                        columns=df_model_meta.columns, index=[x])], axis=0)



In [33]:
# Save all dataframes to csv
df_model_meta.to_csv(f'{path}/MetaModelsPerformance.csv', index=False)
df_prob_meta.to_csv(f'{path}/MetaModelsProbabilities.csv', index=False)
df_pred_meta.to_csv(f'{path}/MetaModelsPredictions.csv', index=False)

In [34]:
# concatinate df_model and df_model_meta
df_model_all = pd.concat([df_model, df_model_meta], axis=0)
df_model_all = df_model_all.astype({'model_id': 'str', 'algorithm_id': 'int64', 'accuracy': 'float64', 'precision': 'float64', 
                               'recall': 'float64', 'roc_auc_score': 'float64', 'geometric_mean_score': 'float64', 
                               'matthews_corrcoef': 'float64', 'f1_weighted': 'float64', 'log_loss': 'float64', 
                               'overall_performance': 'float64'})
# create new column "size", set to 2 for rows with "meta" in "model_id", else 1
df_model_all['size'] = np.where(df_model_all['model_id'].str.contains('meta'), 2, 1)
# create new column for text of points
df_model_all['text'] = df_model_all['algorithm_name'] + '<br>' + 'Performance: ' + \
        df_model_all['overall_performance'].astype(str) + '%' + '<br>' + 'Model ID: ' + df_model_all['model_id'].astype(str) + \
        '<br>' + 'Accuracy: ' + df_model_all['accuracy'].astype(str) + '%' + '<br>' + 'Precision: ' + \
        df_model_all['precision'].astype(str) + '%' + '<br>' + 'Recall: ' + df_model_all['recall'].astype(str) + \
        '%' + '<br>' + 'ROC AUC: ' + df_model_all['roc_auc_score'].astype(str) + '<br>' + 'Geometric Mean: ' + \
        df_model_all['geometric_mean_score'].astype(str) + '<br>' + 'Matthews Correlation: ' + \
        df_model_all['matthews_corrcoef'].astype(str) + '<br>' + 'F1: ' + df_model_all['f1_weighted'].astype(str) + \
        '<br>' + 'Log Loss: ' + df_model_all['log_loss'].astype(str)

In [53]:
# show model with best performance
df_model_all.sort_values(by=['overall_performance'], ascending=False)

Unnamed: 0,model_id,algorithm_id,algorithm_name,accuracy,precision,recall,roc_auc_score,geometric_mean_score,matthews_corrcoef,f1_weighted,log_loss,overall_performance,hyperparameters,size,text,UMAP_1_prob,UMAP_2_prob,UMAP_1_metrics,UMAP_2_metrics
4,meta_5,5,Logistic Regression,100.00,100.00,100.00,100.00,100.00,100.00,100.00,0.00,100.00,"{'C': 1.4, 'class_weight': None, 'dual': False...",2,Logistic Regression<br>Performance: 100.0%<br>...,-0.373642,4.871645,10.783731,-6.904717
3,meta_4,4,Multilayer Perceptron,100.00,100.00,100.00,100.00,100.00,100.00,100.00,0.00,100.00,"{'activation': 'relu', 'alpha': 0.00041, 'batc...",2,Multilayer Perceptron<br>Performance: 100.0%<b...,-0.031543,5.243205,10.874353,-7.040355
5,meta_6,6,Linear Discriminant Analysis,99.01,99.01,99.01,98.97,98.97,98.00,99.01,34.20,98.85,"{'covariance_estimator': None, 'n_components':...",2,Linear Discriminant Analysis<br>Performance: 9...,-0.728367,5.196533,10.437907,-6.621888
8,meta_9,9,Extra Trees,96.70,96.85,96.70,96.91,96.91,93.51,96.71,113.99,96.33,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...",2,Extra Trees<br>Performance: 96.33%<br>Model ID...,-0.049836,6.102156,9.696796,-6.523041
9,meta_10,10,Adaptive Boosting,96.70,96.74,96.70,96.79,96.79,93.40,96.70,113.99,96.26,"{'algorithm': 'SAMME', 'base_estimator': None,...",2,Adaptive Boosting<br>Performance: 96.26%<br>Mo...,-4.494873,-11.543207,9.628210,-6.622332
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18,1249,4,Multilayer Perceptron,79.52,79.59,79.52,85.97,81.86,64.80,82.36,17.17,79.09,"{'activation': 'tanh', 'alpha': 0.00061, 'max_...",1,Multilayer Perceptron<br>Performance: 79.09%<b...,8.640389,13.491083,0.022111,29.952684
19,1251,4,Multilayer Perceptron,79.52,79.59,79.52,85.97,81.86,64.80,82.36,17.17,79.09,"{'activation': 'tanh', 'alpha': 0.00061, 'max_...",1,Multilayer Perceptron<br>Performance: 79.09%<b...,8.921660,13.584888,-0.095462,30.591431
27,1999,6,Linear Discriminant Analysis,75.89,76.41,75.89,82.60,75.39,52.04,75.95,31.90,73.45,"{'shrinkage': 0.05, 'solver': 'lsqr'}",1,Linear Discriminant Analysis<br>Performance: 7...,-2.949801,7.539630,0.269526,31.509756
28,2000,6,Linear Discriminant Analysis,75.89,76.41,75.89,82.60,75.39,52.04,75.95,31.90,73.45,"{'shrinkage': 0.5, 'solver': 'lsqr'}",1,Linear Discriminant Analysis<br>Performance: 7...,-2.729314,7.848541,0.409960,31.626429


In [47]:
# set df_prob_meta columns as df_prob columns
df_prob_meta.columns = df_prob.columns

# concatinate df_prob and df_prob_meta
df_prob_all = pd.concat([df_prob, df_prob_meta], axis=0)
df_prob_all.shape

(66, 303)


### UMAP dimension reduction algorithm

In [48]:
# UMAP function
def umap_model(df, parameter_umap_n_neighbors = 5, parameter_umap_min_dist = 0.5, parameter_umap_metric = 'euclidean'):
        umap_model = umap.UMAP(n_neighbors=parameter_umap_n_neighbors, metric=parameter_umap_metric, min_dist=parameter_umap_min_dist)
        #fit transform and convert to dataframe
        df_umap = pd.DataFrame(umap_model.fit_transform(df), columns=['UMAP_1', 'UMAP_2'])
        # add index to df_umap
        df_umap.index = df.index
        return df_umap

In [50]:
# Create UMAP models for probabilities and perfromance metrics dataframes
umap_prob = umap_model(df_prob_all)

# choose only accuracy, precision, recall, roc_auc, geometric_mean_score, matthews_corrcoef, f1_weighted from df_model_all
df_model_metrics = df_model_all[['accuracy', 'precision', 'recall', 'roc_auc_score', 'geometric_mean_score', 'matthews_corrcoef', 'f1_weighted']]

umap_metrics = umap_model(df_model_metrics)



In [51]:
# Add umap_prob to df_model_all and rename columns to 'UMAP_1_prob' and 'UMAP_2_prob'
df_model_all = pd.concat([df_model_all, umap_prob], axis=1)
df_model_all.rename(columns={'UMAP_1': 'UMAP_1_prob', 'UMAP_2': 'UMAP_2_prob'}, inplace=True)
# Add umap_metrics to df_model_all and rename columns to 'UMAP_1_metrics' and 'UMAP_2_metrics'
df_model_all = pd.concat([df_model_all, umap_metrics], axis=1)
df_model_all.rename(columns={'UMAP_1': 'UMAP_1_metrics', 'UMAP_2': 'UMAP_2_metrics'}, inplace=True)

### UMAP chart

In [52]:
fig = go.Figure()

symbols = ['circle', 'square', 'x', 'cross', 'diamond', 'star', 'hexagram', 'triangle-right', 'triangle-left', 'triangle-down', 'triangle-up']
# convert symbols to dictionary with keys from 1 to 11
symbols_dict = dict(zip(range(1, 12), symbols))

fig = make_subplots(rows=1, cols=2, subplot_titles=('UMAP Probabilities', 'UMAP Performance metrics'), shared_yaxes='all', shared_xaxes='all',
                    vertical_spacing=0.03, horizontal_spacing=0.03, specs=[[{'type': 'xy'}, {'type': 'xy'}]])

# Plot UMAP, add hovertext and symbols, define colorscale by performance, add title
fig.add_trace(go.Scatter(x=df_model_all['UMAP_1_prob'], y=df_model_all['UMAP_2_prob'], mode='markers', hovertext=df_model_all['text'], 
            marker=dict(size=df_model_all['size']*20, symbol = df_model_all['algorithm_id'].map(symbols_dict), 
            color=df_model_all['overall_performance'], coloraxis='coloraxis')), row=1, col=1)
fig.add_trace(go.Scatter(x=df_model_all['UMAP_1_metrics'], y=df_model_all['UMAP_2_metrics'], mode='markers', hovertext=df_model_all['text'], 
            marker=dict(size=df_model_all['size']*20, symbol = df_model_all['algorithm_id'].map(symbols_dict), 
            color=df_model_all['overall_performance'], coloraxis='coloraxis')), row=1, col=2)

# remove axes labels

fig.update_layout(xaxis_showticklabels=False, yaxis_showticklabels=False)
fig.update_traces(marker=dict(opacity=0.8, line=dict(width=1, color='Black')), selector=dict(mode='markers'))
# define symbols for markers based on symbols defined
fig.update_layout(hoverlabel=dict(bgcolor="white", font_size=14, font_family="Rockwell"), hovermode='closest')
# define plot as square
fig.update_layout(width=1200, height=600)
# add tooltip
fig.update_layout(hovermode='closest')
fig.update_layout(coloraxis=dict(colorscale='Viridis'), showlegend=False)

fig.show()

### Scatter plot matrix with Correlation

We will operate further only with prediction results, which are not the same for the all 11 meta models; the main idea is to investigate teh differencies and combinations of different metamodels. That can be cosidered as feature engineering to df_pred_meta dataframe (dataframe with all 11 meta models predictions per instance) to keep only the columns, which are not the same for all 11 meta models.

In [None]:
# find columns in df_pred_meta with the not the same values in the whole column
df_pred_meta_red = df_pred_meta.copy()
df_pred_meta_red = df_pred_meta_red.loc[:, df_pred_meta_red.apply(pd.Series.nunique, axis=0) != 1]
df_pred_meta_red

Unnamed: 0,0,20,23,34,42,52,57,73,91,95,...,278,281,282,283,286,287,293,296,299,302
0,0,0,0,0,1,1,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,1,1,0,0,0,1,...,0,1,1,1,0,0,1,1,1,0
2,0,0,0,0,1,1,0,0,1,1,...,0,1,1,1,0,1,1,1,1,0
3,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
4,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
5,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
6,0,0,0,0,1,1,0,0,0,1,...,0,1,1,0,0,0,1,1,0,0
7,0,0,1,0,1,1,0,0,1,1,...,1,1,1,1,0,1,0,0,1,0
8,0,0,0,0,1,1,0,0,0,1,...,1,1,1,1,1,1,1,1,1,1
9,1,1,0,1,0,1,1,1,0,1,...,1,0,1,1,1,1,1,1,1,1


In [None]:
fig = go.Figure()
fig = make_subplots(rows=11, cols=11, vertical_spacing=0.02, horizontal_spacing=0.02, shared_xaxes='all', shared_yaxes='all')

# define subplot size
fig.update_layout(width=1000, height=1000)

# iterate through rows and cols
for i in range(1, 12):
    fig.add_trace(go.Bar(x = df_perf_meta.columns, y = df_perf_meta.iloc[i-1], marker_color=px.colors.sequential.gray, 
                name = f'meta_model_{algo[i-1]}'), row=i, col=i)
    # remove legend and x axes labels
    fig.update_layout(showlegend=False)
    fig.update_xaxes(showticklabels=False)
    fig.update_yaxes(showticklabels=False)
    

fig.show()