### Initial Data exploration and wrangling

##### Modul import

In [9]:
import pandas as pd
import umap
from sklearn import preprocessing
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
import ast
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, matthews_corrcoef, f1_score, log_loss
from sklearn import metrics
from imblearn.metrics import geometric_mean_score
import plotly.subplots as sp

# define random seed
np.random.seed(42)


#### Dataset import and wrangling

In [10]:
path = r'D:\github\2dv50e\Data\1. Heart Disease'
df_probabilities = pd.read_csv(path + r'\topModelsProbabilities.csv')
df_model = pd.read_csv(path + r'\topModels.csv')
# add column target to df model from target.csv
df_target = pd.read_csv(path + r'\target.csv')
y_train = df_target['class']
target = df_target['class'].tolist()
df_source = pd.read_csv(path + r'\dataset.csv')

algo_nr = df_model['algorithm_id']

In [11]:
# Print number of missing values per column in df_probabilities
for col in df_probabilities.columns:
    if df_probabilities[col].isnull().sum() > 0:
        print(f'{col} has {df_probabilities[col].isnull().sum()} missing values.')
    


In [12]:
# Apply scaler to df_source
scaler = preprocessing.StandardScaler()
df_source_scaled = pd.DataFrame(scaler.fit_transform(df_source), columns=df_source.columns)
x_train = df_source_scaled

In [13]:
x_train.head()

Unnamed: 0,Fbs,Slope,Trestbps,Exang,Thalach,Age,Chol,Sex,Oldpeak,Restecg,Cp,Ca,Thal
0,2.394438,-2.274579,0.763956,-0.696631,0.015443,0.952197,-0.256334,0.681005,1.087338,-1.005832,1.973123,-0.714429,-2.148873
1,-0.417635,-2.274579,-0.092738,-0.696631,1.633471,-1.915313,0.072199,0.681005,2.122573,0.898962,1.002577,-0.714429,-0.512922
2,-0.417635,0.976352,-0.092738,-0.696631,0.977514,-1.474158,-0.816773,-1.468418,0.310912,-1.005832,0.032031,-0.714429,-0.512922
3,-0.417635,0.976352,-0.663867,-0.696631,1.239897,0.180175,-0.198357,0.681005,-0.206705,0.898962,0.032031,-0.714429,-0.512922
4,-0.417635,0.976352,-0.663867,1.435481,0.583939,0.290464,2.08205,-1.468418,-0.379244,0.898962,-0.938515,-0.714429,-0.512922



### UMAP dimension reduction algorithm

In [14]:
# UMAP function
def umap_model (df_prob = df_probabilities, df_mod = df_model, parameter_umap_n_neighbors = 5, parameter_umap_min_dist = 0.5, 
                parameter_umap_metric = 'euclidean'):

        algos = {1:'K-Nearest Neighbor', 2:'Support Vector Machine', 3:'Gaussian Naive Bayes', 4:'Multilayer Perceptron', 5:'Logistic Regression',
        6:'Linear Discriminant Analysis', 7:'Quadratic Discriminant Analysis', 8:'Random Forest', 9:'Extra Trees', 10:'Adaptive Boosting',
        11:'Gradient Boosting'}

        umap_model = umap.UMAP(n_neighbors=parameter_umap_n_neighbors, metric=parameter_umap_metric, min_dist=parameter_umap_min_dist)
        umap_embedding = umap_model.fit_transform(df_prob)
        #convert umap_embedding to dataframe
        df_umap = pd.DataFrame(umap_embedding, columns=['UMAP_1', 'UMAP_2'])
        # Add algortim number (keeping in mind the same row structure in topModels.csv and topModelsProbabilities.csv)
        df_umap['algorithm_id'] = df_mod['algorithm_id']
        df_umap['algorithm_name'] = df_umap['algorithm_id'].map(algos)
        df_umap['model_id'] = df_mod['model_id']
        # add hyperparameters column
        df_umap['hyperparameters'] = df_mod['params']
        # Add model specific metrics
        df_umap['accuracy'] = df_model['mean_test_accuracy']
        df_umap['precision'] = df_model['mean_test_precision_weighted']
        df_umap['recall'] = df_model['mean_test_recall_weighted']
        df_umap['roc_auc_score'] = df_model['mean_test_roc_auc_ovo_weighted']
        df_umap['geometric_mean_score'] = df_model['geometric_mean_score_weighted']
        df_umap['matthews_corrcoef'] = df_model['matthews_corrcoef']
        df_umap['f1_weighted'] = df_model['f1_weighted']
        df_umap['log_loss'] = df_model['log_loss']
        df_umap['performance'] = round((df_umap['accuracy'] + df_umap['precision'] + df_umap['recall'] + df_umap['roc_auc_score'] + \
                        df_umap['geometric_mean_score'] + df_umap['matthews_corrcoef'] + df_umap['f1_weighted']) / 7, 2)
        df_umap = df_umap.astype({'UMAP_1': 'float64', 'UMAP_2': 'float64', 'performance': 'float64', 'algorithm_id': 'int64',
                                    'algorithm_name': 'str', 'model_id': 'str', 'hyperparameters': 'str'})
        # create new column "size", set to 40 for rows with "meta" in "model_id", else 20
        df_umap['size'] = np.where(df_umap['model_id'].str.contains('meta'), 2, 1)
        # create new column for text of points
        df_umap['text'] = df_umap['algorithm_name'] + '<br>' + 'Performance: ' + \
                df_umap['performance'].astype(str) + '%' + '<br>' + 'Model ID: ' + df_umap['model_id'].astype(str) + \
                '<br>' + 'Accuracy: ' + df_umap['accuracy'].astype(str) + '%' + '<br>' + 'Precision: ' + \
                df_umap['precision'].astype(str) + '%' + '<br>' + 'Recall: ' + df_umap['recall'].astype(str) + \
                '%' + '<br>' + 'ROC AUC: ' + df_umap['roc_auc_score'].astype(str) + '<br>' + 'Geometric Mean: ' + \
                df_umap['geometric_mean_score'].astype(str) + '<br>' + 'Matthews Correlation: ' + \
                df_umap['matthews_corrcoef'].astype(str) + '<br>' + 'F1: ' + df_umap['f1_weighted'].astype(str) + \
                '<br>' + 'Log Loss: ' + df_umap['log_loss'].astype(str)
        # drop metrics that are not needed
        df_umap = df_umap.drop(columns=['accuracy', 'precision', 'recall', 'roc_auc_score', 'geometric_mean_score', 
                                        'matthews_corrcoef', 'f1_weighted', 'log_loss'])

        return df_umap

In [20]:
# UMAP plot function
def umap_plot(df_umap):
        # Define symbols for each algorithm
        symbols = ['circle', 'square', 'x', 'cross', 'diamond', 'star', 'hexagram', 'triangle-right', 'triangle-left', 'triangle-down', 'triangle-up']
        # Plot UMAP, add hovertext and symbols, define colorscale by performance, add title
        fig = px.scatter(df_umap, x='UMAP_1', y='UMAP_2', color='performance', size='size', hover_name='text',
                symbol = 'algorithm_id', symbol_sequence = symbols, labels=dict(UMAP_1='', UMAP_2='', performance='Performance'),
                color_continuous_scale=px.colors.sequential.Viridis)
        fig.update_layout(title_text='UMAP Plot')
        fig.update_layout(showlegend=False)
        fig.update_traces(marker=dict(opacity=0.8, line=dict(width=1, color='Black')), selector=dict(mode='markers'))
        # Remove axis labels
        fig.update_layout(xaxis=dict(showticklabels=False), yaxis=dict(showticklabels=False))
        # Add hovertext to legend
        fig.update_layout(hoverlabel=dict(bgcolor="white", font_size=12, font_family="Rockwell"),
                hovermode='closest',
                legend=dict(x=0.1, y=1.1, traceorder="normal", font=dict(family="sans-serif", size=12, color="black"),
                bgcolor="LightSteelBlue",
                bordercolor="Black",
                borderwidth=2)
        )
        # define plot as square
        fig.update_layout(width=600, height=600)
        # add tooltip
        fig.update_layout(hovermode='closest')
        return fig

### Stacking Classifier ensemble

Function to extract hyperparameters from the best performing model per algorithm

In [21]:
def best_params(df_umap):
      # Select hyperparameters for best model in each algorithm
      df_umap_best = df_umap.groupby('algorithm_id').apply(lambda x: x.sort_values('performance', ascending=False).iloc[0])
      # reset algorithm_nr as  index
      df_umap_best = df_umap_best.reset_index(drop=True)
      # keep only algorithm number, name, performance and hyperparameters
      df_umap_best = df_umap_best[['algorithm_id', 'algorithm_name', 'performance', 'hyperparameters']]
      return df_umap_best

Supporting functions to extract hyperparameters values

In [22]:
# Return hyperparameters based on algorithm name from df_umap_best
def get_hyperparameters(df_umap, algorithm_name):
    return df_umap[df_umap['algorithm_name'] == algorithm_name]['hyperparameters']

# return value of key in dictionary
def get_value(dictionary, key):
    return dictionary[key]

# convert string to dictionary
def string_to_dict(string):
    return ast.literal_eval(string)


Create umap dimension reduction dataset for original base layer model probabilities

In [23]:
df_umap = umap_model(df_probabilities, df_model)


Graph is not fully connected, spectral embedding may not work as expected.



Base layer estimators (top 5 per algorithm)

In [24]:
# dictionaries with algorithm names and their hyperparameters

knn_params = {int(k):v for k,v in get_hyperparameters(df_umap, 'K-Nearest Neighbor').items()}
svm_params = {int(k):v for k,v in get_hyperparameters(df_umap, 'Support Vector Machine').items()}
gnb_params = {int(k):v for k,v in get_hyperparameters(df_umap, 'Gaussian Naive Bayes').items()}
mlp_params = {int(k):v for k,v in get_hyperparameters(df_umap, 'Multilayer Perceptron').items()}
lr_params = {int(k):v for k,v in get_hyperparameters(df_umap, 'Logistic Regression').items()}
lda_params = {int(k):v for k,v in get_hyperparameters(df_umap, 'Linear Discriminant Analysis').items()}
qda_params = {int(k):v for k,v in get_hyperparameters(df_umap, 'Quadratic Discriminant Analysis').items()}
rf_params = {int(k):v for k,v in get_hyperparameters(df_umap, 'Random Forest').items()}
et_params = {int(k):v for k,v in get_hyperparameters(df_umap, 'Extra Trees').items()}
ab_params = {int(k):v for k,v in get_hyperparameters(df_umap, 'Adaptive Boosting').items()}
gb_params = {int(k):v for k,v in get_hyperparameters(df_umap, 'Gradient Boosting').items()}

In [25]:
# Estimators and hyperparameters for each algorithm
estimators = []
# populate list of estimators with all 55 top models
count = 0
for i in list(knn_params.keys()):
                temp_dict = ast.literal_eval(knn_params[i])
                estimators.append((f'knn_{count}', KNeighborsClassifier(algorithm=get_value(temp_dict, 'algorithm'), 
                                metric=get_value(temp_dict, 'metric'), n_neighbors= get_value(temp_dict, 'n_neighbors'), 
                                weights=get_value(temp_dict, 'weights'), n_jobs=-1)))
                count += 1
count = 0
for i in list(svm_params.keys()):
                temp_dict = ast.literal_eval(svm_params[i])
                estimators.append((f'svm_{count}', SVC(C=get_value(temp_dict, 'C'), kernel=get_value(temp_dict, 'kernel'),probability=True, random_state=42)))
                count += 1
count = 0
for i in list(gnb_params.keys()):
                temp_dict = ast.literal_eval(gnb_params[i])
                estimators.append((f'gnb_{count}', GaussianNB(var_smoothing=get_value(temp_dict, 'var_smoothing'))))
                count += 1
count = 0
for i in list(mlp_params.keys()):
                temp_dict = ast.literal_eval(mlp_params[i])
                estimators.append((f'mlp_{count}', MLPClassifier(activation=get_value(temp_dict, 'activation'), 
                                alpha=get_value(temp_dict, 'alpha'),
                                max_iter=get_value(temp_dict, 'max_iter'), solver=get_value(temp_dict, 'solver'), 
                                tol=get_value(temp_dict, 'tol'), random_state=42)))
                count += 1
count = 0
for i in list(lr_params.keys()):
                temp_dict = ast.literal_eval(lr_params[i])
                estimators.append((f'lr_{count}', LogisticRegression(C=get_value(temp_dict, 'C'), max_iter=get_value(temp_dict, 'max_iter'), penalty=get_value(temp_dict, 'penalty'), solver=get_value(temp_dict, 'solver'), random_state=42, n_jobs=-1)))
                count += 1
count = 0
for i in list(lda_params.keys()):
                temp_dict = ast.literal_eval(lda_params[i])
                estimators.append((f'lda_{count}', LinearDiscriminantAnalysis(shrinkage=get_value(temp_dict, 'shrinkage'), 
                                solver=get_value(temp_dict, 'solver'))))
                count += 1
count = 0
for i in list(qda_params.keys()):
                temp_dict = ast.literal_eval(qda_params[i])
                estimators.append((f'qda_{count}', QuadraticDiscriminantAnalysis(reg_param=get_value(temp_dict, 'reg_param'), tol=get_value(temp_dict, 'tol'))))
                count += 1
count = 0
for i in list(rf_params.keys()):
                temp_dict = ast.literal_eval(rf_params[i])
                estimators.append((f'rf_{count}', RandomForestClassifier(criterion=get_value(temp_dict, 'criterion'), 
                                n_estimators=get_value(temp_dict, 'n_estimators'), random_state=42, n_jobs=-1)))
                count += 1
count = 0
for i in list(et_params.keys()):
                temp_dict = ast.literal_eval(et_params[i])
                estimators.append((f'et_{count}', ExtraTreesClassifier(criterion=get_value(temp_dict, 'criterion'),
                                n_estimators=get_value(temp_dict, 'n_estimators'), random_state=42, n_jobs=-1)))
                count += 1
count = 0
for i in list(ab_params.keys()):
                temp_dict = ast.literal_eval(ab_params[i])
                estimators.append((f'ab_{count}', AdaBoostClassifier(algorithm=get_value(temp_dict, 'algorithm'), 
                                learning_rate=get_value(temp_dict, 'learning_rate'), n_estimators=get_value(temp_dict, 'n_estimators'), random_state=42)))
                count += 1
count = 0
for i in list(gb_params.keys()):
                temp_dict = ast.literal_eval(gb_params[i])
                estimators.append((f'gb_{count}', GradientBoostingClassifier(criterion=get_value(temp_dict, 'criterion'), 
                                learning_rate=get_value(temp_dict, 'learning_rate'), n_estimators=get_value(temp_dict, 'n_estimators'), random_state=42)))
                count += 1

##### Metamodel estimators (top 1 per algorithm)
Hyperparameters will be extracted from the best performing model per algorithm. <br> The plan is to extract hyperparameters from the best performing model per algorithm and then apply these hyperparameters to the metamodels. Currently no functuonality to be added to allow end user to tune the hyperparameters for metamodels, instead they will rely on top performing ones from base layer models.

In [26]:
# Extrude the top models from the dataframe with the respective hyperparameters
meta_params = best_params(df_umap)
meta_params.head()

Unnamed: 0,algorithm_id,algorithm_name,performance,hyperparameters
0,1,K-Nearest Neighbor,83.35,"{'algorithm': 'ball_tree', 'metric': 'euclidea..."
1,2,Support Vector Machine,83.1,"{'C': 1.75, 'kernel': 'rbf'}"
2,3,Gaussian Naive Bayes,80.97,{'var_smoothing': 0.0}
3,4,Multilayer Perceptron,79.22,"{'activation': 'relu', 'alpha': 0.00041, 'max_..."
4,5,Logistic Regression,83.67,"{'C': 1.4, 'max_iter': 50, 'penalty': 'l2', 's..."


In [27]:
algo = ['knn', 'svm', 'gnb', 'mlp', 'lr', 'lda', 'qda', 'rf', 'et', 'ab', 'gb']
# return unique values of algorithm name
meta_params = best_params(df_umap)
algo_names = meta_params['algorithm_name'].unique()

# iterate through key and value pairs in algo_dict
i = 0
for k,v in zip(algo, algo_names):
    # Define hyperparameters for each  final estimator, based on top performing model from base layer
    temp = get_hyperparameters(meta_params, v)
    # convert to dictionary for easy access
    temp_dict = {int(k):v for k,v in temp.items()}
    # return first value
    temp_dict = temp_dict[i]
    # convert to dictionary for easy access, assign varibale name per algorithm
    globals()['best_' +  k + '_params'] = ast.literal_eval(temp_dict)
    i += 1

In [28]:
# Add final estimators
final_estimators = [('knn', KNeighborsClassifier(algorithm=get_value(best_knn_params, 'algorithm'), metric=get_value    (best_knn_params, 'metric'), n_neighbors= get_value(best_knn_params, 'n_neighbors'), weights=get_value(best_knn_params, 'weights'), n_jobs=-1)),
        ('svm', SVC(C=get_value(best_svm_params, 'C'), kernel=get_value(best_svm_params, 'kernel'),probability=True, random_state=42)),
        ('gnb', GaussianNB(var_smoothing=get_value(best_gnb_params, 'var_smoothing'))),
        ('mlp', MLPClassifier(activation=get_value(best_mlp_params, 'activation'), alpha=get_value(best_mlp_params, 'alpha'), max_iter=get_value(best_mlp_params, 'max_iter'), solver=get_value(best_mlp_params, 'solver'), tol=get_value(best_mlp_params, 'tol'), random_state=42)),
        ('lr', LogisticRegression(C=get_value(best_lr_params, 'C'), max_iter=get_value(best_lr_params, 'max_iter'),
        penalty=get_value(best_lr_params, 'penalty'), solver=get_value(best_lr_params, 'solver'), random_state=42, n_jobs=-1)),
        ('lda', LinearDiscriminantAnalysis(shrinkage=get_value(best_lda_params, 'shrinkage'), solver=get_value(best_lda_params, 'solver'))),
        ('qda', QuadraticDiscriminantAnalysis(reg_param=get_value(best_qda_params, 'reg_param'), tol=get_value(best_qda_params, 'tol'))),
        ('rf', RandomForestClassifier(criterion=get_value(best_rf_params, 'criterion'), n_estimators=get_value(best_rf_params, 'n_estimators'), random_state=42, n_jobs=-1)),
        ('et', ExtraTreesClassifier(criterion=get_value(best_et_params, 'criterion'), n_estimators=get_value(best_et_params, 'n_estimators'), random_state=42, n_jobs=-1)),
        ('ab', AdaBoostClassifier(algorithm=get_value(best_ab_params, 'algorithm'), learning_rate=get_value(best_ab_params, 'learning_rate'), n_estimators=get_value(best_ab_params, 'n_estimators'), random_state=42)),
        ('gb', GradientBoostingClassifier(criterion=get_value(best_gb_params, 'criterion'), learning_rate=get_value(best_gb_params, 'learning_rate'), n_estimators=get_value(best_gb_params, 'n_estimators'), random_state=42))
        ]

Please note that for the average metric I did not include the logloss as using normalized version of logloss introduces bias to dataset

In [29]:
# create dataframe for df_model_meta with columns names from df_model
df_model_meta = pd.DataFrame(columns=df_model.columns)

# create dataframes for meta models probabilities and predicted values
df_pred_meta = pd.DataFrame()
df_prob_meta = pd.DataFrame()

for x in range(0, len(final_estimators)):
        final_estimator = final_estimators[x][1]
        clf = StackingClassifier(estimators=estimators, final_estimator=final_estimator, n_jobs=-1, cv=5)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_train)
        y_pred = pd.Series(y_pred)

        # probabilities dataframe
        y_pred_prob = clf.predict_proba(x_train)
        y_pred_prob_df = pd.DataFrame(y_pred_prob, columns=['prob_0', 'prob_1'])
        y_pred_prob_df['target'] = y_train
        y_pred_prob_df['predicted'] = y_pred
        # create new column, set value to prob_0 if target is 0 and prob_1 if target is 1
        y_pred_prob_df['pred_prob'] = np.where(y_pred_prob_df['target'] == 0, y_pred_prob_df['prob_0'], y_pred_prob_df['prob_1'])
        # save 
        # remove prob_0 and prob_1 columns, target and predicted columns
        y_pred_prob_df = y_pred_prob_df.drop(['prob_0', 'prob_1', 'target', 'predicted'], axis=1)
        # transpose the data frame and convert values to %
        y_pred_prob_df = y_pred_prob_df.T
        y_pred_prob_df = y_pred_prob_df.apply(lambda x: x * 100).round(2)
        # set index to x
        y_pred_prob_df.index = [x]
        # add row to df_prob_meta using pd.concat
        df_prob_meta = pd.concat([df_prob_meta, y_pred_prob_df], axis=0)
        
        # prediction dataframe
        # transpose the data frame
        y_pred_df = pd.DataFrame(y_pred, columns=['Pred_class'])
        y_pred_df = y_pred_df.T
        # set index to x
        y_pred_df.index = [x]
        # add row to df_prob_meta using pd.concat
        df_pred_meta = pd.concat([df_pred_meta, y_pred_df], axis=0)

        # performance metrics dataframe
        accuracy = round(accuracy_score(y_train, y_pred)*100, 2)
        precision = round(precision_score(y_train, y_pred, average='weighted')*100, 2)
        recall = round(recall_score(y_train, y_pred, average='weighted')*100, 2)
        roc_auc = round(roc_auc_score(y_train, y_pred, average='weighted')*100, 2)
        gmean = round(geometric_mean_score(y_train, y_pred, average='weighted')*100, 2)
        mcc = round(matthews_corrcoef(y_train, y_pred)*100, 2)
        f1_weighted = round(f1_score(y_train, y_pred, average='weighted')*100, 2)
        log_loss = round(metrics.log_loss(y_train, y_pred, normalize=True)*100, 2)
        average_metrics = (accuracy + precision + recall + roc_auc + gmean + mcc + f1_weighted) / 7
        average_metrics = round(average_metrics, 2)
        # add performance metrics to df_model_meta using pd.concat with index
        df_model_meta = pd.concat([df_model_meta, pd.DataFrame([[f'meta_{x+1}', x+1, accuracy, precision, recall, roc_auc, gmean, 
                        mcc, f1_weighted, log_loss, average_metrics, f'{final_estimator.get_params()}']], 
                        columns=df_model_meta.columns, index=[x])], axis=0)


Stochastic Optimizer: Maximum iterations (100) reached and the optimization hasn't converged yet.


The max_iter was reached which means the coef_ did not converge


Variables are collinear



In [30]:
# Save all dataframes to csv
df_model_meta.to_csv(f'{path}/MetaModelsPerformance.csv', index=False)
df_prob_meta.to_csv(f'{path}/MetaModelsProbabilities.csv', index=False)
df_pred_meta.to_csv(f'{path}/MetaModelsPredictions.csv', index=False)


Create umap dimension reduction dataset for metamodel probabilities and concatinate with original base layer dataset

In [31]:
df_umap_meta = umap_model(df_prob_meta, df_model_meta)

# concat df_umap and df_umap_meta
df_umap_all = pd.concat([df_umap, df_umap_meta], axis=0)

#### UMAP for model metrics

Create umap dimension reduction dataset for base and metamodel perfomance metrics and plot the UMAP chart

In [32]:
perf_columns = ['mean_test_accuracy', 'mean_test_precision_weighted', 'mean_test_recall_weighted', 
            'mean_test_roc_auc_ovo_weighted', 'geometric_mean_score_weighted', 'matthews_corrcoef', 'f1_weighted', ]

df_perf = df_model[perf_columns]
df_perf_meta = df_model_meta[perf_columns]

df_umap_perf = umap_model(df_perf, df_model)
df_umap_perf_meta = umap_model(df_perf_meta, df_model_meta)


# concat df_umap and df_umap_meta
df_umap_perf_all = pd.concat([df_umap_perf, df_umap_perf_meta], axis=0)

In [35]:
# show  top 5 models with best performance
df_umap_perf_all.sort_values(by=['performance'], ascending=False).head()

Unnamed: 0,UMAP_1,UMAP_2,algorithm_id,algorithm_name,model_id,hyperparameters,performance,size,text
40,8.213637,6.249132,9,Extra Trees,2667,"{'criterion': 'entropy', 'n_estimators': 132}",92.27,1,Extra Trees<br>Performance: 92.27%<br>Model ID...
42,8.455186,6.833106,9,Extra Trees,2660,"{'criterion': 'entropy', 'n_estimators': 126}",92.24,1,Extra Trees<br>Performance: 92.24%<br>Model ID...
41,8.881766,6.849137,9,Extra Trees,2659,"{'criterion': 'entropy', 'n_estimators': 125}",92.24,1,Extra Trees<br>Performance: 92.24%<br>Model ID...
43,8.98754,5.965116,9,Extra Trees,2714,"{'criterion': 'gini', 'n_estimators': 111}",92.19,1,Extra Trees<br>Performance: 92.19%<br>Model ID...
44,8.63998,6.307318,9,Extra Trees,2715,"{'criterion': 'gini', 'n_estimators': 112}",92.19,1,Extra Trees<br>Performance: 92.19%<br>Model ID...


### UMAP chart

In [33]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

fig = go.Figure()


symbols = ['circle', 'square', 'x', 'cross', 'diamond', 'star', 'hexagram', 'triangle-right', 'triangle-left', 'triangle-down', 'triangle-up']
# convert symbols to dictionary with keys from 1 to 11
symbols_dict = dict(zip(range(1, 12), symbols))

fig = make_subplots(rows=1, cols=2, subplot_titles=('UMAP Probabilities', 'UMAP Performance metrics'), shared_yaxes='all', shared_xaxes='all',
                    vertical_spacing=0.03, horizontal_spacing=0.03, specs=[[{'type': 'xy'}, {'type': 'xy'}]])

# Plot UMAP, add hovertext and symbols, define colorscale by performance, add title
fig.add_trace(go.Scatter(x=df_umap_all['UMAP_1'], y=df_umap_all['UMAP_2'], mode='markers', hovertext=df_umap_all['text'], 
            marker=dict(size=df_umap_all['size']*20, symbol = df_umap_all['algorithm_id'].map(symbols_dict), 
            color=df_umap_all['performance'], coloraxis='coloraxis')), row=1, col=1)
fig.add_trace(go.Scatter(x=df_umap_perf_all['UMAP_1'], y=df_umap_perf_all['UMAP_2'], mode='markers', hovertext=df_umap_perf_all['text'], 
            marker=dict(size=df_umap_perf_all['size']*20, symbol = df_umap_perf_all['algorithm_id'].map(symbols_dict), 
            color=df_umap_perf_all['performance'], coloraxis='coloraxis')), row=1, col=2)

# remove axes labels

fig.update_layout(xaxis_showticklabels=False, yaxis_showticklabels=False)
fig.update_traces(marker=dict(opacity=0.8, line=dict(width=1, color='Black')), selector=dict(mode='markers'))
# define symbols for markers based on symbols defined
fig.update_layout(hoverlabel=dict(bgcolor="white", font_size=14, font_family="Rockwell"), hovermode='closest')
# define plot as square
fig.update_layout(width=1200, height=600)
# add tooltip
fig.update_layout(hovermode='closest')
fig.update_layout(coloraxis=dict(colorscale='Viridis'), showlegend=False)

fig.show()

It seems like we are getting worse results for our stacked classifier compared to the original base layer models, particularly extra trees classifier, random forest and gradient boosting classifier. <br>
It is important to keep in mind that no hyper-parameter tuning has been done on metamo-model layer, which is why the results are not as good as the original base layer models.The overall idea of stacking classifier is to combine the base layer models  by complementing their strengths. Looking on the graph, one can see that some of the original base layer models performing much worse then others (LDA vs Random Forest). In this case the LDA is not giving any additional benefit to the stacking classifier.

### Scatter plot matrix with Correlation

We will operate further only with prediction results, which are not the same for the all 11 meta models; the main idea is to investigate teh differencies and combinations of different metamodels. That can be cosidered as feature engineering to df_pred_meta dataframe (dataframe with all 11 meta models predictions per instance) to keep only the columns, which are not the same for all 11 meta models.

In [None]:
# find columns in df_pred_meta with the not the same values in the whole column
df_pred_meta_red = df_pred_meta.copy()
df_pred_meta_red = df_pred_meta_red.loc[:, df_pred_meta_red.apply(pd.Series.nunique, axis=0) != 1]
df_pred_meta_red

Unnamed: 0,0,20,23,34,42,52,57,73,91,95,...,278,281,282,283,286,287,293,296,299,302
0,0,0,0,0,1,1,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,1,1,0,0,0,1,...,0,1,1,1,0,0,1,1,1,0
2,0,0,0,0,1,1,0,0,1,1,...,0,1,1,1,0,1,1,1,1,0
3,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
4,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
5,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
6,0,0,0,0,1,1,0,0,0,1,...,0,1,1,0,0,0,1,1,0,0
7,0,0,1,0,1,1,0,0,1,1,...,1,1,1,1,0,1,0,0,1,0
8,0,0,0,0,1,1,0,0,0,1,...,1,1,1,1,1,1,1,1,1,1
9,1,1,0,1,0,1,1,1,0,1,...,1,0,1,1,1,1,1,1,1,1


In [None]:
fig = go.Figure()
fig = make_subplots(rows=11, cols=11, vertical_spacing=0.02, horizontal_spacing=0.02, shared_xaxes='all', shared_yaxes='all')

# define subplot size
fig.update_layout(width=1000, height=1000)

# iterate through rows and cols
for i in range(1, 12):
    fig.add_trace(go.Bar(x = df_perf_meta.columns, y = df_perf_meta.iloc[i-1], marker_color=px.colors.sequential.gray, 
                name = f'meta_model_{algo[i-1]}'), row=i, col=i)
    # remove legend and x axes labels
    fig.update_layout(showlegend=False)
    fig.update_xaxes(showticklabels=False)
    fig.update_yaxes(showticklabels=False)
    

fig.show()