## IMPORTS

In [1]:
import pandas as pd, numpy as np
%matplotlib inline
from pycaret.classification import *
from pathlib import Path

## GLOBAL PATH VARIABLES

In [2]:
# Notebook folder: the kernel's current working directory.
# Path.cwd() is plain Python, so this cell also works when the notebook is exported to a script.
NB_DIR = Path.cwd()

# Root MCI folder (one level above the notebook folder)
ROOT_DIR = NB_DIR.parent

# Main data folder (with downloaded csv files)
MAIN_DATA_DIR = ROOT_DIR / 'data'
DATA_DIR_FS = ROOT_DIR / 'data_FS'

# Current data dir with sMCI_cAD.csv & bl.csv files.
# NOTE: this points at the same directory as RESULTS_DIR below.
CURRENT_DATA_DIR = ROOT_DIR / 'results'

# Results folder (misclassified patient table); created if it does not exist yet
RESULTS_DIR = ROOT_DIR / 'results'
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

## IMPORTS TO CREATE CONFUSION MATRIX

In [3]:
from pathlib import Path
import os
import sys

# Make the project's local `modules` folder (next to the notebook folder's parent) importable.
module_path = os.path.abspath(os.path.join('..'))
modules_dir = module_path + "/modules"

# Test for the exact entry that gets appended, so re-running this cell
# does not keep adding duplicate entries to sys.path.
if modules_dir not in sys.path:
    sys.path.append(modules_dir)

In [4]:
# Importing packages needed
import mci_rf_bl as mrfbl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics

# Load data

In [5]:
# Show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

In [6]:
# Train/test table generated in the RandomForest notebook; the first CSV column is used as the index
train_test_csv = RESULTS_DIR / '2.0-random_forest_train_test.csv'
dataset = pd.read_csv(train_test_csv, index_col=0)

In [7]:
# Split rows by the 'Usage_' flag: 'train' rows for modeling, 'test' rows kept as unseen data
usage = dataset['Usage_']
data = dataset[usage == 'train']
data_unseen = dataset[usage == 'test']

In [8]:
data_unseen.head()

Unnamed: 0,AGE,RAVLT_immediate,AVDEL30MIN_neuro,AVDELTOT_neuro,TRAASCOR_neuro,TRABSCOR_neuro,CATANIMSC_neuro,GDTOTAL_gds,ANARTERR_neuro,LRHHC_n_long,Apoe4_,Subgroup_,Subgroup_num_,Usage_,PTGENDER,Gender_num_
6,80.4,30.0,1.0,7.0,49.0,168.0,13.0,0.0,17.0,0.003638,0.0,sMCI,0,test,Female,1
82,77.3,29.0,0.0,11.0,122.0,151.0,17.0,2.0,3.0,0.003343,1.0,cAD,1,test,Male,0
184,77.5,35.0,1.0,10.0,27.0,69.0,24.0,2.0,22.0,0.003149,1.0,cAD,1,test,Female,1
359,71.1,24.0,0.0,2.0,50.0,85.0,13.0,2.0,7.0,0.003729,0.0,cAD,1,test,Female,1
384,83.6,30.0,2.0,9.0,22.0,76.0,18.0,0.0,9.0,0.0037,0.0,cAD,1,test,Female,1


In [9]:
# Report (rows, columns) of the modeling and hold-out frames
print(f'Data for Modeling: {data.shape}')
print(f'Unseen Data For Predictions: {data_unseen.shape}')

Data for Modeling: (539, 16)
Unseen Data For Predictions: (139, 16)


# Load the best PyCaret model created in `3.0-pycaret_pipeline`-notebook

In [11]:
# Load the saved PyCaret pipeline/model from the results folder.
# NOTE(review): this assignment reuses the name `ensemble_model`, which the star-import from
# pycaret.classification also provides as a function. If the load fails (the saved output
# shows "ModuleNotFoundError: No module named 'catboost'"), the name still refers to that
# function, which is what the saved output of the next cell displays.
ensemble_model = load_model(RESULTS_DIR /'2COPY_best_blended_accuracy_top5')

Transformation Pipeline and Model Successfully Loaded


ModuleNotFoundError: No module named 'catboost'

In [12]:
ensemble_model

<function pycaret.classification.ensemble_model(estimator, method: str = 'Bagging', fold: Union[int, Any, NoneType] = None, n_estimators: int = 10, round: int = 4, choose_better: bool = False, optimize: str = 'Accuracy', fit_kwargs: Union[dict, NoneType] = None, groups: Union[str, Any, NoneType] = None, verbose: bool = True) -> Any>

# Using this model to predict on unseen test set

In [None]:
preds_ensemble = predict_model(ensemble_model, data=data_unseen)

In [None]:
preds_ensemble.head()

# Interpret results

## Find TP, FP, FN, TN

In [None]:
data_unseen.head()

In [None]:
preds_ensemble.head()

## Adding column `Ens_pred` for further comparison with predictions from Random Forest model (below)

In [None]:
# Label every test subject with its confusion-matrix cell for the ensemble model.
# Ground truth comes from 'Subgroup_' (sMCI = negative, cAD = positive),
# the model's prediction from 'Label' (0 = negative, 1 = positive).
is_smci = preds_ensemble['Subgroup_'] == 'sMCI'
is_cad = preds_ensemble['Subgroup_'] == 'cAD'
predicted_neg = preds_ensemble['Label'] == 0
predicted_pos = preds_ensemble['Label'] == 1

# Vectorised replacement for the row-by-row loop; rows matching none of the
# four combinations keep the empty-string placeholder, as before.
preds_ensemble['Ens_pred'] = np.select(
    [
        is_smci & predicted_neg,   # stable subject predicted stable
        is_smci & predicted_pos,   # stable subject predicted converter
        is_cad & predicted_neg,    # converter predicted stable
        is_cad & predicted_pos,    # converter predicted converter
    ],
    ['TN_', 'FP_', 'FN_', 'TP_'],
    default="",
)

In [None]:
# Visual inspection for minimum one of each prediction (aka TN_, FP_ FN_ & TP_)
# If 'Subgroup_' == 'sMCI' and 'Label' == 1, then 'Ens_pred' should be 'FP_'
preds_ensemble.head(11)

## Inspection of predictions

In [None]:
# Split the predictions by predicted class: Label 1 = positive, Label 0 = negative
predicted_label = preds_ensemble['Label']
p = preds_ensemble[predicted_label == 1]
n = preds_ensemble[predicted_label == 0]

In [None]:
p.head()

In [None]:
n.head()

In [None]:
# Number of test subjects in each confusion-matrix cell of the ensemble model
ens_outcome = preds_ensemble['Ens_pred']
tn = int((ens_outcome == 'TN_').sum())
fp = int((ens_outcome == 'FP_').sum())
fn = int((ens_outcome == 'FN_').sum())
tp = int((ens_outcome == 'TP_').sum())

In [None]:
# Misclassification rate = (FP + FN) / (TP + TN + FP + FN)
misclass_perc = (fp + fn) / (tp + tn + fp + fn)
# misclass_perc is a fraction between 0 and 1; the '%' format spec multiplies it by 100
# so the printed number really is a percentage.
print(f"Percentage of test set misclassified: {misclass_perc:.2%}")

In [None]:
# Accuracy = (TP + TN) / (TP + TN + FP + FN)
n_correct = tp + tn
n_total = tp + tn + fp + fn
accuracy = n_correct / n_total
print(f"Accuracyen for classification on test set: {accuracy}")

In [None]:
# Precision = TP / (TP + FP)
predicted_positives = tp + fp
# With no positive predictions the ratio is undefined: report NaN instead of raising ZeroDivisionError
precision = tp / predicted_positives if predicted_positives else float('nan')
print(precision)

In [None]:
# Sensitivity aka Recall (true positives / all actual positives) = TP / (TP + FN)
actual_positives = tp + fn
# With no actual positives the ratio is undefined: report NaN instead of raising ZeroDivisionError
recall = tp / actual_positives if actual_positives else float('nan')
print(recall)

In [None]:
# Sensitivity = TP / (TP + FN)  (same quantity as recall)
actual_positives = tp + fn
# With no actual positives the ratio is undefined: report NaN instead of raising ZeroDivisionError
sensitivity = tp / actual_positives if actual_positives else float('nan')
print(sensitivity)

In [None]:
# Specificity (true negatives / all actual negatives) = TN / (TN + FP)
actual_negatives = tn + fp
# With no actual negatives the ratio is undefined: report NaN instead of raising ZeroDivisionError
specificity = tn / actual_negatives if actual_negatives else float('nan')
print(specificity)

# Creating confusion matrix for Ensemble's prediction

In [None]:
# Ground truth (y_true): the numeric class column carried along in the prediction frame
y_test = preds_ensemble['Subgroup_num_']
y_test.head()

In [None]:
# Predicted class (y_pred): the ensemble's output stored in the 'Label' column
y_test_pred = preds_ensemble['Label']
y_test_pred.head()

In [None]:
# Confusion matrix for the test set (rows = true class, columns = predicted class).
# Passing the labels explicitly keeps the 2x2 [[TN, FP], [FN, TP]] layout even when
# one of the classes is absent from the predictions.
conf_matrix_test_ens = metrics.confusion_matrix(y_test, y_test_pred, labels=[0, 1])
conf_matrix_test_ens

In [None]:
# Percentage values of the confusion matrix relative to the test set length
conf_matrix_test_prc = conf_matrix_test_ens / y_test.shape[0] * 100

# Only RESULTS_DIR itself is created earlier in the notebook, so make sure the
# 'figs' sub-folder exists before asking the plotting helper to save into it.
figs_dir = RESULTS_DIR / 'figs'
figs_dir.mkdir(parents=True, exist_ok=True)

conf_mat_ens = mrfbl.plot_confusion_matrix_TEST_IR(conf_matrix_test_ens, conf_matrix_test_prc,
                                    file_name_number="K50", title="Ensemble model",
                                    file_name_prefix="3.1-ensemble-conf-matrix",
                                    save=True, results_dir=figs_dir)

---

## Load data with predictions from Random Forest for further comparison. 
### File: `results/2.0-random_forest-TEST-predictions.csv`
Loading data frame containing information about the Random Forest model's classification on test set. 

Prediction (i.e TN, FP, FN or TP) is specified by `CM_pred_`-column.

In [None]:
# Random Forest predictions on the test set; the first CSV column is used as the index
rf_predictions_csv = RESULTS_DIR / '2.0-random_forest-TEST-predictions.csv'
rf_pred = pd.read_csv(rf_predictions_csv, index_col=0)

In [None]:
rf_pred.shape

In [None]:
rf_pred.head()

In [None]:
# Keep only the Random Forest outcome column; all metadata is already in the ensemble predictions frame
rf_pred = rf_pred.loc[:, ["CM_pred_"]]

In [None]:
rf_pred.head()

In [None]:
# Put the Random Forest outcome column next to the ensemble predictions (pd.concat with axis=1
# aligns the rows on the index).
# Original note: the shape of the new frame should be 139x141 because 'Label' and 'Ensemble_pred'
# are added, while 'Subgroup_' already exists.
# NOTE(review): 141 columns does not obviously follow from the 16-column test frame shown earlier — confirm.
final_df = pd.concat([rf_pred, preds_ensemble], axis=1)
final_df.shape

In [None]:
final_df.head()

In [None]:
final_df.to_csv( RESULTS_DIR / '3.1-random_forest_n_ensemble_predictions.csv')

In [None]:
# Checking the confusion matrix from the Random Forest prediction:
# tally the outcomes stored in 'CM_pred_' in one vectorised pass.
rf_outcome_counts = final_df['CM_pred_'].value_counts()

# .get(..., 0) covers an outcome that never occurs; int() keeps plain Python ints
TN_teller = int(rf_outcome_counts.get('TN', 0))
FP_teller = int(rf_outcome_counts.get('FP', 0))
FN_teller = int(rf_outcome_counts.get('FN', 0))
TP_teller = int(rf_outcome_counts.get('TP', 0))

print("Classification labels from Random Forest prediction on test set:")
print(f"True Negatives: {TN_teller}")
print(f"False Positives: {FP_teller}")
print(f"False Negatives: {FN_teller}")
print(f"True Positives: {TP_teller}")

In [None]:
# Checking the confusion matrix from the ENSEMBLE prediction:
# tally the outcomes stored in 'Ens_pred' in one vectorised pass.
ens_outcome_counts = final_df['Ens_pred'].value_counts()

# .get(..., 0) covers an outcome that never occurs; int() keeps plain Python ints
tn_teller = int(ens_outcome_counts.get('TN_', 0))
fp_teller = int(ens_outcome_counts.get('FP_', 0))
fn_teller = int(ens_outcome_counts.get('FN_', 0))
tp_teller = int(ens_outcome_counts.get('TP_', 0))

print("Classification labels from ensemble model on test set:")
print(f"True Negatives: {tn_teller}")
print(f"False Positives: {fp_teller}")
print(f"False Negatives: {fn_teller}")
print(f"True Positives: {tp_teller}")

# Investigating the two models' overlap in misclassifications of **sMCI**

In [None]:
# Number of subjects that BOTH models scored as false positives (sMCI predicted as cAD)
both_fp = (final_df['Ens_pred'] == 'FP_') & (final_df['CM_pred_'] == 'FP')
fp_overlap = int(both_fp.sum())

print("*"*90)
print(f"Random Forest: misclassified {FP_teller} sMCI subjects as cAD.")
print(f"Ensemblet: misclassified {fp_teller} sMCI subjects as converters.")
print()
print(f"Of these misclassifications the models overlapped {fp_overlap} deltagere.")
print("*"*90)

### Identifying indexes for FP subjects where the models did not overlap  (sMCI --> FP)

In [None]:
# sMCI subjects misclassified as cAD (FP) by the ensemble only:
# the Random Forest classified these same subjects correctly (TN).
ens_only_fp = (final_df['Ens_pred'] == 'FP_') & (final_df['CM_pred_'] == 'TN')
indeksList_fp_ensemble = final_df.index[ens_only_fp].tolist()

print("*"*100)
print(f"Subjects with the {len(indeksList_fp_ensemble)} following indices {indeksList_fp_ensemble} were correctly")
print("classified by the RF and missclassified by the ensemble")
print("*"*100)

In [None]:
# sMCI subjects misclassified as cAD (FP) by the Random Forest only:
# the ensemble classified these same subjects correctly (TN).
rf_only_fp = (final_df['Ens_pred'] == 'TN_') & (final_df['CM_pred_'] == 'FP')
indeksList_fp_rf = final_df.index[rf_only_fp].tolist()

print("*"*100)
print(f"Subjects with the {len(indeksList_fp_rf)} following indices {indeksList_fp_rf} were correctly")
print("classified by the ensemble and miscassified by the Random Forest")
print("*"*100)

# Investigating the two models' overlap in misclassifications of **cAD**

In [None]:
# Number of subjects that BOTH models scored as false negatives (cAD predicted as sMCI)
both_fn = (final_df['Ens_pred'] == 'FN_') & (final_df['CM_pred_'] == 'FN')
fn_overlap = int(both_fn.sum())

print("*"*90)
print(f"Random Forest: misclassified {FN_teller} cAD subjects as stabile.")
print(f"Ensemblet: misclassified {fn_teller} cAD as stabile.")
print()
print(f"Of these misclassifications, the models overlapped on {fn_overlap} subjects.")
print("*"*90)

### Identifying indexes for FN subjects where the models did not overlap (cAD --> FN)

In [None]:
# Subjects misclassified as FN by the ensemble, and correctly classified as TP by the random forest
ens_only_fn = (final_df['Ens_pred'] == 'FN_') & (final_df['CM_pred_'] == 'TP')
indeksList_fn_ensemble = final_df.index[ens_only_fn].tolist()

print("*"*100)

print(f"Subjects with the {len(indeksList_fn_ensemble)} following indecies {indeksList_fn_ensemble} were correctly")
print("classified by RF and misclassified by the ensemblet")
print("*"*100)

In [None]:
# Subjects misclassified as FN by the random forest, and correctly classified as TP by the ensemble
rf_only_fn = (final_df['Ens_pred'] == 'TP_') & (final_df['CM_pred_'] == 'FN')
indeksList_fn_rf = final_df.index[rf_only_fn].tolist()

print("*"*100)
print(f"Subjects with the {len(indeksList_fn_rf)} following {indeksList_fn_rf} were ble correctly")
print("classified av ensemblet, men feilaktig klassifisert av RF")
print("*"*100)