---

# Process the Performance Report

---

In [32]:
# package imports go here
import pandas as pd
import numpy as np
import fastparquet as fp
import os
import sys
import pickle
import matplotlib.pyplot as plt
import importlib
import config
import time

sys.path.insert(1, config.package_path)
import ml_analysis as mlanlys

In [33]:
start_time = time.time()

In [34]:
# Path to performance report
optimization_report         = config.optimization_report

---

## 1. Read Optimization report

---

In [35]:
if os.path.exists(optimization_report):
    # Load Performance Report
    with open(optimization_report, 'rb') as file: perf_report = pickle.load(file)
    print(f"The file {optimization_report} exists and the Performance Report was read successfully")
else:
    print(f"******************************************************")
    print(f"The file: '{optimization_report}' DOES NOT EXIST")
    print(f"******************************************************")

The file optimize/optimization_report.pkl exists and the Performance Report was read successfully


---

## 2. Write the Performance Report

---

In [36]:

perf_datasets_in = perf_report.keys()

perf_datasets_in

dict_keys(['RandomizedSearchCV_binary_standard_undersample_DecisionTreeClassifier', 'RandomizedSearchCV_binary_standard_undersample_LogisticRegression'])

#### 2.1 Modify the order of the reported datasets

---

In [37]:
# Use this cell to re-order the datasets for the report

perf_datasets = perf_datasets_in

In [38]:
# Remove bad datasets if needed
#del perf_report['Base Dataset']

#### 2.2 Print the report

---

In [39]:
def print_performance_report(perf_report):
    # Print header and then every performance report in the reports dictionary

    print(f"**********************************************************")
    print(f"Diabetes Predictions Optimization Summary Report")
    print(f"**********************************************************")

    print(f"----------------------------------------")
    print(f"Report Summary")
    print(f"----------------------------------------")

    report_keys = list(perf_report.keys())
    report_keys.sort()
    
    for report in report_keys:
        print(f"Performance for: {report}")

    print(f"----------------------------------------")

    for report in report_keys:
        report_dict = perf_report[report]
        dataset_size = report_dict['dataset_size']
        report_df = report_dict['report']
        report_df.rename(columns={'new_column':'dataset'}, inplace=True)

        print(f"----------------------------------------")
        print(f"Performance for: {report}")
        print(f"Dataset Size:    {dataset_size[0]} Rows, {dataset_size[1]} Columns")
        print(f"----------------------------------------")
        print(f"{report_df.to_string(index=False)}")

In [40]:
print_performance_report(perf_report)

**********************************************************
Diabetes Predictions Optimization Summary Report
**********************************************************
----------------------------------------
Report Summary
----------------------------------------
Performance for: RandomizedSearchCV_binary_standard_undersample_DecisionTreeClassifier
Performance for: RandomizedSearchCV_binary_standard_undersample_LogisticRegression
----------------------------------------
----------------------------------------
Performance for: RandomizedSearchCV_binary_standard_undersample_DecisionTreeClassifier
Dataset Size:    2 Rows, 16 Columns
----------------------------------------
                    dataset                  model        slice  score  balanced_accuracy  roc_auc_score  Mean Squared Error  Accuracy  Precision  Recall  F1-score  Specificity  False Positive Rate  Matthews Correlation Coefficient          Optimizer                                                                      

In [41]:
# Generate the Performance Report and send prints to osc.stdout
with mlanlys.OutStreamCapture() as osc:
    print_performance_report(perf_report)

In [42]:
# osc.stdout contains the details of the performance report
# write the performance report to the performance_report_text_file
with open(performance_report_text_file, "w") as file:
    file.write(osc.stdout)

NameError: name 'performance_report_text_file' is not defined

In [None]:
# Display the performance report here:
print(osc.stdout)

**********************************************************
Diabetes Predictions Performance Summary Report
**********************************************************
----------------------------------------
Report Summary
----------------------------------------
Performance for: 1 Base Dataset
Performance for: 2.0 StandardScaler Dataset
Performance for: 2.1 MinMaxScaler Dataset
Performance for: 3 Binary Dataset
Performance for: 4 RandomUnderSampler Dataset
Performance for: 5 RandomOverSampler Dataset
Performance for: 6 ClusterCentroids Dataset
Performance for: 7 SMOTE Dataset
Performance for: 8 SMOTEENN Dataset
----------------------------------------
----------------------------------------
Performance for: 1 Base Dataset
Dataset Size:    247076 Rows, 37 Columns
----------------------------------------
       dataset                      model slice  score  balanced_accuracy  roc_auc_score  Mean Squared Error  Accuracy  Precision  Recall  F1-score  Specificity  False Positive Rate  Ma

In [None]:
print(f"Completed: Execution Time %s seconds:" % round((time.time() - start_time),2) )

Completed: Execution Time 0.15 seconds:


## 3. Merge Performance Metrics into a single dataframe

In [None]:
# Concatenate all the Reports into one dataframe
df = pd.DataFrame()

report_keys = list(perf_report.keys())
report_keys.sort()

for report in report_keys:
    report_dict = perf_report[report]
    report_df = report_dict['report']
    df = pd.concat([df, report_df], ignore_index=True)

In [None]:
df

Unnamed: 0,dataset,model,slice,score,balanced_accuracy,roc_auc_score,Mean Squared Error,Accuracy,Precision,Recall,F1-score,Specificity,False Positive Rate,Matthews Correlation Coefficient
0,1 Base Dataset,KNeighborsClassifier,Train,0.8743,0.4599,0.9362,0.4353,0.8959,0.7259,0.3544,0.4763,0.9794,0.0206,0.4599
1,1 Base Dataset,KNeighborsClassifier,Test,0.8135,0.3548,0.5604,0.6743,0.8350,0.2509,0.1162,0.1588,0.9463,0.0537,0.0882
2,1 Base Dataset,DecisionTreeClassifier,Train,1.0000,1.0000,1.0000,0.0000,1.0000,1.0000,1.0000,1.0000,1.0000,0.0000,
3,1 Base Dataset,DecisionTreeClassifier,Test,0.7672,0.4079,0.5714,0.7853,0.8046,0.2919,0.3211,0.3058,0.8794,0.1206,0.1927
4,1 Base Dataset,RandomForestClassifier,Train,1.0000,0.9997,1.0000,0.0001,1.0000,1.0000,0.9999,0.9999,1.0000,0.0000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121,8 SMOTEENN Dataset,GradientBoostingClassifier,Test,0.8723,0.5744,0.8356,0.1277,0.8723,0.5810,0.1674,0.2599,0.9813,0.0187,0.2630
122,8 SMOTEENN Dataset,AdaBoostClassifier,Train,0.8707,0.5822,0.8348,0.1293,0.8707,0.5468,0.1884,0.2802,0.9759,0.0241,0.2668
123,8 SMOTEENN Dataset,AdaBoostClassifier,Test,0.8715,0.5849,0.8325,0.1285,0.8715,0.5580,0.1935,0.2874,0.9763,0.0237,0.2748
124,8 SMOTEENN Dataset,LogisticRegression,Train,0.8694,0.5648,0.8252,0.1306,0.8694,0.5404,0.1493,0.2340,0.9804,0.0196,0.2340


## 4. Filter and sort test results

In [None]:
def rank_results(df, maximize, limits):
    # Sort Dataframe by column to maximize

    # Create dataframe with only the test metrics
    test_df = df[df['slice'] == 'Test']

    limit_df = test_df.copy()

    # limit the rows based on the values in the limits dictionary above
    for limit in limits:
        # print(limit)
        limit_value = limits[limit]
        limit_df = limit_df[limit_df[limit]>limit_value]

    # sort by column_to_maximize fro highest to lowest
    sorted_df = limit_df.sort_values(by=maximize, ascending=False, inplace=False)

    return sorted_df


#### 4.1 Filter examples

In [None]:
maximize = 'Precision'
limits = {
    'balanced_accuracy': .5,
    'Accuracy': .8,
    'roc_auc_score': .6
}

maximize = 'Accuracy'
limits = {
    'balanced_accuracy': .5,
    'Precision': .5,
    'roc_auc_score': .6
}

maximize = 'F1-score'
limits = {
    'Accuracy': .8,
    'Precision': .1,
    'roc_auc_score': .1
}

#### Filter - Accuracy

In [None]:
df.columns

Index(['dataset', 'model', 'slice', 'score', 'balanced_accuracy',
       'roc_auc_score', 'Mean Squared Error', 'Accuracy', 'Precision',
       'Recall', 'F1-score', 'Specificity', 'False Positive Rate',
       'Matthews Correlation Coefficient'],
      dtype='object')

In [None]:
df_print_len = 20

In [None]:
# Sort Dataframe by column to maximize
maximize = 'Accuracy'
limits = {
    'balanced_accuracy': .5,
    'Precision': .2
}

sorted_df1 = rank_results(df, maximize, limits).head(df_print_len)

sorted_df1

Unnamed: 0,dataset,model,slice,score,balanced_accuracy,roc_auc_score,Mean Squared Error,Accuracy,Precision,Recall,F1-score,Specificity,False Positive Rate,Matthews Correlation Coefficient
51,3 Binary Dataset,GradientBoostingClassifier,Test,0.8728,0.5738,0.8377,0.1272,0.8728,0.5587,0.1678,0.2581,0.9799,0.0201,0.2562
53,3 Binary Dataset,AdaBoostClassifier,Test,0.8727,0.5901,0.8344,0.1273,0.8727,0.5455,0.2063,0.2994,0.9739,0.0261,0.2801
121,8 SMOTEENN Dataset,GradientBoostingClassifier,Test,0.8723,0.5744,0.8356,0.1277,0.8723,0.581,0.1674,0.2599,0.9813,0.0187,0.263
47,3 Binary Dataset,RandomForestClassifier,Test,0.8719,0.5576,0.8262,0.1281,0.8719,0.5605,0.1308,0.2121,0.9844,0.0156,0.2258
123,8 SMOTEENN Dataset,AdaBoostClassifier,Test,0.8715,0.5849,0.8325,0.1285,0.8715,0.558,0.1935,0.2874,0.9763,0.0237,0.2748
117,8 SMOTEENN Dataset,RandomForestClassifier,Test,0.8711,0.557,0.8249,0.1289,0.8711,0.587,0.128,0.2102,0.9861,0.0139,0.2307
55,3 Binary Dataset,LogisticRegression,Test,0.8706,0.5674,0.8259,0.1294,0.8706,0.531,0.1556,0.2407,0.9791,0.0209,0.2365
49,3 Binary Dataset,ExtraTreesClassifier,Test,0.8699,0.5614,0.8163,0.1301,0.8699,0.5232,0.1426,0.2241,0.9803,0.0197,0.2233
119,8 SMOTEENN Dataset,ExtraTreesClassifier,Test,0.8697,0.5611,0.8189,0.1303,0.8697,0.5543,0.1395,0.2229,0.9827,0.0173,0.2305
125,8 SMOTEENN Dataset,LogisticRegression,Test,0.869,0.5659,0.8263,0.131,0.869,0.5388,0.1519,0.237,0.9799,0.0201,0.2355


In [None]:
maximize = 'Precision'
limits = {
    'balanced_accuracy': .5,
    'Accuracy': .8,
    'roc_auc_score': .5
}

sorted_df2 = rank_results(df, maximize, limits).head(df_print_len)

sorted_df2

Unnamed: 0,dataset,model,slice,score,balanced_accuracy,roc_auc_score,Mean Squared Error,Accuracy,Precision,Recall,F1-score,Specificity,False Positive Rate,Matthews Correlation Coefficient
117,8 SMOTEENN Dataset,RandomForestClassifier,Test,0.8711,0.557,0.8249,0.1289,0.8711,0.587,0.128,0.2102,0.9861,0.0139,0.2307
121,8 SMOTEENN Dataset,GradientBoostingClassifier,Test,0.8723,0.5744,0.8356,0.1277,0.8723,0.581,0.1674,0.2599,0.9813,0.0187,0.263
47,3 Binary Dataset,RandomForestClassifier,Test,0.8719,0.5576,0.8262,0.1281,0.8719,0.5605,0.1308,0.2121,0.9844,0.0156,0.2258
51,3 Binary Dataset,GradientBoostingClassifier,Test,0.8728,0.5738,0.8377,0.1272,0.8728,0.5587,0.1678,0.2581,0.9799,0.0201,0.2562
123,8 SMOTEENN Dataset,AdaBoostClassifier,Test,0.8715,0.5849,0.8325,0.1285,0.8715,0.558,0.1935,0.2874,0.9763,0.0237,0.2748
119,8 SMOTEENN Dataset,ExtraTreesClassifier,Test,0.8697,0.5611,0.8189,0.1303,0.8697,0.5543,0.1395,0.2229,0.9827,0.0173,0.2305
53,3 Binary Dataset,AdaBoostClassifier,Test,0.8727,0.5901,0.8344,0.1273,0.8727,0.5455,0.2063,0.2994,0.9739,0.0261,0.2801
125,8 SMOTEENN Dataset,LogisticRegression,Test,0.869,0.5659,0.8263,0.131,0.869,0.5388,0.1519,0.237,0.9799,0.0201,0.2355
77,5 RandomOverSampler Dataset,ExtraTreesClassifier,Test,0.8685,0.552,0.8191,0.1315,0.8685,0.538,0.1198,0.196,0.9841,0.0159,0.2081
55,3 Binary Dataset,LogisticRegression,Test,0.8706,0.5674,0.8259,0.1294,0.8706,0.531,0.1556,0.2407,0.9791,0.0209,0.2365


In [None]:
# Sort Dataframe by column to maximize
maximize = 'F1-score'
limits = {
    'Accuracy': .5,
    'Precision': .1,
    'roc_auc_score': .1
}


sorted_df3 = rank_results(df, maximize, limits).head(df_print_len)
sorted_df3

Unnamed: 0,dataset,model,slice,score,balanced_accuracy,roc_auc_score,Mean Squared Error,Accuracy,Precision,Recall,F1-score,Specificity,False Positive Rate,Matthews Correlation Coefficient
81,5 RandomOverSampler Dataset,AdaBoostClassifier,Test,0.7366,0.7572,0.8354,0.2634,0.7366,0.3091,0.7853,0.4436,0.729,0.271,0.3696
79,5 RandomOverSampler Dataset,GradientBoostingClassifier,Test,0.7271,0.7624,0.8398,0.2729,0.7271,0.3045,0.8107,0.4427,0.7142,0.2858,0.3731
67,4 RandomUnderSampler Dataset,AdaBoostClassifier,Test,0.7372,0.7547,0.8328,0.2628,0.7372,0.3052,0.7786,0.4385,0.7309,0.2691,0.3648
83,5 RandomOverSampler Dataset,LogisticRegression,Test,0.7299,0.7523,0.8276,0.2701,0.7299,0.3028,0.7828,0.4367,0.7217,0.2783,0.3611
65,4 RandomUnderSampler Dataset,GradientBoostingClassifier,Test,0.728,0.7574,0.8356,0.272,0.728,0.3,0.7972,0.4359,0.7175,0.2825,0.365
109,7 SMOTE Dataset,AdaBoostClassifier,Test,0.8011,0.704,0.8095,0.1989,0.8011,0.3479,0.5719,0.4326,0.8362,0.1638,0.3352
61,4 RandomUnderSampler Dataset,RandomForestClassifier,Test,0.7223,0.7559,0.8296,0.2777,0.7223,0.2958,0.8015,0.4321,0.7103,0.2897,0.3614
69,4 RandomUnderSampler Dataset,LogisticRegression,Test,0.7311,0.749,0.8247,0.2689,0.7311,0.299,0.7734,0.4313,0.7246,0.2754,0.3555
111,7 SMOTE Dataset,LogisticRegression,Test,0.7298,0.7436,0.8216,0.2702,0.7298,0.2975,0.7623,0.428,0.7249,0.2751,0.3488
63,4 RandomUnderSampler Dataset,ExtraTreesClassifier,Test,0.7181,0.7486,0.8235,0.2819,0.7181,0.2906,0.79,0.4249,0.7072,0.2928,0.3507


In [None]:
# Find all rows that exist in all three filtered results.
df_columns = list(sorted_df2.columns)
merged_df = pd.merge(sorted_df1, sorted_df2, on=df_columns, how='inner')
merge_df2 = pd.merge(merged_df, sorted_df3, on=df_columns, how='inner')

merge_df2

Unnamed: 0,dataset,model,slice,score,balanced_accuracy,roc_auc_score,Mean Squared Error,Accuracy,Precision,Recall,F1-score,Specificity,False Positive Rate,Matthews Correlation Coefficient
0,5 RandomOverSampler Dataset,RandomForestClassifier,Test,0.866,0.6017,0.828,0.134,0.866,0.4976,0.2409,0.3246,0.9625,0.0375,0.2813
1,7 SMOTE Dataset,RandomForestClassifier,Test,0.8656,0.5999,0.8212,0.1344,0.8656,0.4864,0.2383,0.3199,0.9615,0.0385,0.275
2,7 SMOTE Dataset,ExtraTreesClassifier,Test,0.8624,0.6092,0.8163,0.1376,0.8624,0.4665,0.2647,0.3378,0.9537,0.0463,0.2809
3,7 SMOTE Dataset,GradientBoostingClassifier,Test,0.8499,0.6597,0.8216,0.1501,0.8499,0.4292,0.401,0.4146,0.9185,0.0815,0.3288
