---

# Process the Performance Report

---

In [1]:
# package imports go here
import pandas as pd
import numpy as np
import fastparquet as fp
import os
import sys
import pickle
import matplotlib.pyplot as plt
import importlib
import config
import time

sys.path.insert(1, config.package_path)
import ml_analysis as mlanlys

In [2]:
start_time = time.time()

In [3]:
# Paths taken from config: the pickled optimization report to read, and the
# text file the rendered report is written to later in this notebook.
optimization_report         = config.optimization_report
optimization_report_text_file = config.optimization_report_text_file

---

## 1. Read the Optimization Report

---

In [4]:
# Load the pickled optimization report into perf_report.
# NOTE(review): pickle.load can execute arbitrary code from the file; only point
# config.optimization_report at a file produced by a trusted run.
if os.path.exists(optimization_report):
    # Load Performance Report
    with open(optimization_report, 'rb') as file:
        perf_report = pickle.load(file)
    print(f"The file {optimization_report} exists and the Performance Report was read successfully")
else:
    print(f"******************************************************")
    print(f"The file: '{optimization_report}' DOES NOT EXIST")
    print(f"******************************************************")
    # Stop here: every later cell needs perf_report, so carrying on would only
    # fail further down with a less helpful NameError.
    raise FileNotFoundError(f"Optimization report not found: '{optimization_report}'")

The file optimize/optimization_report.pkl exists and the Performance Report was read successfully


---

## 2. Write the Performance Report

---

In [5]:

# Names of the individual reports stored in perf_report (a dict keys view).
perf_datasets_in = perf_report.keys()

# Show the names as the cell output.
perf_datasets_in

dict_keys(['RandomizedSearchCV_binary_standard_undersample_DecisionTreeClassifier', 'RandomizedSearchCV_binary_standard_undersample_LogisticRegression', 'RandomizedSearchCV_binary_standard_undersample_GradientBoostingClassifier', 'RandomizedSearchCV_binary_standard_undersample_AdaBoostClassifier', 'RandomizedSearchCV_binary_standard_undersample_RandomForestClassifier'])

#### 2.1 Modify the order of the reported datasets

---

In [6]:
# Use this cell to re-order the datasets for the report.
# Currently a pass-through: perf_datasets is the same keys view as perf_datasets_in.
# NOTE(review): print_performance_report sorts the report keys itself, so an
# order chosen here does not change the printed report.

perf_datasets = perf_datasets_in

In [7]:
# Remove bad datasets if needed
#del perf_report['Base Dataset']

#### 2.2 Print the report

---

In [8]:
def print_performance_report(perf_report):
    """Print the optimization summary: a title, an index of reports, then each report.

    Args:
        perf_report: mapping of report name -> dict holding
            'dataset_size' (indexable; element 0 is printed as rows, element 1
            as columns) and 'report' (a pandas DataFrame of metrics).

    Returns:
        None. Everything is written with print().

    Side effects:
        A column named 'new_column' in a report's DataFrame is renamed to
        'dataset' in place, so the frames held by perf_report are modified.
    """
    heavy_rule = "**********************************************************"
    light_rule = "----------------------------------------"

    # Title banner
    print(heavy_rule)
    print("Diabetes Predictions Optimization Summary Report")
    print(heavy_rule)

    # Index of the reports, in alphabetical order
    print(light_rule)
    print("Report Summary")
    print(light_rule)

    ordered_names = sorted(perf_report.keys())

    for name in ordered_names:
        print(f"Performance for: {name}")

    print(light_rule)

    # One section per report, in the same alphabetical order
    for name in ordered_names:
        entry = perf_report[name]
        size = entry['dataset_size']
        metrics = entry['report']
        # In-place on purpose: the renamed column stays visible to later users of perf_report
        metrics.rename(columns={'new_column':'dataset'}, inplace=True)

        print(light_rule)
        print(f"Performance for: {name}")
        print(f"Dataset Size:    {size[0]} Rows, {size[1]} Columns")
        print(light_rule)
        print(metrics.to_string(index=False))

In [9]:
print_performance_report(perf_report)

**********************************************************
Diabetes Predictions Optimization Summary Report
**********************************************************
----------------------------------------
Report Summary
----------------------------------------
Performance for: RandomizedSearchCV_binary_standard_undersample_AdaBoostClassifier
Performance for: RandomizedSearchCV_binary_standard_undersample_DecisionTreeClassifier
Performance for: RandomizedSearchCV_binary_standard_undersample_GradientBoostingClassifier
Performance for: RandomizedSearchCV_binary_standard_undersample_LogisticRegression
Performance for: RandomizedSearchCV_binary_standard_undersample_RandomForestClassifier
----------------------------------------
----------------------------------------
Performance for: RandomizedSearchCV_binary_standard_undersample_AdaBoostClassifier
Dataset Size:    2 Rows, 16 Columns
----------------------------------------
                    dataset              model        slice  sc

In [10]:
# Generate the Performance Report and send prints to osc.stdout
# OutStreamCapture comes from the project's ml_analysis module; presumably it
# redirects printed output for the duration of the with-block (confirm against
# that module). The captured text is read back from osc.stdout afterwards.
with mlanlys.OutStreamCapture() as osc:
    print_performance_report(perf_report)

In [12]:
# osc.stdout contains the details of the performance report
# write it to the file named by optimization_report_text_file
# ("w" mode replaces any existing file at that path)
with open(optimization_report_text_file, "w") as file:
    file.write(osc.stdout)

In [None]:
# Report wall-clock time since start_time was recorded, rounded to 2 decimals
print(f"Completed: Execution Time {round(time.time() - start_time, 2)} seconds:")

Completed: Execution Time 0.15 seconds:


## 3. Merge Performance Metrics into a single dataframe

In [14]:
# Concatenate all the Reports into one dataframe
df = pd.DataFrame()

report_keys = list(perf_report.keys())
report_keys.sort()

for report in report_keys:
    report_dict = perf_report[report]
    report_df = report_dict['report']
    df = pd.concat([df, report_df], ignore_index=True)

In [15]:
df

Unnamed: 0,dataset,model,slice,score,balanced_accuracy,roc_auc_score,Mean Squared Error,Accuracy,Precision,Recall,F1-score,Specificity,False Positive Rate,Matthews Correlation Coefficient,Optimizer,best_parameters
0,binary_standard_undersample,AdaBoostClassifier,un-optimized,0.7298,0.7502,0.8286,0.2702,0.7298,0.3051,0.7782,0.4383,0.7222,0.2778,0.3601,RandomizedSearchCV,
1,binary_standard_undersample,AdaBoostClassifier,optimized,0.7386,0.7572,0.8347,0.2614,0.7386,0.3137,0.7829,0.4479,0.7316,0.2684,0.3722,RandomizedSearchCV,"{'n_estimators': 200, 'learning_rate': 1.0, 'e..."
2,binary_standard_undersample,DecisionTreeClassifier,un-optimized,0.6662,0.6606,0.6606,0.3338,0.6662,0.2357,0.6529,0.3464,0.6683,0.3317,0.227,RandomizedSearchCV,
3,binary_standard_undersample,DecisionTreeClassifier,optimized,0.7162,0.7327,0.7964,0.2838,0.7162,0.2899,0.7552,0.419,0.7101,0.2899,0.3333,RandomizedSearchCV,"{'min_samples_split': 5, 'min_samples_leaf': 2..."
4,binary_standard_undersample,GradientBoostingClassifier,un-optimized,0.7294,0.7593,0.8366,0.2706,0.7294,0.308,0.8003,0.4448,0.7182,0.2818,0.3716,RandomizedSearchCV,
5,binary_standard_undersample,GradientBoostingClassifier,optimized,0.7291,0.7598,0.8382,0.2709,0.7291,0.308,0.8019,0.4451,0.7177,0.2823,0.3722,RandomizedSearchCV,"{'subsample': 0.6346103310437, 'n_estimators':..."
6,binary_standard_undersample,LogisticRegression,un-optimized,0.7321,0.7502,0.8256,0.2679,0.7321,0.3066,0.7751,0.4394,0.7253,0.2747,0.3609,RandomizedSearchCV,
7,binary_standard_undersample,LogisticRegression,optimized,0.7319,0.75,0.8257,0.2681,0.7319,0.3065,0.7748,0.4392,0.7252,0.2748,0.3606,RandomizedSearchCV,"{'C': 0.027808522124762813, 'penalty': 'l2', '..."
8,binary_standard_undersample,RandomForestClassifier,un-optimized,0.7214,0.7551,0.8301,0.2786,0.7214,0.301,0.8014,0.4376,0.7089,0.2911,0.3635,RandomizedSearchCV,
9,binary_standard_undersample,RandomForestClassifier,optimized,0.7171,0.7596,0.8346,0.2829,0.7171,0.2998,0.8178,0.4388,0.7013,0.2987,0.368,RandomizedSearchCV,"{'n_estimators': 982, 'min_samples_split': 10,..."


## 4. Filter and sort test results

In [23]:
def rank_results(df, maximize, limits):
    """Filter rows by minimum metric values, then rank them by one metric.

    Args:
        df: DataFrame of metrics; it is copied, not modified.
        maximize: column (or list of columns) to sort by, highest first.
        limits: dict mapping column name -> threshold. A row is kept only
            if its value is strictly greater than every threshold.

    Returns:
        A new DataFrame holding the surviving rows sorted from highest to
        lowest on `maximize`, with the original index labels preserved.
    """
    candidates = df.copy()

    # Apply each threshold in turn; a row must pass all of them
    for column, threshold in limits.items():
        above_threshold = candidates[column] > threshold
        candidates = candidates.loc[above_threshold]

    # Highest value of the chosen metric first
    return candidates.sort_values(by=maximize, ascending=False)


#### 4.1 Filter examples

#### Filter - Accuracy

In [18]:
df.columns

Index(['dataset', 'model', 'slice', 'score', 'balanced_accuracy',
       'roc_auc_score', 'Mean Squared Error', 'Accuracy', 'Precision',
       'Recall', 'F1-score', 'Specificity', 'False Positive Rate',
       'Matthews Correlation Coefficient', 'Optimizer', 'best_parameters'],
      dtype='object')

In [19]:
df_print_len = 20

In [24]:
# Rank by Accuracy (highest first), keeping only rows whose balanced_accuracy
# and Precision are strictly above the limits below.
maximize = 'Accuracy'
limits = {
    'balanced_accuracy': .2,
    'Precision': .1
}

# Keep at most df_print_len rows of the ranking for display
sorted_df1 = rank_results(df, maximize, limits).head(df_print_len)

sorted_df1

Unnamed: 0,dataset,model,slice,score,balanced_accuracy,roc_auc_score,Mean Squared Error,Accuracy,Precision,Recall,F1-score,Specificity,False Positive Rate,Matthews Correlation Coefficient,Optimizer,best_parameters
1,binary_standard_undersample,AdaBoostClassifier,optimized,0.7386,0.7572,0.8347,0.2614,0.7386,0.3137,0.7829,0.4479,0.7316,0.2684,0.3722,RandomizedSearchCV,"{'n_estimators': 200, 'learning_rate': 1.0, 'e..."
6,binary_standard_undersample,LogisticRegression,un-optimized,0.7321,0.7502,0.8256,0.2679,0.7321,0.3066,0.7751,0.4394,0.7253,0.2747,0.3609,RandomizedSearchCV,
7,binary_standard_undersample,LogisticRegression,optimized,0.7319,0.75,0.8257,0.2681,0.7319,0.3065,0.7748,0.4392,0.7252,0.2748,0.3606,RandomizedSearchCV,"{'C': 0.027808522124762813, 'penalty': 'l2', '..."
0,binary_standard_undersample,AdaBoostClassifier,un-optimized,0.7298,0.7502,0.8286,0.2702,0.7298,0.3051,0.7782,0.4383,0.7222,0.2778,0.3601,RandomizedSearchCV,
4,binary_standard_undersample,GradientBoostingClassifier,un-optimized,0.7294,0.7593,0.8366,0.2706,0.7294,0.308,0.8003,0.4448,0.7182,0.2818,0.3716,RandomizedSearchCV,
5,binary_standard_undersample,GradientBoostingClassifier,optimized,0.7291,0.7598,0.8382,0.2709,0.7291,0.308,0.8019,0.4451,0.7177,0.2823,0.3722,RandomizedSearchCV,"{'subsample': 0.6346103310437, 'n_estimators':..."
8,binary_standard_undersample,RandomForestClassifier,un-optimized,0.7214,0.7551,0.8301,0.2786,0.7214,0.301,0.8014,0.4376,0.7089,0.2911,0.3635,RandomizedSearchCV,
9,binary_standard_undersample,RandomForestClassifier,optimized,0.7171,0.7596,0.8346,0.2829,0.7171,0.2998,0.8178,0.4388,0.7013,0.2987,0.368,RandomizedSearchCV,"{'n_estimators': 982, 'min_samples_split': 10,..."
3,binary_standard_undersample,DecisionTreeClassifier,optimized,0.7162,0.7327,0.7964,0.2838,0.7162,0.2899,0.7552,0.419,0.7101,0.2899,0.3333,RandomizedSearchCV,"{'min_samples_split': 5, 'min_samples_leaf': 2..."
2,binary_standard_undersample,DecisionTreeClassifier,un-optimized,0.6662,0.6606,0.6606,0.3338,0.6662,0.2357,0.6529,0.3464,0.6683,0.3317,0.227,RandomizedSearchCV,


In [26]:
# Rank by Precision (highest first), keeping only rows whose balanced_accuracy,
# Accuracy and roc_auc_score are strictly above the limits below.
maximize = 'Precision'
limits = {
    'balanced_accuracy': .5,
    'Accuracy': .6,
    'roc_auc_score': .5
}

# Keep at most df_print_len rows of the ranking for display
sorted_df2 = rank_results(df, maximize, limits).head(df_print_len)

sorted_df2

Unnamed: 0,dataset,model,slice,score,balanced_accuracy,roc_auc_score,Mean Squared Error,Accuracy,Precision,Recall,F1-score,Specificity,False Positive Rate,Matthews Correlation Coefficient,Optimizer,best_parameters
1,binary_standard_undersample,AdaBoostClassifier,optimized,0.7386,0.7572,0.8347,0.2614,0.7386,0.3137,0.7829,0.4479,0.7316,0.2684,0.3722,RandomizedSearchCV,"{'n_estimators': 200, 'learning_rate': 1.0, 'e..."
4,binary_standard_undersample,GradientBoostingClassifier,un-optimized,0.7294,0.7593,0.8366,0.2706,0.7294,0.308,0.8003,0.4448,0.7182,0.2818,0.3716,RandomizedSearchCV,
5,binary_standard_undersample,GradientBoostingClassifier,optimized,0.7291,0.7598,0.8382,0.2709,0.7291,0.308,0.8019,0.4451,0.7177,0.2823,0.3722,RandomizedSearchCV,"{'subsample': 0.6346103310437, 'n_estimators':..."
6,binary_standard_undersample,LogisticRegression,un-optimized,0.7321,0.7502,0.8256,0.2679,0.7321,0.3066,0.7751,0.4394,0.7253,0.2747,0.3609,RandomizedSearchCV,
7,binary_standard_undersample,LogisticRegression,optimized,0.7319,0.75,0.8257,0.2681,0.7319,0.3065,0.7748,0.4392,0.7252,0.2748,0.3606,RandomizedSearchCV,"{'C': 0.027808522124762813, 'penalty': 'l2', '..."
0,binary_standard_undersample,AdaBoostClassifier,un-optimized,0.7298,0.7502,0.8286,0.2702,0.7298,0.3051,0.7782,0.4383,0.7222,0.2778,0.3601,RandomizedSearchCV,
8,binary_standard_undersample,RandomForestClassifier,un-optimized,0.7214,0.7551,0.8301,0.2786,0.7214,0.301,0.8014,0.4376,0.7089,0.2911,0.3635,RandomizedSearchCV,
9,binary_standard_undersample,RandomForestClassifier,optimized,0.7171,0.7596,0.8346,0.2829,0.7171,0.2998,0.8178,0.4388,0.7013,0.2987,0.368,RandomizedSearchCV,"{'n_estimators': 982, 'min_samples_split': 10,..."
3,binary_standard_undersample,DecisionTreeClassifier,optimized,0.7162,0.7327,0.7964,0.2838,0.7162,0.2899,0.7552,0.419,0.7101,0.2899,0.3333,RandomizedSearchCV,"{'min_samples_split': 5, 'min_samples_leaf': 2..."
2,binary_standard_undersample,DecisionTreeClassifier,un-optimized,0.6662,0.6606,0.6606,0.3338,0.6662,0.2357,0.6529,0.3464,0.6683,0.3317,0.227,RandomizedSearchCV,


In [27]:
# Rank by F1-score (highest first), keeping only rows whose Accuracy,
# Precision and roc_auc_score are strictly above the limits below.
maximize = 'F1-score'
limits = {
    'Accuracy': .5,
    'Precision': .1,
    'roc_auc_score': .1
}


# Keep at most df_print_len rows of the ranking for display
sorted_df3 = rank_results(df, maximize, limits).head(df_print_len)
sorted_df3

Unnamed: 0,dataset,model,slice,score,balanced_accuracy,roc_auc_score,Mean Squared Error,Accuracy,Precision,Recall,F1-score,Specificity,False Positive Rate,Matthews Correlation Coefficient,Optimizer,best_parameters
1,binary_standard_undersample,AdaBoostClassifier,optimized,0.7386,0.7572,0.8347,0.2614,0.7386,0.3137,0.7829,0.4479,0.7316,0.2684,0.3722,RandomizedSearchCV,"{'n_estimators': 200, 'learning_rate': 1.0, 'e..."
5,binary_standard_undersample,GradientBoostingClassifier,optimized,0.7291,0.7598,0.8382,0.2709,0.7291,0.308,0.8019,0.4451,0.7177,0.2823,0.3722,RandomizedSearchCV,"{'subsample': 0.6346103310437, 'n_estimators':..."
4,binary_standard_undersample,GradientBoostingClassifier,un-optimized,0.7294,0.7593,0.8366,0.2706,0.7294,0.308,0.8003,0.4448,0.7182,0.2818,0.3716,RandomizedSearchCV,
6,binary_standard_undersample,LogisticRegression,un-optimized,0.7321,0.7502,0.8256,0.2679,0.7321,0.3066,0.7751,0.4394,0.7253,0.2747,0.3609,RandomizedSearchCV,
7,binary_standard_undersample,LogisticRegression,optimized,0.7319,0.75,0.8257,0.2681,0.7319,0.3065,0.7748,0.4392,0.7252,0.2748,0.3606,RandomizedSearchCV,"{'C': 0.027808522124762813, 'penalty': 'l2', '..."
9,binary_standard_undersample,RandomForestClassifier,optimized,0.7171,0.7596,0.8346,0.2829,0.7171,0.2998,0.8178,0.4388,0.7013,0.2987,0.368,RandomizedSearchCV,"{'n_estimators': 982, 'min_samples_split': 10,..."
0,binary_standard_undersample,AdaBoostClassifier,un-optimized,0.7298,0.7502,0.8286,0.2702,0.7298,0.3051,0.7782,0.4383,0.7222,0.2778,0.3601,RandomizedSearchCV,
8,binary_standard_undersample,RandomForestClassifier,un-optimized,0.7214,0.7551,0.8301,0.2786,0.7214,0.301,0.8014,0.4376,0.7089,0.2911,0.3635,RandomizedSearchCV,
3,binary_standard_undersample,DecisionTreeClassifier,optimized,0.7162,0.7327,0.7964,0.2838,0.7162,0.2899,0.7552,0.419,0.7101,0.2899,0.3333,RandomizedSearchCV,"{'min_samples_split': 5, 'min_samples_leaf': 2..."
2,binary_standard_undersample,DecisionTreeClassifier,un-optimized,0.6662,0.6606,0.6606,0.3338,0.6662,0.2357,0.6529,0.3464,0.6683,0.3317,0.227,RandomizedSearchCV,


In [29]:
# # Find all rows that exist in all three filtered results.
# df_columns = list(sorted_df2.columns)
# merged_df = pd.merge(sorted_df1, sorted_df2, on=df_columns, how='inner')
# merge_df2 = pd.merge(merged_df, sorted_df3, on=df_columns, how='inner')

# merge_df2