---

# Process the Performance Report

---

In [83]:
# package imports go here
import pandas as pd
import numpy as np
import fastparquet as fp
import os
import sys
import pickle
import matplotlib.pyplot as plt
import importlib
import config
import time

sys.path.insert(1, config.package_path)
import ml_analysis as mlanlys

In [84]:
# Wall-clock reference point; the "Completed" cell further down subtracts this
# from the current time to report how long the notebook took to run.
start_time = time.time()

In [85]:
# Directory that holds the report artifacts (the trailing slash is part of the value)
report_path = 'reports/'
# Pickled report read by this notebook, and the plain-text rendering it writes out
performance_report = f"{report_path}performance_report.pkl"
performance_report_text_file = f"{report_path}performance_report.txt"

---

## 1. Read performance report

---

In [86]:
# Load the pickled performance report from disk into perf_report.
# NOTE: pickle.load can execute arbitrary code while unpickling; only open
# report files that come from a trusted source.
if os.path.exists(performance_report):
    # Load Performance Report
    with open(performance_report, 'rb') as file:
        perf_report = pickle.load(file)
    print(f"The file {performance_report} exists and the Performance Report was read successfully")
else:
    print("******************************************************")
    print(f"The file: '{performance_report}' DOES NOT EXIST")
    print("******************************************************")
    # Stop here: every later cell reads perf_report, so carrying on would
    # either fail with an unrelated-looking NameError or silently reuse a
    # stale perf_report left in the kernel by an earlier run.
    raise FileNotFoundError(f"The file: '{performance_report}' DOES NOT EXIST")

The file reports/performance_report.pkl exists and the Performance Report was read successfully


---

## 2. Write the Performance Report

---

In [87]:

# Dataset names found in the loaded report. This is a live dict_keys view of
# perf_report, not an independent copy.
perf_datasets_in = perf_report.keys()

# Display the dataset names
perf_datasets_in

dict_keys(['2.0 StandardScaler Dataset', '2.1 MinMaxScaler Dataset', '3 Binary Dataset', '4 RandomUnderSampler Dataset', '5 RandomOverSampler Dataset', '7 SMOTE Dataset', '8 SMOTEENN Dataset', '1 Base Dataset', '6 ClusterCentroids Dataset'])

#### 2.1 Modify the order of the reported datasets

---

In [88]:
# Use this cell to re-order the datasets for the report.
# A dict_keys view cannot be indexed, sorted in place or rearranged, so take an
# independent list copy that can be edited freely, for example:
#     perf_datasets.sort()
# or by writing out the names in the desired order.
perf_datasets = list(perf_datasets_in)

#### 2.2 Print the report

---

In [89]:
def print_performance_report(perf_report):
    """Print the summary report for every dataset in ``perf_report``.

    The output has three parts, all written with ``print``:

    1. a title banner,
    2. a "Report Summary" listing one ``Performance for: <name>`` line per key,
    3. one section per key showing the dataset size and the report table
       rendered with ``to_string(index=False)``.

    Args:
        perf_report: mapping of dataset name to a dict with the keys
            ``'dataset_size'`` (indexable; element 0 is printed as rows and
            element 1 as columns) and ``'report'`` (an object providing
            ``to_string(index=False)``).

    Returns:
        None.
    """
    banner_rule = "**********************************************************"
    section_rule = "----------------------------------------"

    # Title banner
    print(banner_rule)
    print("Diabetes Predictions Performance Summary Report")
    print(banner_rule)

    # Summary: one line per dataset, framed by rules
    print(section_rule)
    print("Report Summary")
    print(section_rule)

    for dataset_name in perf_report:
        print(f"Performance for: {dataset_name}")

    print(section_rule)

    # Detail: size and metrics table for each dataset, in mapping order
    for dataset_name, entry in perf_report.items():
        size = entry['dataset_size']
        metrics_table = entry['report']
        print(section_rule)
        print(f"Performance for: {dataset_name}")
        print(f"Dataset Size:    {size[0]} Rows, {size[1]} Columns")
        print(section_rule)
        print(metrics_table.to_string(index=False))

In [90]:
# Generate the Performance Report and send prints to osc.stdout.
# The report is rebuilt in the order given by perf_datasets so that changes made
# in the re-ordering cell (section 2.1) actually change the printed order. When
# perf_datasets is left untouched the order, and so the output, is the same as
# iterating perf_report directly.
ordered_report = {dataset: perf_report[dataset] for dataset in perf_datasets}
with mlanlys.OutStreamCapture() as osc:
    print_performance_report(ordered_report)

In [91]:
# osc.stdout contains the details of the performance report
# write the performance report to the performance_report_text_file
# The encoding is given explicitly so the file content does not depend on the
# platform's default encoding (dataset names and table cells are arbitrary text).
with open(performance_report_text_file, "w", encoding="utf-8") as file:
    file.write(osc.stdout)

In [92]:
# Display the performance report here:
# osc.stdout is the text captured while print_performance_report ran, i.e. the
# same text that was written to performance_report_text_file.
print(osc.stdout)

**********************************************************
Diabetes Predictions Performance Summary Report
**********************************************************
----------------------------------------
Report Summary
----------------------------------------
Performance for: 2.0 StandardScaler Dataset
Performance for: 2.1 MinMaxScaler Dataset
Performance for: 3 Binary Dataset
Performance for: 4 RandomUnderSampler Dataset
Performance for: 5 RandomOverSampler Dataset
Performance for: 7 SMOTE Dataset
Performance for: 8 SMOTEENN Dataset
Performance for: 1 Base Dataset
Performance for: 6 ClusterCentroids Dataset
----------------------------------------
----------------------------------------
Performance for: 2.0 StandardScaler Dataset
Dataset Size:    253680 Rows, 22 Columns
----------------------------------------
                new_column                      model slice  score  balanced_accuracy  roc_auc_score  Mean Squared Error  Accuracy  Precision  Recall  F1-score  Specificity 

In [93]:
# Report the wall-clock seconds elapsed since start_time was recorded,
# rounded to two decimals. One f-string does all the formatting.
elapsed_seconds = round(time.time() - start_time, 2)
print(f"Completed: Execution Time {elapsed_seconds} seconds:")

Completed: Execution Time 0.23 seconds:


In [94]:
# Concatenate all the Reports into one dataframe.
# Gather every per-dataset report table first and concatenate once: calling
# pd.concat inside a loop re-copies all previously accumulated rows on each pass.
report_frames = [perf_report[name]['report'] for name in perf_report]

# pd.concat raises ValueError on an empty list, so fall back to an empty
# DataFrame when the report holds no datasets.
df = pd.concat(report_frames, ignore_index=True) if report_frames else pd.DataFrame()

In [95]:
# Give the 'new_column' column its meaningful name, 'dataset', by rebinding df
# to the renamed frame, then display the result.
column_renames = {'new_column': 'dataset'}
df = df.rename(columns=column_renames)
df

Unnamed: 0,dataset,model,slice,score,balanced_accuracy,roc_auc_score,Mean Squared Error,Accuracy,Precision,Recall,F1-score,Specificity,False Positive Rate,Matthews Correlation Coefficient
0,2.0 StandardScaler Dataset,KNeighborsClassifier,Train,0.8866,0.5038,0.9481,0.4014,0.9027,0.7260,0.4801,0.5780,0.9708,0.0292,
1,2.0 StandardScaler Dataset,KNeighborsClassifier,Test,0.8194,0.3917,0.6253,0.6637,0.8355,0.3696,0.2378,0.2894,0.9335,0.0665,0.2076
2,2.0 StandardScaler Dataset,DecisionTreeClassifier,Train,0.9934,0.9684,0.9999,0.0230,0.9944,0.9993,0.9606,0.9796,0.9999,0.0001,
3,2.0 StandardScaler Dataset,DecisionTreeClassifier,Test,0.7651,0.4030,0.5675,0.8117,0.7977,0.2968,0.3190,0.3075,0.8761,0.1239,0.1893
4,2.0 StandardScaler Dataset,RandomForestClassifier,Train,0.9934,0.9689,0.9995,0.0231,0.9944,0.9944,0.9651,0.9795,0.9991,0.0009,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121,6 ClusterCentroids Dataset,GradientBoostingClassifier,Test,0.3397,0.5752,0.6168,0.6603,0.3397,0.1628,0.9016,0.2758,0.2487,0.7513,0.1242
122,6 ClusterCentroids Dataset,AdaBoostClassifier,Train,0.8580,0.8580,0.9364,0.1420,0.8580,0.8335,0.8948,0.8631,0.8213,0.1787,0.7180
123,6 ClusterCentroids Dataset,AdaBoostClassifier,Test,0.4017,0.6062,0.6695,0.5983,0.4017,0.1755,0.8898,0.2932,0.3227,0.6773,0.1617
124,6 ClusterCentroids Dataset,LogisticRegression,Train,0.7159,0.7159,0.7872,0.2841,0.7159,0.6986,0.7595,0.7278,0.6723,0.3277,0.4335


In [96]:
# List the column labels of the combined frame (shows the renamed 'dataset' column)
df.columns

Index(['dataset', 'model', 'slice', 'score', 'balanced_accuracy',
       'roc_auc_score', 'Mean Squared Error', 'Accuracy', 'Precision',
       'Recall', 'F1-score', 'Specificity', 'False Positive Rate',
       'Matthews Correlation Coefficient'],
      dtype='object')

In [97]:
# Keep only the rows whose 'slice' value is 'Test'
is_test_row = df['slice'] == 'Test'
test_df = df.loc[is_test_row]

In [98]:
# Metric to rank by, and the floor each of the other metrics must exceed
# Score requirements: 75% classification accuracy or 0.80 R-squared
maximize = 'Precision'
limits = {
    'balanced_accuracy': .5,
    'Accuracy': .8,
    'roc_auc_score': .6
}

# Work on a copy so test_df stays intact and can be reused later
limit_df = test_df.copy()

# Narrow the rows one metric at a time: a row survives only if its value is
# strictly greater than the threshold for every metric in limits
for metric, threshold in limits.items():
    print(metric)
    above_threshold = limit_df[metric] > threshold
    limit_df = limit_df.loc[above_threshold]

# Rank the surviving rows from highest to lowest on the metric to maximize
sorted_df = limit_df.sort_values(by=maximize, ascending=False)

sorted_df.head(10)

balanced_accuracy
Accuracy
roc_auc_score


Unnamed: 0,dataset,model,slice,score,balanced_accuracy,roc_auc_score,Mean Squared Error,Accuracy,Precision,Recall,F1-score,Specificity,False Positive Rate,Matthews Correlation Coefficient
37,3 Binary Dataset,GradientBoostingClassifier,Test,0.867,0.5707,0.8301,0.133,0.867,0.5616,0.1617,0.2511,0.9798,0.0202,0.2498
93,8 SMOTEENN Dataset,GradientBoostingClassifier,Test,0.8668,0.5705,0.8294,0.1332,0.8668,0.5562,0.1615,0.2503,0.9794,0.0206,0.2478
39,3 Binary Dataset,AdaBoostClassifier,Test,0.8663,0.5837,0.8269,0.1337,0.8663,0.5428,0.1935,0.2853,0.9739,0.0261,0.267
41,3 Binary Dataset,LogisticRegression,Test,0.8649,0.5684,0.8232,0.1351,0.8649,0.5344,0.1589,0.245,0.9779,0.0221,0.2378
95,8 SMOTEENN Dataset,AdaBoostClassifier,Test,0.8655,0.5822,0.8259,0.1345,0.8655,0.532,0.1913,0.2814,0.9731,0.0269,0.2612
97,8 SMOTEENN Dataset,LogisticRegression,Test,0.8641,0.5665,0.8217,0.1359,0.8641,0.5219,0.1559,0.2401,0.9772,0.0228,0.2309
89,8 SMOTEENN Dataset,RandomForestClassifier,Test,0.8602,0.572,0.7991,0.1398,0.8602,0.4794,0.1742,0.2555,0.9698,0.0302,0.2276
33,3 Binary Dataset,RandomForestClassifier,Test,0.86,0.5712,0.799,0.14,0.86,0.4786,0.1725,0.2536,0.9699,0.0301,0.226
75,7 SMOTE Dataset,RandomForestClassifier,Test,0.8518,0.6126,0.798,0.1482,0.8518,0.4466,0.2815,0.3453,0.9437,0.0563,0.2756
79,7 SMOTE Dataset,GradientBoostingClassifier,Test,0.8394,0.6811,0.8213,0.1606,0.8394,0.4277,0.4619,0.4441,0.9003,0.0997,0.3508


In [99]:
# Metric to rank by, and the floor each of the other metrics must exceed
# Score requirements: 75% classification accuracy or 0.80 R-squared
maximize = 'Accuracy'
limits = {
    'balanced_accuracy': .5,
    'Precision': .5,
    'roc_auc_score': .6
}

# Work on a copy so test_df stays intact and can be reused later
limit_df = test_df.copy()

# Narrow the rows one metric at a time: a row survives only if its value is
# strictly greater than the threshold for every metric in limits
for metric, threshold in limits.items():
    print(metric)
    above_threshold = limit_df[metric] > threshold
    limit_df = limit_df.loc[above_threshold]

# Rank the surviving rows from highest to lowest on the metric to maximize
sorted_df = limit_df.sort_values(by=maximize, ascending=False)

sorted_df.head(10)

balanced_accuracy
Precision
roc_auc_score


Unnamed: 0,dataset,model,slice,score,balanced_accuracy,roc_auc_score,Mean Squared Error,Accuracy,Precision,Recall,F1-score,Specificity,False Positive Rate,Matthews Correlation Coefficient
37,3 Binary Dataset,GradientBoostingClassifier,Test,0.867,0.5707,0.8301,0.133,0.867,0.5616,0.1617,0.2511,0.9798,0.0202,0.2498
93,8 SMOTEENN Dataset,GradientBoostingClassifier,Test,0.8668,0.5705,0.8294,0.1332,0.8668,0.5562,0.1615,0.2503,0.9794,0.0206,0.2478
39,3 Binary Dataset,AdaBoostClassifier,Test,0.8663,0.5837,0.8269,0.1337,0.8663,0.5428,0.1935,0.2853,0.9739,0.0261,0.267
95,8 SMOTEENN Dataset,AdaBoostClassifier,Test,0.8655,0.5822,0.8259,0.1345,0.8655,0.532,0.1913,0.2814,0.9731,0.0269,0.2612
41,3 Binary Dataset,LogisticRegression,Test,0.8649,0.5684,0.8232,0.1351,0.8649,0.5344,0.1589,0.245,0.9779,0.0221,0.2378
97,8 SMOTEENN Dataset,LogisticRegression,Test,0.8641,0.5665,0.8217,0.1359,0.8641,0.5219,0.1559,0.2401,0.9772,0.0228,0.2309


## Conclusions

Based on the rankings above, the best models were:
- GradientBoostingClassifier
- AdaBoostClassifier
- LogisticRegression

The best datasets were:
- Binary dataset
- Binary, Standard Scaler & SMOTEENN sampling.

However, the binary dataset performed best.
