---

## Data Analysis

- This file differs from [2_data_analysis_1_base_data.ipynb](2_data_analysis_1_base_data.ipynb) in that it:
    - scales the base cleaned data created in [1_data_cleaning.ipynb](1_data_cleaning.ipynb), and
    - balances the training set with random oversampling (RandomOverSampler).

Source dataset: 253680 rows × 22 columns
Processed and analyzed dataset: 327526 training rows (after oversampling) and 63420 test rows

---

In [50]:
# package imports go here
import pandas as pd
import numpy as np
import fastparquet as fp
import os
import sys
import pickle
import matplotlib.pyplot as plt
import importlib
import config

sys.path.insert(1, config.package_path)
import ml_analysis as mlanlys
import ml_clean_feature as mlclean

---

## 1. Read the cleaned dataset from file

---

In [51]:
# reload any changes to Config Settings
importlib.reload(config)

# BE SURE TO UPDATE THE LABEL FOR THIS ANALYSIS
# #############################
dataset_label = '5 RandomOverSampler Dataset'
# #############################

year                        = config.year

clean_file                  = config.clean_file
performance_report          = config.performance_report

report_path                 = config.report_path
# Slug form of the label (lower case, spaces -> underscores) used as the report file-name prefix
file_label                  = dataset_label.lower().replace(' ','_')
# os.path.join keeps the path valid whether or not config.report_path ends with a
# path separator; plain string concatenation would glue the two parts together.
detailed_performance_report = os.path.join(report_path, file_label + '_detailed_performance_report.txt')

# Echo the resolved settings so the run is self-documenting
print(f"Year:                        {year}")
print(f"Clean File:                  {clean_file}")
print(f"Performance Report:          {performance_report}")
print(f"Detailed Performance Report: {detailed_performance_report}")

Year:                        2015
Clean File:                  data/brfss_2015_clean.parquet.gzip
Performance Report:          reports/performance_report.pkl
Detailed Performance Report: reports/5_randomoversampler_dataset_detailed_performance_report.txt


In [52]:
# Read the final cleaned dataset from the parquet file named by config.clean_file,
# explicitly using the fastparquet engine
df = pd.read_parquet(clean_file, engine="fastparquet")

In [53]:
# Keep the column index of the cleaned dataframe (all columns, including the target).
# NOTE(review): despite the name, these are column labels rather than class labels — confirm intended use.
diabetes_labels = df.columns

In [54]:
# Sanity check: (rows, columns) of the cleaned dataset just loaded
df.shape

(253680, 22)

---

## 2. Prepare the dataset for analysis

- Split the dataset into features and labels.
- Split the dataset into training and testing sets.
- Scale the features, then oversample the training set to balance the classes.

---

In [55]:
from sklearn.datasets import make_regression, make_swiss_roll
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [56]:
# Reload mlanlys so that any edits made to the module are picked up
importlib.reload(mlanlys)

target = 'diabetes'
# Dictionary defining the modifications to be made to the base dataset
operation_dict = {  'target_column'     :  target,
                    'convert_to_binary' :  True,
                    'scaler'            : 'standard', # options: none, standard, minmax
                    'random_sample'     : 'oversample'      # options: none, undersample, oversample
                    }

# Work on a copy: this ensures that df is not modified during the call to modify_base_dataset()
df_modified = df.copy()

# Modify the base dataset
# data is returned such that: X_train, X_test, y_train, y_test = data
data = mlanlys.modify_base_dataset(df_modified, operation_dict)

Base Dataset Modifications in Process
-------------------------------------
**Operation:target_column  diabetes
**Operation:convert_to_binary  True
  -- Converting dataset to binary (0,1) from (0,1,2)


****Cleaning Feature: diabetes
  Initial Unique features in [diabetes]:  [0. 1. 2.]
  values_to_drop: ********* NO Parameters were specified *********
  translate: {1: 0, 2: 1}
  scale: ********* NO Parameters were specified *********
  FINAL Unique features in [diabetes]:  [0. 1.]
**Operation:scaler  standard
  -- Performing train_test_split on dataframe with target:'diabetes'
     -- Run automatically before scalar or random_sample operations
  -- Performing StandardScaler on X_train: Updates X_train, y_test
**Operation:random_sample  oversample
  -- Performing RandomOverSampler on X_train, y_train: Updates X_train, y_train

Dataframe, Train Test Summary
-----------------------------
Dataframe: (253680, 22)  Data:4, X_train:327526, y_train:327526, X_test:63420, y_test:63420
ValueCount

In [57]:
# Summarise the untouched frame and the frame handed to modify_base_dataset():
# shape plus the class distribution of the target column for each.
print(
    "Original Dataframe",
    "------------------",
    f"df.shape: {df.shape}",
    f"df[{target}].value_counts:  {df[target].value_counts()}",
    sep="\n",
)

# The leading newline leaves a blank line between the two summaries.
print(
    "\nModified Dataframe",
    "------------------",
    f"df_modified.shape: {df_modified.shape}",
    f"df_modified[{target}].value_counts:  {df_modified[target].value_counts()}",
    sep="\n",
)

Original Dataframe
------------------
df.shape: (253680, 22)
df[diabetes].value_counts:  diabetes
0.0    213703
2.0     35346
1.0      4631
Name: count, dtype: int64

Modified Dataframe
------------------
df_modified.shape: (253680, 22)
df_modified[diabetes].value_counts:  diabetes
0.0    218334
1.0     35346
Name: count, dtype: int64


In [58]:
# Unpack the four train/test pieces returned by modify_base_dataset()
X_train, X_test, y_train, y_test = data
# Report the size of each piece alongside the shape of the modified dataframe
print(f"Dataframe: {df_modified.shape}  Data:{len(data)}, X_train:{len(X_train)}, y_train:{len(y_train)}, X_test:{len(X_test)}, y_test:{len(y_test)}")
# Class distribution of the training labels (displayed as the cell result)
y_train.value_counts()

Dataframe: (253680, 22)  Data:4, X_train:327526, y_train:327526, X_test:63420, y_test:63420


diabetes
0.0    163763
1.0    163763
Name: count, dtype: int64

---

## 3. Prototype reporting


---

In [59]:
def model_performance_metrics(model, data, datalabel):
    """Print and return classification performance metrics for a fitted model.

    Args:
        model: Fitted classifier exposing ``score(X, y)`` and ``predict_proba(X)``.
        data: Sequence ``(X, y, y_pred)`` holding the features, the true labels
            and the predictions already made for ``X``. ``y`` must provide
            ``value_counts()`` (e.g. a pandas Series).
        datalabel: Name of the data slice; used in the printed headings and
            stored under ``'slice'`` in the result.

    Returns:
        dict: ``model`` (class name), ``slice``, ``score``, ``balanced_accuracy``,
        ``roc_auc_score``, ``Mean Squared Error``, ``Accuracy``, ``Precision``,
        ``Recall``, ``F1-score``, ``Specificity``, ``False Positive Rate`` and
        ``Matthews Correlation Coefficient``, each rounded to 4 decimals.
        Ratios whose denominator is zero are reported as 0.0. When the
        confusion matrix is not 2x2, the binary-only metrics (Precision through
        Matthews Correlation Coefficient) are ``None``.
    """
    # All metric functions are imported locally so the function does not depend
    # on an import made in some other notebook cell.
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import classification_report
    from sklearn.metrics import balanced_accuracy_score
    from sklearn.metrics import mean_squared_error
    from sklearn.metrics import roc_auc_score

    def safe_ratio(numerator, denominator):
        """Return numerator / denominator, or 0.0 when the denominator is zero."""
        return numerator / denominator if denominator else 0.0

    # Expand the model data
    X, y, y_pred = data

    # -------------------------------------- Model Performance
    print(f'------------------------------------------------------------------------')
    print(f'---------- {datalabel}ing Data Performance\n------------------------------------------------------------------------')
    # -----  Create a confusion matrix
    conf_mat = confusion_matrix(y, y_pred)
    print(f"Confusion Matrix\n{conf_mat}")

    # -----  Score
    score_model = round( model.score(X, y), 4)
    print(f'\n-----------------------\n{datalabel} score: {score_model}')

    # -----  Balanced Accuracy
    score_ba = round( balanced_accuracy_score(y, y_pred), 4)
    print(f"Balanced Accuracy Score: {score_ba}")

    # -----  ROC AUC Score
    # Multi-class targets need the full probability matrix (one-vs-rest);
    # binary targets use the probability of the positive class only.
    if len(y.value_counts())>2:
        score_roc_auc = round( roc_auc_score(y, model.predict_proba(X), multi_class='ovr'), 4)
    else:
        score_roc_auc = round( roc_auc_score(y, model.predict_proba(X)[:, 1]), 4)

    print(f"ROC AUC Score: {score_roc_auc}")

    # -----  Mean Squared Error
    mse = round( mean_squared_error(y, y_pred), 4)

    print(f"Mean Squared Error: {mse}")

    # -----  Create a classification report
    print(f"\n-----------------------\nClassification Report\n{classification_report(y, y_pred)}")

    # ----- metrics calculated from the Confusion Matrix
    print(f"------------------------------")
    print(f"--- Classification values")
    print(f"------------------------------")

    if conf_mat.shape == (2, 2):
        # Unravel the confusion matrix. The counts are converted to Python ints
        # because the MCC denominator multiplies four sums together and
        # silently overflows fixed-width numpy integers on large datasets.
        tn, fp, fn, tp = (int(count) for count in conf_mat.ravel())

        accuracy = round( safe_ratio(tp + tn, tp + tn + fp + fn), 4)
        precision = round( safe_ratio(tp, tp + fp), 4)
        recall = round( safe_ratio(tp, tp + fn), 4)
        # Computed from the raw counts (equivalent to 2PR / (P + R)) so the
        # rounding already applied to precision and recall is not compounded.
        f1 = round( safe_ratio(2 * tp, 2 * tp + fp + fn), 4)
        specificity = round( safe_ratio(tn, tn + fp), 4)
        fpr = round( safe_ratio(fp, fp + tn), 4)
        mcc_denominator = ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5
        mcc = round( safe_ratio(tp * tn - fp * fn, mcc_denominator), 4)
    else:
        # tn/fp/fn/tp only exist for a 2x2 matrix; accuracy is still well
        # defined as the share of the diagonal.
        accuracy = round( safe_ratio(int(conf_mat.trace()), int(conf_mat.sum())), 4)
        precision = recall = f1 = specificity = fpr = mcc = None
        print(f"Binary-only metrics are not defined for a confusion matrix of shape {conf_mat.shape}")

    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)
    print("Specificity:", specificity)
    print("False Positive Rate:", fpr)
    print("Matthews Correlation Coefficient:", mcc)

    return {'model': type(model).__name__,
            'slice': datalabel,
            'score':score_model,
            'balanced_accuracy': score_ba,
            'roc_auc_score':score_roc_auc,
            "Mean Squared Error": mse,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1-score': f1,
            'Specificity': specificity,
            'False Positive Rate': fpr,
            'Matthews Correlation Coefficient': mcc
            }

In [60]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error

In [61]:
# Reload mlanlys so that edits made to the module since it was imported take effect
importlib.reload(mlanlys)

<module 'ml_analysis' from '/mnt/c/ML/DU/repos/projects/project-2/DU-project-2-2015/prototype_jeff/brfss_2015/../../pkgs/ml_analysis.py'>

In [62]:
# NOTE: X_train, X_test, y_train, y_test = data already performed at the end of step 2

In [63]:
# Fix the seed so the fitted tree, and every metric derived from it, is
# reproducible across kernel restarts.
RANDOM_STATE = 42

model = DecisionTreeClassifier(random_state=RANDOM_STATE)
# Train the model
model = model.fit(X_train, y_train)

# Predictions on both slices
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Bundle as [X, y, y_pred] for the performance-metrics call
test_data = [ X_test, y_test, y_test_pred]



In [65]:
# Reload mlanlys so that any edits made to the module are picked up
importlib.reload(mlanlys)

# NOTE: this calls mlanlys.model_performance_metrics(), not the prototype of the
# same name defined earlier in this notebook.
# NOTE(review): the prototype's heading is f'{datalabel}ing Data Performance', so a
# short label such as "Train"/"Test" reads more naturally — confirm the module version matches.
mymetrics = mlanlys.model_performance_metrics(model, test_data, "Original Decision Tree Classifier" )

# Show the returned metrics dictionary
print(mymetrics)

------------------------------------------------------------------------
---------- Original Decision Tree Classifiering Data Performance
------------------------------------------------------------------------
Confusion Matrix
[[48017  6554]
 [ 6273  2576]]

-----------------------
Original Decision Tree Classifier score: 0.7977
Balanced Accuracy Score: 0.5855
ROC AUC Score: 0.5861
Mean Squared Error: 0.2023
------------------------------
--- Classification values
------------------------------
Accuracy: 0.7977
Precision: 0.2821
Recall: 0.2911
F1-score: 0.2865
Specificity: 0.8799
False Positive Rate: 0.1201
Matthews Correlation Coefficient: 0.1688

-----------------------
Classification Report
              precision    recall  f1-score   support

         0.0       0.88      0.88      0.88     54571
         1.0       0.28      0.29      0.29      8849

    accuracy                           0.80     63420
   macro avg       0.58      0.59      0.58     63420
weighted avg       0.80 