In [1]:
# package imports go here
import pandas as pd
import numpy as np
import fastparquet as fp
import os
import sys
import pickle
import matplotlib.pyplot as plt
import importlib
import config
import sklearn


sys.path.insert(1, config.package_path)
import ml_analysis as mlanlys
import ml_clean_feature as mlclean

In [2]:
# Configuration cell: re-import `config` so edits made to it are picked up
# without restarting the kernel, then copy the settings used by this notebook
# into local names.
# reload any changes to Config Settings
importlib.reload(config)

# BE SURE TO UPDATE THE LABEL FOR THIS ANALYSIS
# #############################
# NOTE(review): the label says "StandardScaler", but the load cell reads
# `prepared_data_binary` -- confirm which dataset this run is meant to use.
dataset_label = '2.0 StandardScaler Dataset'
# #############################

year                        = config.year
# Parquet file holding the cleaned dataset (read with pd.read_parquet further down).
clean_file                  = config.clean_file

# One config entry per prepared-dataset variant. `prepared_data_binary` is opened
# as a pickle file below; the others are presumably paths of the same kind -- TODO confirm.
prepared_data_standard              = config.prepared_data_standard
prepared_data_minmax                = config.prepared_data_minmax
prepared_data_binary                = config.prepared_data_binary
prepared_data_sb_random_undersample = config.prepared_data_sb_random_undersample
prepared_data_sb_random_oversample  = config.prepared_data_sb_random_oversample
prepared_data_sb_cluster            = config.prepared_data_sb_cluster
prepared_data_sb_smote              = config.prepared_data_sb_smote
prepared_data_sb_smoteenn           = config.prepared_data_sb_smoteenn

# Echo the configured year as a quick check that the expected config was loaded.
print(f"Year:                        {year}")


Year:                        2015


In [3]:
# Load the pickled dataset selected for this run.
prepared_file = prepared_data_binary

with open(prepared_file, mode='rb') as file:
    data_prepared = pickle.load(file)


In [4]:
# Unpack the prepared split and report its size and class balance.
X_train, X_test, y_train, y_test = data_prepared

# Features and labels must line up row-for-row before any model is fitted.
assert len(X_train) == len(y_train) and len(X_test) == len(y_test), \
    "features and labels have different lengths"

# The data was just read from the pickle, so the label says "read" (it used to say "write").
print(f"After read:  lengths:  X_train: {len(X_train)}  X_test: {len(X_test)}  y_train: {len(y_train)} y_test: {len(y_test)}")
print(f"\ny_train.value_counts {y_train.value_counts()}")
print(f"\ny_test.value_counts {y_test.value_counts()}")

After write:  lengths:  X_train: 190260  X_test: 63420  y_train: 190260 y_test: 63420

y_train.value_counts diabetes
0.0    163788
1.0     26472
Name: count, dtype: int64

y_test.value_counts diabetes
0.0    54546
1.0     8874
Name: count, dtype: int64


In [5]:
# Preview the training features; pandas elides the middle rows/columns of a large frame.
print(X_train)

        HighBP  HighChol  CholCheck   BMI  Smoker  Stroke  \
74419      1.0       1.0        1.0  27.0     0.0     0.0   
253053     1.0       1.0        1.0  25.0     0.0     0.0   
57832      0.0       0.0        1.0  54.0     0.0     0.0   
159068     0.0       0.0        1.0  25.0     0.0     0.0   
115612     0.0       0.0        1.0  26.0     0.0     0.0   
...        ...       ...        ...   ...     ...     ...   
132687     0.0       1.0        1.0  27.0     0.0     0.0   
102716     1.0       0.0        1.0  36.0     1.0     0.0   
178749     1.0       0.0        1.0  25.0     1.0     0.0   
209799     0.0       1.0        1.0  32.0     0.0     0.0   
213270     1.0       0.0        1.0  38.0     0.0     0.0   

        HeartDiseaseorAttack  PhysActivity  Fruits  Veggies  ...  \
74419                    0.0           1.0     1.0      1.0  ...   
253053                   0.0           0.0     0.0      1.0  ...   
57832                    0.0           1.0     0.0      0.0  ..

In [6]:
# Preview the test features; pandas elides the middle rows/columns of a large frame.
print(X_test)

        HighBP  HighChol  CholCheck   BMI  Smoker  Stroke  \
37649      0.0       1.0        1.0  27.0     0.0     0.0   
85527      0.0       0.0        1.0  27.0     1.0     0.0   
189766     1.0       0.0        1.0  34.0     0.0     0.0   
77560      1.0       0.0        1.0  39.0     0.0     0.0   
77669      1.0       1.0        1.0  29.0     0.0     0.0   
...        ...       ...        ...   ...     ...     ...   
93647      0.0       0.0        1.0  33.0     1.0     0.0   
195977     1.0       0.0        1.0  28.0     0.0     0.0   
140958     0.0       0.0        1.0  23.0     1.0     0.0   
19157      1.0       1.0        1.0  31.0     0.0     0.0   
154301     1.0       0.0        1.0  26.0     1.0     0.0   

        HeartDiseaseorAttack  PhysActivity  Fruits  Veggies  ...  \
37649                    0.0           1.0     0.0      1.0  ...   
85527                    0.0           1.0     0.0      1.0  ...   
189766                   0.0           1.0     1.0      1.0  ..

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error

In [8]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator`` as a float, or NaN when the denominator is zero."""
    if denominator == 0:
        return float("nan")
    return numerator / denominator


def model_performance_metrics(model, data, datalabel):
    """Print and return classification metrics for a fitted model on one data slice.

    Parameters
    ----------
    model : fitted classifier
        Must provide ``score(X, y)`` and ``predict_proba(X)``.
    data : sequence of (X, y, y_pred)
        Features, true labels and predicted labels for the slice. ``y`` must
        provide ``value_counts()`` (e.g. a pandas Series).
    datalabel : str
        Name of the slice. The printed header appends "ing" to it (so "Train"
        and "Test" read as "Training"/"Testing"); it is returned under 'slice'.

    Returns
    -------
    dict
        Keys: 'model', 'slice', 'score', 'balanced_accuracy', 'roc_auc_score',
        'Accuracy', 'Precision', 'Recall', 'F1-score', 'Specificity',
        'False Positive Rate', 'Matthews Correlation Coefficient'.

    Notes
    -----
    * The confusion-matrix-derived metrics treat the second class of the 2x2
      matrix as the positive class.
    * A ratio with a zero denominator is reported as NaN.
    * When the confusion matrix is not 2x2, only 'Accuracy' is computed from it
      and the binary-only metrics are NaN (previously this raised ValueError
      while unpacking the matrix).
    """
    from sklearn.metrics import (
        balanced_accuracy_score,
        classification_report,
        confusion_matrix,
        roc_auc_score,
    )

    # Expand the model data
    X, y, y_pred = data

    # -------------------------------------- Model Performance
    print('------------------------------------------------------------------------')
    print(f'---------- {datalabel}ing Data Performance\n------------------------------------------------------------------------')
    # -----  Create a confusion matrix
    conf_mat = confusion_matrix(y, y_pred)
    print(f"Confusion Matrix\n{conf_mat}")

    # -----  Score
    score_model = model.score(X, y)
    print(f'\n-----------------------\n{datalabel} score: {score_model}')

    # -----  Balanced Accuracy
    score_ba = balanced_accuracy_score(y, y_pred)
    print(f"Balanced Accuracy Score: {score_ba}")

    # -----  ROC AUC Score
    if len(y.value_counts()) > 2:
        # Multi-class: one-vs-rest over the full probability matrix.
        score_roc_auc = roc_auc_score(y, model.predict_proba(X), multi_class='ovr')
    else:
        # Binary: score with the probability of the second class only.
        score_roc_auc = roc_auc_score(y, model.predict_proba(X)[:, 1])

    print(f"ROC AUC Score: {score_roc_auc}")

    # -----  Create a classification report
    print(f"\n-----------------------\nClassification Report\n{classification_report(y, y_pred)}")

    # ----- metrics calculated from the Confusion Matrix
    if conf_mat.shape == (2, 2):
        # Convert to plain Python ints: the MCC denominator multiplies four
        # marginal counts, which overflows a fixed-width int64 on splits of a
        # few hundred thousand rows and silently yields a wrong coefficient.
        # The names also avoid shadowing the `fp` (fastparquet) module alias.
        n_tn, n_fp, n_fn, n_tp = (int(cell) for cell in conf_mat.ravel())

        accuracy = _safe_ratio(n_tp + n_tn, n_tp + n_tn + n_fp + n_fn)
        precision = _safe_ratio(n_tp, n_tp + n_fp)
        recall = _safe_ratio(n_tp, n_tp + n_fn)
        f1 = _safe_ratio(2 * (precision * recall), precision + recall)
        specificity = _safe_ratio(n_tn, n_tn + n_fp)
        fpr = _safe_ratio(n_fp, n_fp + n_tn)
        mcc_denominator = ((n_tp + n_fp) * (n_tp + n_fn) * (n_tn + n_fp) * (n_tn + n_fn)) ** 0.5
        mcc = _safe_ratio(n_tp * n_tn - n_fp * n_fn, mcc_denominator)
    else:
        # tn/fp/fn/tp are only defined for a two-class matrix.
        print("Confusion matrix is not 2x2; binary-only metrics are reported as NaN.")
        accuracy = _safe_ratio(int(conf_mat.trace()), int(conf_mat.sum()))
        precision = recall = f1 = specificity = fpr = mcc = float("nan")

    print("------------------------------")
    print("--- Classification values")
    print("------------------------------")
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)
    print("Specificity:", specificity)
    print("False Positive Rate:", fpr)
    print("Matthews Correlation Coefficient:", mcc)

    return {'model': type(model).__name__,
            'slice': datalabel,
            'score': score_model,
            'balanced_accuracy': score_ba,
            'roc_auc_score': score_roc_auc,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1-score': f1,
            'Specificity': specificity,
            'False Positive Rate': fpr,
            'Matthews Correlation Coefficient': mcc
            }

In [9]:
# Interactive check of the feature container's type.
type(X_train)

pandas.core.frame.DataFrame

In [10]:
# Interactive check of the label container's type.
type(y_train)

pandas.core.series.Series

In [11]:
# Read final cleaned dataset from parquet file
df = pd.read_parquet(clean_file, engine="fastparquet")
target = 'diabetes'
# drop() already returns a new frame, so no explicit copy of `df` is needed.
X = df.drop(columns=[target])
y = df[target]

from sklearn.model_selection import train_test_split

# Fixed seed so the split, and every metric computed from it, is the same on each re-run.
# NOTE(review): this replaces the X_train/X_test/y_train/y_test unpacked from the
# pickle earlier, and a later cell re-unpacks `data_prepared` again -- confirm
# whether this raw split is still needed.
data = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = data

In [15]:
# Work on independent deep copies of the prepared split. list.copy() is shallow:
# it duplicates the list but its elements are still the very DataFrame/Series
# objects that came out of the pickle. Copying each element gives the model code
# its own freshly allocated data and leaves `data_prepared` untouched.
# NOTE(review): if the unpickled arrays are read-only this may also avoid the
# "cannot set WRITEABLE flag" error seen when fitting -- unverified.
data = [split.copy() for split in data_prepared]
X_train, X_test, y_train, y_test = data

In [16]:
# Fixed seed: the tree permutes candidate features at each split, so an unseeded
# classifier can produce a different tree (and different metrics) on every run.
model = DecisionTreeClassifier(random_state=42)
# Train the model
model = model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

test_data = [X_test, y_test, y_test_pred]

# NOTE(review): model_performance_metrics prints "{datalabel}ing Data Performance",
# so this label renders as "...Classifiering Data Performance"; it is kept unchanged
# because the same string is returned under the 'slice' key.
mymetrics = model_performance_metrics(model, test_data, "Original Decision Tree Classifier")

# Evaluate the classifier
# For 0/1 labels the mean squared error equals the misclassification rate (1 - accuracy).
mse_orig = mean_squared_error(y_test, y_test_pred)

print(f"Mean Squared Error of Decision Tree Classifier: {mse_orig}")
print(mymetrics)

ValueError: cannot set WRITEABLE flag to True of this array