In [50]:
# seed value for random number generators to obtain reproducible results
RANDOM_SEED = 1

# although we standardize X and y variables on input,
# we will fit the intercept term in the models
# Expect fitted values to be close to zero
SET_FIT_INTERCEPT = True

# import base packages into the namespace for this program
import numpy as np
import pandas as pd

# modeling routines from Scikit Learn packages
import sklearn.linear_model 
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeClassifier #decision tree classifierfrom sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler #scale data
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO 
from IPython.display import Image 
from sklearn.metrics import mean_squared_error, r2_score, classification_report, confusion_matrix 
from math import sqrt  # for root mean-squared error calculation
import matplotlib
import matplotlib.pyplot as plt  # static plotting
import seaborn as sns  # pretty plotting, including heat map


from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
# specify the k-fold cross-validation design
from sklearn.model_selection import KFold

from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.preprocessing import binarize
from sklearn.metrics import precision_recall_curve

In [34]:
# define a function to return model metrics for evaluation

def model_metrics(y_known, y_pred):
    y_test = y_known
    y_pred_class = y_pred
    confusion = metrics.confusion_matrix(y_test, y_pred_class)
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    accuracy = metrics.accuracy_score(y_test, y_pred_class)
    class_error = 1 - metrics.accuracy_score(y_test, y_pred_class)
    sensitivity = metrics.recall_score(y_test, y_pred_class)
    specificity = TN / (TN + FP)
    false_positive_rate = FP / float(TN + FP)
    precision = TP / float(TP + FP)
    f1_score = metrics.f1_score(y_test, y_pred_class)
#    F1 = 2*precision*sensitivity/(precision + sensitivity)
    print('The model metrics are:',
         '\naccuracy:', accuracy,
         '\nclassification error:', class_error,
         '\nsensitivity:', sensitivity,
         '\nspecificity:', specificity,
         '\nfalse positive rate:', false_positive_rate,
         '\nprecision:', precision,
          '\nF1 score:', f1_score,
#          '\nF1 by hand:', F1,
         '\nconfusion matrix:')
    return(confusion)


In [35]:
# define a function that accepts a threshold and 
# prints sensitivity and specificity

def evaluate_threshold(threshold):
    print('Sensitivity:', tpr[thresholds > threshold][-1])
    print('Specificity:', 1 - fpr[thresholds > threshold][-1])

In [36]:
# define a function that accepts the fpr and tpr values 
# from the roc_curve function and plot the ROC curve

def plot_roc_curve(fpr, tpr, label=None):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve for response')

In [37]:
# define a function that accepts the recall and precision values 
# from the precision_recall_curve function and plot the PR curve

def plot_pr_curve(recall, precision, label=None):
    plt.plot(recall, precision, linewidth=2, label=label)
#    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('PR curve for response')

In [38]:
# initial work with the smaller data set
X_test = pd.read_csv('cardio_x_test.csv', sep = ',', encoding = 'unicode_escape')  
X_train = pd.read_csv('cardio_x_train.csv', sep = ',', encoding = 'unicode_escape')  
y_test = pd.read_csv('cardio_y_test.csv', sep = ',', encoding = 'unicode_escape')  
y_train = pd.read_csv('cardio_y_train.csv', sep = ',', encoding = 'unicode_escape')
# examine the shape of original input data
print('The shape of the X test set is: ', X_test.shape)
print('The shape of the X train set is: ',X_train.shape)
print('The shape of the Y test set is: ',y_test.shape)
print('The shape of the Y train set is: ',y_train.shape)

The shape of the X test set is:  (13750, 24)
The shape of the X train set is:  (54997, 24)
The shape of the Y test set is:  (13750, 2)
The shape of the Y train set is:  (54997, 2)


In [39]:
#drop unnecessary columns for model
X_train = X_train.drop(['Unnamed: 0', 'id', 'age_days', 'LATITUDE',
                 'LONGITUDE','ZIP','FACILITY_NAME','ADDRESS',
                 'CITY','STATE','DESCRIPTION','FACILITY_TYPE' ], axis=1)
X_test = X_test.drop(['Unnamed: 0', 'id', 'age_days', 'LATITUDE',
                 'LONGITUDE','ZIP','FACILITY_NAME','ADDRESS',
                 'CITY','STATE','DESCRIPTION','FACILITY_TYPE' ], axis=1)
y_train = y_train.drop(['Unnamed: 0'], axis=1)
y_test = y_test.drop(['Unnamed: 0'], axis=1)    


In [40]:
y_train = y_train.rename(columns={"x": "cardio"})
y_test = y_test.rename(columns={"x": "cardio"})

In [41]:
#total number of NaN values in each column
print('The total number of NaN values in the X test set is:\n', X_test.isnull().sum())
print('The total number of NaN values in the X train set is:\n',X_train.isnull().sum())
print('The total number of NaN values in the Y test set is:\n',y_test.isnull().sum())
print('The total number of NaN values in the Y train set is:\n',y_train.isnull().sum())

The total number of NaN values in the X test set is:
 age_years      0
gender         0
height_cm      0
weight_kg      0
BMI            0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alcohol        0
active         0
dtype: int64
The total number of NaN values in the X train set is:
 age_years      0
gender         0
height_cm      0
weight_kg      0
BMI            0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alcohol        0
active         0
dtype: int64
The total number of NaN values in the Y test set is:
 cardio    0
dtype: int64
The total number of NaN values in the Y train set is:
 cardio    0
dtype: int64


In [42]:
# mapping function to convert text female/male to integer 1/2
convert_to_binary = {'female' : 1, 'male' : 2}

# define binary variable for gender
X_train['gender'] = X_train['gender'].map(convert_to_binary)
X_test['gender'] = X_test['gender'].map(convert_to_binary)

In [43]:
X_train.head()

Unnamed: 0,age_years,gender,height_cm,weight_kg,BMI,ap_hi,ap_lo,cholesterol,gluc,smoke,alcohol,active
0,62,1,160,101.0,39.453125,120,70,1,1,0,0,1
1,41,2,172,78.0,26.365603,140,80,1,1,0,0,1
2,57,1,168,80.0,28.344671,140,90,1,1,0,0,1
3,52,1,161,75.0,28.934069,110,80,2,3,0,0,1
4,39,1,169,64.0,22.408179,100,70,1,1,0,0,1


In [44]:
y_train.head()

Unnamed: 0,cardio
0,1
1,0
2,1
3,0
4,0


In [45]:
#Set the decision tree classifier
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [48]:
#make prediction
y_pred = dt.predict(X_test)

In [51]:
# precision, recall, F1 score, and count of response variable
# of decision tree
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[4549 2418]
 [2574 4209]]
              precision    recall  f1-score   support

           0       0.64      0.65      0.65      6967
           1       0.64      0.62      0.63      6783

    accuracy                           0.64     13750
   macro avg       0.64      0.64      0.64     13750
weighted avg       0.64      0.64      0.64     13750



In [52]:
# evaluate gradient boosting classifer
model_metrics(y_test, y_pred)

The model metrics are: 
accuracy: 0.6369454545454546 
classification error: 0.3630545454545454 
sensitivity: 0.6205218929677134 
specificity: 0.6529352662552032 
false positive rate: 0.3470647337447969 
precision: 0.6351290176550475 
F1 score: 0.6277404921700224 
confusion matrix:


array([[4549, 2418],
       [2574, 4209]], dtype=int64)