<a href="https://colab.research.google.com/github/jasleen1589/Final-Project_Group5/blob/Senelli/CHD_Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [84]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.stats.proportion import proportions_ztest
from scipy.stats import ttest_1samp, shapiro
from sklearn.feature_selection import chi2

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.metrics import recall_score, make_scorer, roc_auc_score, confusion_matrix, classification_report, ConfusionMatrixDisplay, accuracy_score

from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

from graphviz import Source
from sklearn.tree import export_graphviz
from sklearn import tree
from IPython.display import SVG, display

!pip install shap
import shap

import warnings
warnings.filterwarnings('ignore')



In [85]:
path = 'cleaned_heart_data.csv'
df = pd.read_csv(path)

In [86]:
# Dataset First Look
df.head()

Unnamed: 0,sex,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [87]:
# Dataset Rows & Columns count
print(f'Number of rows in the dataset: {df.shape[0]}')
print(f'Number of columns  in the dataset: {df.shape[1]}')

Number of rows in the dataset: 3658
Number of columns  in the dataset: 16


In [88]:
# Dataset Info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3658 entries, 0 to 3657
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   sex              3658 non-null   int64  
 1   age              3658 non-null   int64  
 2   education        3658 non-null   float64
 3   currentSmoker    3658 non-null   int64  
 4   cigsPerDay       3658 non-null   float64
 5   BPMeds           3658 non-null   float64
 6   prevalentStroke  3658 non-null   int64  
 7   prevalentHyp     3658 non-null   int64  
 8   diabetes         3658 non-null   int64  
 9   totChol          3658 non-null   float64
 10  sysBP            3658 non-null   float64
 11  diaBP            3658 non-null   float64
 12  BMI              3658 non-null   float64
 13  heartRate        3658 non-null   float64
 14  glucose          3658 non-null   float64
 15  TenYearCHD       3658 non-null   int64  
dtypes: float64(9), int64(7)
memory usage: 457.4 KB


In [89]:
# Dataset Duplicate Value Count
print(f'Number of duplicated rows in the dataset: {df.duplicated().sum()}')

Number of duplicated rows in the dataset: 0


In [90]:
# Missing Values/Null Values Count
print(f'There are {df.isna().sum().sum()} missing values in the dataset\n')
df.isna().sum()

There are 0 missing values in the dataset



sex                0
age                0
education          0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64

In [91]:
# Dataset Columns
df.columns

Index(['sex', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds',
       'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
       'diaBP', 'BMI', 'heartRate', 'glucose', 'TenYearCHD'],
      dtype='object')

In [92]:
# Dataset Describe
df.describe()

Unnamed: 0,sex,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
count,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0
mean,0.443685,49.551941,1.980317,0.489065,9.025424,0.030344,0.005741,0.311646,0.027064,236.847731,132.370558,82.917031,25.782802,75.730727,81.852925,0.152269
std,0.496886,8.562029,1.022656,0.499949,11.92159,0.171557,0.075561,0.463229,0.162292,44.097681,22.086866,11.974258,4.065601,11.981525,23.904164,0.359331
min,0.0,32.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,113.0,83.5,48.0,15.54,44.0,40.0,0.0
25%,0.0,42.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,206.0,117.0,75.0,23.08,68.0,71.0,0.0
50%,0.0,49.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,234.0,128.0,82.0,25.38,75.0,78.0,0.0
75%,1.0,56.0,3.0,1.0,20.0,0.0,0.0,1.0,0.0,263.0,143.875,90.0,28.0375,82.0,87.0,0.0
max,1.0,70.0,4.0,1.0,70.0,1.0,1.0,1.0,1.0,600.0,295.0,142.5,56.8,143.0,394.0,1.0


In [93]:
# Splitting the categorical and continuous variables
categ_vars = ['sex', 'education', 'currentSmoker', 'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes']
cont_vars = ['age', 'cigsPerDay', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']

In [94]:
# Check Unique Values for categorical variables
for var in categ_vars:
  print(f'Unique values in {var} are: {df[var].dropna().unique()})')

Unique values in sex are: [1 0])
Unique values in education are: [4. 2. 1. 3.])
Unique values in currentSmoker are: [0 1])
Unique values in BPMeds are: [0. 1.])
Unique values in prevalentStroke are: [0 1])
Unique values in prevalentHyp are: [0 1])
Unique values in diabetes are: [0 1])


In [95]:
data = df.copy()

In [96]:
X = data.drop(['TenYearCHD'], axis = 1)
Y = data['TenYearCHD']

# Visualising the input data
X.head()

Unnamed: 0,sex,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0


In [97]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 8, stratify = Y, shuffle = True)

In [98]:
Y_train.value_counts()

TenYearCHD
0    2480
1     446
Name: count, dtype: int64

In [99]:
Y_test.value_counts()

TenYearCHD
0    621
1    111
Name: count, dtype: int64

In [100]:
# Handling Imbalanced Dataset
smote = SMOTE(random_state = 8)
X_smote, Y_train_final = smote.fit_resample(X_train, Y_train)

In [101]:
# Scaling the train and test data according to train data
scaler = MinMaxScaler()
X_train_final = scaler.fit_transform(X_smote)
X_test_final = scaler.transform(X_test)

In [120]:
input_data = (0,53,3.0,0,0.0,1.0,0,1,1,311.0,206.0,92.0,21.51,76.0,215.0)

In [121]:
# Assuming you have a single observation or multiple observations stored in `X_new`
# Example: X_new = [data_for_new_prediction]
# You should replace this with actual new data for prediction
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_final, Y_train_final)
input_data_as_numpy_array= np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)
prediction = rf_classifier.predict(input_data_reshaped)
# Check the prediction and print the result
if prediction[0] == 0:
    print('The Person does not have Heart Disease')
else:
    print('The Person has Heart Disease')

# For printing results of multiple predictions, if X_new contains multiple observations
#for i, pred in enumerate(prediction):
    #if pred == 0:
    #     print(f'Observation {i+1}: The Person does not have Heart Disease')
    # else:
    #     print(f'Observation {i+1}: The Person has Heart Disease')


The Person has Heart Disease


In [None]:
# Function to plot the Confusion Matrix
def confusion_plot(cm):
  '''Plots the Confusion Matrix given as input'''
  cmd = ConfusionMatrixDisplay(cm, display_labels = ['No risk (0)', 'Risk (1)'])
  cmd.plot(cmap = 'Blues')
  plt.title('Confusion Matrix for Test Data')
  plt.show()

# Function to train and test a given classification model
def model_train_test(model, train_x, train_y, test_x, test_y, gs = False, confusion = True):
  '''Trains the classification model given as input. Other inputs include Test and train data
  and a Boolean to inform the function if GridSearch is being performed
  Returns the train and test Recalls and ROC-AUC scores, the test data predictions, and the final model'''

  model.fit(train_x, train_y)
  # if gs == True:
  #   print(f'Best model parameters are: {model.best_params_}')
  #   print(f'Best model score is: {model.best_score_}\n')
  #   model = model.best_estimator_

  # Getting the train and test predictions
  train_preds = model.predict(train_x)
  train_recall = recall_score(y_true = train_y, y_pred = train_preds, average='binary')
  train_acc = accuracy_score(train_preds, train_y)
  test_preds = model.predict(test_x)
  test_recall = recall_score(y_true = test_y, y_pred = test_preds, average='binary')
  test_acc = accuracy_score(test_preds, test_y)


  # change the input data to a numpy array
  input_data_as_numpy_array= np.asarray(input_data)

  # reshape the numpy array as we are predicting for only on instance
  input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

  prediction = model.predict(input_data_reshaped)
  print(prediction)

# Check the prediction and print the result
if prediction[0] == 0:
    print('The Person does not have Heart Disease')
else:
    print('The Person has Heart Disease')

# For printing results of multiple predictions, if X_new contains multiple observations
for i, pred in enumerate(prediction):
    if pred == 0:
        print(f'Observation {i+1}: The Person does not have Heart Disease')
    else:
        print(f'Observation {i+1}: The Person has Heart Disease')

  # Plotting confusion matrix
  #if confusion == True:
    #confusion_plot(confusion_matrix(test_y, test_preds))
  # Calculating the confusion matrix
print(confusion_matrix(test_y, test_preds))

output_metrics = {'Train Recall':train_recall, 'Test Recall':test_recall, 'Train ACC':train_acc, 'Test ACC':test_acc}
return output_metrics, test_preds, model

## Logistic Regression

In [None]:
# Training the model
lr_scores, lr_test_preds, lr_model = model_train_test(LogisticRegression(), X_train_final, Y_train_final, X_test_final, Y_test)

In [None]:
# Classification Report
print(classification_report(Y_test, lr_test_preds, target_names=['class-0', 'class-1']))

In [None]:
# Creating a dictionary of lists to store Train and test Recalls and ROC-AUC scores
scores = dict()
scores['Train Recall'] = []
scores['Test Recall'] = []
scores['Train ACC'] = []
scores['Test ACC'] = []
model_names = ['Logistic Regression', 'Random Forest', 'Decision Tree']


In [None]:
# Printing the train and test Recalls and ROC-AUC scores
def print_scores(model_name, model_scores):
  '''Function to print the scores of a given model'''
  print(f"The train and test recalls of the {model_name} Model are: {round(model_scores['Train Recall'] * 100, 2)}% and {round(model_scores['Test Recall'] * 100, 2)}% respectively")
  print(f"The train and test accuracy scores of the {model_name} Model are: {round(model_scores['Train ACC'] * 100, 2)}% and {round(model_scores['Test ACC'] * 100, 2)}% respectively")

print_scores(model_name = model_names[0], model_scores = lr_scores)

## Random Forest

In [None]:
cv = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 3, random_state = 42)
scorer = make_scorer(recall_score, average = 'binary')

In [None]:
# Defining the Hyperparameters
params_rf = {
              'n_estimators':[50, 100, 200],
              'max_depth':[3, 4, 5],
              'min_samples_split':[10, 20, 25],
              'min_samples_leaf':[10, 20, 25]
              }

rf_model = RandomForestClassifier(criterion= 'entropy', random_state = 42)
#rf_models = GridSearchCV(rf_model, params_rf, cv = cv, scoring = scorer)

In [None]:
# Check the prediction and print the result
if prediction[0] == 0:
    print('The Person does not have Heart Disease')
else:
    print('The Person has Heart Disease')

# For printing results of multiple predictions, if X_new contains multiple observations
for i, pred in enumerate(prediction):
    if pred == 0:
        print(f'Observation {i+1}: The Person does not have Heart Disease')
    else:
        print(f'Observation {i+1}: The Person has Heart Disease')

In [None]:
# Classification Report
print(classification_report(Y_test, lr_test_preds, target_names=['class-0', 'class-1']))

In [None]:
# Printing the model scores
print_scores(model_name = model_names[1], model_scores = rf_scores)

## Decision Trees

In [122]:
# Defining the Hyperparameters
params_dt = {
              'max_depth' : [3, 4, 5],
              'min_samples_split':[10, 20, 25, 30],
              'min_samples_leaf':[10, 20, 25, 30]
              }

dt_model = DecisionTreeClassifier(criterion= 'entropy', random_state = 42)
dt_models = GridSearchCV(dt_model, params_dt, cv = cv, scoring = scorer)

In [123]:
# Check the prediction and print the result
if prediction[0] == 0:
    print('The Person does not have Heart Disease')
else:
    print('The Person has Heart Disease')

# For printing results of multiple predictions, if X_new contains multiple observations
for i, pred in enumerate(prediction):
    if pred == 0:
        print(f'Observation {i+1}: The Person does not have Heart Disease')
    else:
        print(f'Observation {i+1}: The Person has Heart Disease')

The Person has Heart Disease
Observation 1: The Person has Heart Disease


In [None]:
#!pip install tabula-py

In [None]:
#from tabula import read_pdf

In [None]:
# Define features and target variable
X = df.drop('TenYearCHD', axis=1)
y = df['TenYearCHD']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Predict on the testing set
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("Classification Report:")
print(classification_report(y_test, y_pred))


In [None]:
# Classification Report
print(classification_report(Y_test, dt_test_preds, target_names=['class-0', 'class-1']))

In [None]:
# Printing the model scores
print_scores(model_name = model_names[2], model_scores = dt_scores)