In [4]:
# importing libraries
import boto3
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# defining the bucket
s3 = boto3.resource('s3')
bucket_name = 'webster-data445-bucket'
bucket = s3.Bucket(bucket_name)

# defining the csv file
file_key = 'framingham.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# reading the csv file
heart = pd.read_csv(file_content_stream)
# dropping missing values
heart = heart.dropna()
heart.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [5]:
# defining the input and target variables
X = heart[['male', 'age', 'currentSmoker', 'totChol', 'sysBP', 'BMI', 'heartRate', 'glucose']]
Y = heart['TenYearCHD']

# splitting the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

In [9]:
# adaboost model
AB = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 3), n_estimators = 500).fit(X_train, Y_train)

# extracting feature importance
AB.feature_importances_ # BMI, totChol, sysBP, glucose, age

array([0.02072402, 0.10932235, 0.01402883, 0.1800376 , 0.17604924,
       0.24758169, 0.10772082, 0.14453546])

In [10]:
# creating new train and test with our top 5 variables
X_train_ada = X_train[['BMI', 'totChol', 'sysBP', 'glucose', 'age']]
X_test_ada = X_test[['BMI', 'totChol', 'sysBP', 'glucose', 'age']]

In [12]:
# adaboost model with our top 5 variables
AB_top5 = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 3), n_estimators = 500).fit(X_train_ada, Y_train)

# predicting on test
pred1 = AB_top5.predict_proba(X_test_ada)[:, 1]
pred1 = np.where(pred1 < 0.1, 0, 1) # changing to labels

# computing recall
recall_score(Y_test, pred1)

1.0

In [13]:
# building the random forest
RF = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_train, Y_train)

# extracting feature importance
RF.feature_importances_ # age, sysBP, glucose, BMI, totChol

array([0.0518358 , 0.32865228, 0.0068524 , 0.07800458, 0.29465727,
       0.09334717, 0.03249073, 0.11415977])

In [14]:
# creating new train and test with our top 5 variables
X_train_rf = X_train[['age', 'sysBP', 'glucose', 'BMI', 'totChol']]
X_test_rf = X_test[['age', 'sysBP', 'glucose', 'BMI', 'totChol']]

In [15]:
# building the random forest with our top 5 variables
RF_top5 = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_train_rf, Y_train)

# predicting on test
pred2 = RF_top5.predict_proba(X_test_rf)[:, 1]
pred2 = np.where(pred2 < 0.1, 0, 1) # changing to labels

# computing recall
recall_score(Y_test, pred2)

0.7857142857142857

In [16]:
print('The recall score of the Ada Boost Classifier is', recall_score(Y_test, pred1))
print('The recall score of the Random Forest Classifier is', recall_score(Y_test, pred2))

## We would use the Ada Boost Classifier to predict TenYearCHD because the recall is higher than the Random Forest Classifier.

The recall score of the Ada Boost Classifier is 1.0
The recall score of the Random Forest Classifier is 0.7857142857142857
