In [2]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

#  Import and read the data.
import pandas as pd 
cdc_df2018 = pd.read_csv('Resources/cdc_heart2018.csv')
cdc_df2018.head()

Unnamed: 0.1,Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,...,_MAM5022,_RFPAP34,_RFPSA22,_RFBLDS3,_COL10YR,_HFOB3YR,_FS5YR,_FOBTFS,_CRCREC,_AIDTST3
0,0,1.0,1.0,b'01052018',b'01',b'05',b'2018',1100.0,b'2018000001',2018000000.0,...,,,,,,,,,,2.0
1,1,1.0,1.0,b'01122018',b'01',b'12',b'2018',1100.0,b'2018000002',2018000000.0,...,,1.0,,,,,,,,2.0
2,2,1.0,1.0,b'01082018',b'01',b'08',b'2018',1100.0,b'2018000003',2018000000.0,...,,,,,,,,,,2.0
3,3,1.0,1.0,b'01032018',b'01',b'03',b'2018',1100.0,b'2018000004',2018000000.0,...,,,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
4,4,1.0,1.0,b'01122018',b'01',b'12',b'2018',1100.0,b'2018000005',2018000000.0,...,,,,,,,,,,2.0


In [3]:
# Detemine columns
cdc_df2018 = cdc_df2018[['CVDCRHD4', 'LASTDEN4', 'RMVTETH4']]
cdc_df2018.head()

Unnamed: 0,CVDCRHD4,LASTDEN4,RMVTETH4
0,2.0,1.0,1.0
1,2.0,2.0,8.0
2,2.0,1.0,7.0
3,2.0,3.0,1.0
4,2.0,1.0,8.0


In [4]:
# Rename columns
cdc_df2018 = cdc_df2018.rename(columns={
    "CVDCRHD4": "2018AnginaHeartDisease",
    "LASTDEN4": "2018LastDental",
    "RMVTETH4": "2018TeethRemoved"
})
cdc_df2018.head()

Unnamed: 0,2018AnginaHeartDisease,2018LastDental,2018TeethRemoved
0,2.0,1.0,1.0
1,2.0,2.0,8.0
2,2.0,1.0,7.0
3,2.0,3.0,1.0
4,2.0,1.0,8.0


In [5]:
# Determine the number of unique values in each column.
cdc_df2018.nunique()

2018AnginaHeartDisease    4
2018LastDental            7
2018TeethRemoved          6
dtype: int64

In [6]:
# Look at 2018AnginaHeartDisease value counts
cdc_df2018['2018AnginaHeartDisease'].value_counts()

2.0    407649
1.0     26114
7.0      3379
9.0       271
Name: 2018AnginaHeartDisease, dtype: int64

In [7]:
# Drop 7s and 9s
cdc_df2018 = cdc_df2018.loc[cdc_df2018["2018AnginaHeartDisease"] != 7]
cdc_df2018 = cdc_df2018.loc[cdc_df2018["2018AnginaHeartDisease"] != 9]

In [8]:
# Look at 2018AnginaHeartDisease value counts
cdc_df2018['2018AnginaHeartDisease'].value_counts()

2.0    407649
1.0     26114
Name: 2018AnginaHeartDisease, dtype: int64

In [9]:
# Look at 2018LastDental value counts
cdc_df2018['2018LastDental'].value_counts()

1.0    294248
4.0     48104
2.0     46165
3.0     38363
7.0      4200
8.0      2352
9.0       327
Name: 2018LastDental, dtype: int64

In [10]:
# Drop 7s and 9s
cdc_df2018 = cdc_df2018.loc[cdc_df2018["2018LastDental"] != 7]
cdc_df2018 = cdc_df2018.loc[cdc_df2018["2018LastDental"] != 9]

In [11]:
# Look at 2018LastDental value counts
cdc_df2018['2018LastDental'].value_counts()

1.0    294248
4.0     48104
2.0     46165
3.0     38363
8.0      2352
Name: 2018LastDental, dtype: int64

In [12]:
# Look at 2018TeethRemoved value counts
cdc_df2018['2018TeethRemoved'].value_counts()

8.0    215998
1.0    125426
2.0     50357
3.0     27972
7.0      8967
9.0       511
Name: 2018TeethRemoved, dtype: int64

In [13]:
# Drop 7s and 9s
cdc_df2018 = cdc_df2018.loc[cdc_df2018["2018TeethRemoved"] != 7]
cdc_df2018 = cdc_df2018.loc[cdc_df2018["2018TeethRemoved"] != 9]

In [14]:
# Look at 2018TeethRemoved value counts
cdc_df2018['2018TeethRemoved'].value_counts()

8.0    215998
1.0    125426
2.0     50357
3.0     27972
Name: 2018TeethRemoved, dtype: int64

In [15]:
# Split our preprocessed data into our features and target arrays
y = cdc_df2018.dropna()["2018AnginaHeartDisease"]
X = cdc_df2018.drop(columns=["2018AnginaHeartDisease"]).dropna()

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [16]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [17]:
# Create a random forest classifier instance
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [18]:
# Fit the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [19]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [20]:
# Calculating the confusion matrix
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

cm = confusion_matrix(y_test, predictions)

In [21]:
# Calculate the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [22]:
# Displaying results
target_names = ["YES", "NO"]
print("Confusion Matrix")
display(cm)
print(f"Accuracy Score: {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions, target_names=target_names))

Confusion Matrix


array([[    0,  6242],
       [    0, 98697]])

Accuracy Score: 0.9405178246409819
Classification Report
              precision    recall  f1-score   support

         YES       0.00      0.00      0.00      6242
          NO       0.94      1.00      0.97     98697

    accuracy                           0.94    104939
   macro avg       0.47      0.50      0.48    104939
weighted avg       0.88      0.94      0.91    104939



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import balanced_accuracy_score

lr_model=LogisticRegression(max_iter=200,random_state=1)
rf_model=RandomForestClassifier(random_state=56,n_estimators=20)
gnb_model=GaussianNB(var_smoothing=1e-03)
bg_model=BaggingClassifier(n_estimators=13,random_state=2)
knn_model=KNeighborsClassifier(n_neighbors=20)

models={"LogisticRegression":lr_model,
        'RandomForest':rf_model,
        'GaussianNB':gnb_model,
        'BaggingClassifier':bg_model,
        'KNeighborsClassifier':knn_model}

for x in models:
    model=models[x]
    model.fit(X_train,y_train)
    predictions=model.predict(X_test)
    print(f'*******   {x}   ******')
    print(f"balanced accuracy score: {balanced_accuracy_score(y_test, predictions)}")
    # Print the classification report for the model
    target_names = ["YES", "NO"]
    print(classification_report(y_test, predictions, target_names=target_names))
    print('*'*40)

*******   LogisticRegression   ******
balanced accuracy score: 0.5


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         YES       0.00      0.00      0.00      6242
          NO       0.94      1.00      0.97     98697

    accuracy                           0.94    104939
   macro avg       0.47      0.50      0.48    104939
weighted avg       0.88      0.94      0.91    104939

****************************************
*******   RandomForest   ******
balanced accuracy score: 0.5


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         YES       0.00      0.00      0.00      6242
          NO       0.94      1.00      0.97     98697

    accuracy                           0.94    104939
   macro avg       0.47      0.50      0.48    104939
weighted avg       0.88      0.94      0.91    104939

****************************************
*******   GaussianNB   ******
balanced accuracy score: 0.49996853019191234
              precision    recall  f1-score   support

         YES       0.06      0.01      0.01      6242
          NO       0.94      0.99      0.97     98697

    accuracy                           0.94    104939
   macro avg       0.50      0.50      0.49    104939
weighted avg       0.89      0.94      0.91    104939

****************************************
*******   BaggingClassifier   ******
balanced accuracy score: 0.5


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         YES       0.00      0.00      0.00      6242
          NO       0.94      1.00      0.97     98697

    accuracy                           0.94    104939
   macro avg       0.47      0.50      0.48    104939
weighted avg       0.88      0.94      0.91    104939

****************************************
*******   KNeighborsClassifier   ******
balanced accuracy score: 0.5
              precision    recall  f1-score   support

         YES       0.00      0.00      0.00      6242
          NO       0.94      1.00      0.97     98697

    accuracy                           0.94    104939
   macro avg       0.47      0.50      0.48    104939
weighted avg       0.88      0.94      0.91    104939

****************************************


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train)
hidden_nodes_layer1 = 300
hidden_nodes_layer2 = 150
hidden_nodes_layer3 = 75

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="sigmoid")
)

# Third hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="sigmoid")
)

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

2023-05-22 18:16:31.750871: I tensorflow/core/platform/cpu_feature_guard.cc:145] This TensorFlow binary is optimized with Intel(R) MKL-DNN to use the following CPU instructions in performance critical operations:  SSE4.1 SSE4.2
To enable them in non-MKL-DNN operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-22 18:16:31.753907: I tensorflow/core/common_runtime/process_util.cc:115] Creating new thread pool with default inter op setting: 8. Tune using inter_op_parallelism_threads for best performance.


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 300)               94444500  
_________________________________________________________________
dense_1 (Dense)              (None, 150)               45150     
_________________________________________________________________
dense_2 (Dense)              (None, 75)                11325     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 76        
Total params: 94,501,051
Trainable params: 94,501,051
Non-trainable params: 0
_________________________________________________________________


In [25]:
# Create the random forest classifier instance
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [26]:
# Fit the model and use .ravel()on the "y_train" data. 
rf_model = rf_model.fit(X_train_scaled, y_train.ravel())

In [27]:
# Make predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [28]:
# Calculating the confusion matrix 
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [29]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print("Accuracy Score: ",  {acc_score})
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,0,6242
Actual 1,0,98697


Accuracy Score:  {0.9405178246409819}
Classification Report
              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00      6242
         2.0       0.94      1.00      0.97     98697

    accuracy                           0.94    104939
   macro avg       0.47      0.50      0.48    104939
weighted avg       0.88      0.94      0.91    104939



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
