<a href="https://colab.research.google.com/github/hatkiet/Project_4/blob/Mia-branch/Heart_Attack_ML_3_MH.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Import required libraries and dependencies
import warnings
# Install a blanket "ignore" filter: with no category argument, simplefilter
# applies to the base Warning class, so every warning raised after this line
# is suppressed for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings from the
# libraries imported below — consider narrowing the filter to a category.
warnings.simplefilter(action='ignore')

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix,accuracy_score,roc_curve,classification_report

In [3]:
# Read the raw CSV (located in the working directory) into the DataFrame `df`
df = pd.read_csv(filepath_or_buffer="heart_2022_with_nans.csv")
# Preview the first five rows as the cell output
df.head(n=5)

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,,No,...,,,,No,No,Yes,No,"Yes, received tetanus shot but not sure what type",No,No
1,Alabama,Female,Excellent,0.0,0.0,,No,6.0,,No,...,1.6,68.04,26.57,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,No
2,Alabama,Female,Very good,2.0,3.0,Within past year (anytime less than 12 months ...,Yes,5.0,,No,...,1.57,63.5,25.61,No,No,No,No,,No,Yes
3,Alabama,Female,Excellent,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,7.0,,No,...,1.65,63.5,23.3,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No
4,Alabama,Female,Fair,2.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,,No,...,1.57,53.98,21.77,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,No


In [4]:
# Count rows that are exact duplicates of an earlier row. The count is printed
# explicitly: a bare expression that is not the last statement of a cell is
# evaluated but never displayed, so the original count was silently discarded.
n_duplicates = df.duplicated().sum()
print(f"Duplicated rows: {n_duplicates}")

# Remove the duplicates by rebinding `df` instead of mutating it in place.
# Row labels are preserved, so the index can contain gaps afterwards.
df = df.drop_duplicates()

# Re-inspect column dtypes and non-null counts after de-duplication
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 87517 entries, 0 to 87551
Data columns (total 40 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   State                      87517 non-null  object 
 1   Sex                        87517 non-null  object 
 2   GeneralHealth              87255 non-null  object 
 3   PhysicalHealthDays         85065 non-null  float64
 4   MentalHealthDays           85469 non-null  float64
 5   LastCheckupTime            85842 non-null  object 
 6   PhysicalActivities         87286 non-null  object 
 7   SleepHours                 86410 non-null  float64
 8   RemovedTeeth               84112 non-null  object 
 9   HadHeartAttack             86800 non-null  object 
 10  HadAngina                  86577 non-null  object 
 11  HadStroke                  87162 non-null  object 
 12  HadAsthma                  87127 non-null  object 
 13  HadSkinCancer              86807 non-null  object 


In [5]:
# Number of missing entries per column, largest first
nan_values = df.isna().sum().sort_values(ascending=False)
# The same counts expressed as a percentage of the total row count
nan_values_percentage = nan_values.div(df.shape[0]).mul(100)

# Side-by-side summary (count and percentage), rounded to two decimals;
# `keys` labels the two columns directly instead of renaming 0 and 1 afterwards
pd.concat(
    [nan_values, nan_values_percentage],
    axis=1,
    keys=["NaN Values", "(%) of NaN Values"],
).round(2)

Unnamed: 0,NaN Values,(%) of NaN Values
TetanusLast10Tdap,17934,20.49
PneumoVaxEver,16939,19.36
HIVTesting,14780,16.89
ChestScan,12478,14.26
CovidPos,12001,13.71
HighRiskLastYear,11886,13.58
FluVaxLast12,11003,12.57
AlcoholDrinkers,10778,12.32
BMI,10232,11.69
WeightInKilograms,8704,9.95


In [7]:
# Give the columns shorter labels. rename() returns a new frame (`df1`);
# `df` keeps its original column names. Columns not listed are left unchanged.
df1 = df.rename(
    {
        # demographics
        'AgeCategory': "Age",
        'RaceEthnicityCategory': "Race",
        # self-reported general, physical and mental health
        'GeneralHealth': "GenHealth",
        'PhysicalHealthDays': "PhysicalHealth",
        'MentalHealthDays': "MentalHealth",
        'PhysicalActivities': "PhysicalActivity",
        'DifficultyWalking': "DiffWalking",
        # medical history ("Had..." prefix dropped)
        'HadHeartAttack': "HeartAttack",
        'HadAngina': "Angina",
        'HadStroke': "Stroke",
        'HadAsthma': "Asthma",
        'HadSkinCancer': "SkinCancer",
        'HadDepressiveDisorder': "Depressed",
        'HadKidneyDisease': "KidneyDisease",
        'HadDiabetes': "Diabetes",
        # lifestyle
        'SmokerStatus': "Smoking",
        'AlcoholDrinkers': "Drinking",
    },
    axis="columns",
)
# Show the renamed frame as the cell output
df1

Unnamed: 0,State,Sex,GenHealth,PhysicalHealth,MentalHealth,LastCheckupTime,PhysicalActivity,SleepHours,RemovedTeeth,HeartAttack,...,HeightInMeters,WeightInKilograms,BMI,Drinking,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,,No,...,,,,No,No,Yes,No,"Yes, received tetanus shot but not sure what type",No,No
1,Alabama,Female,Excellent,0.0,0.0,,No,6.0,,No,...,1.60,68.04,26.57,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,No
2,Alabama,Female,Very good,2.0,3.0,Within past year (anytime less than 12 months ...,Yes,5.0,,No,...,1.57,63.50,25.61,No,No,No,No,,No,Yes
3,Alabama,Female,Excellent,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,7.0,,No,...,1.65,63.50,23.30,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No
4,Alabama,Female,Fair,2.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,,No,...,1.57,53.98,21.77,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87547,Hawaii,Male,Good,0.0,7.0,Within past year (anytime less than 12 months ...,Yes,7.0,None of them,No,...,1.75,72.57,23.63,No,,Yes,,,,
87548,Hawaii,Male,Very good,1.0,5.0,Within past year (anytime less than 12 months ...,Yes,6.0,None of them,No,...,1.91,95.25,26.25,No,Yes,Yes,No,"Yes, received Tdap",Yes,No
87549,Hawaii,Male,Fair,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,6.0,None of them,No,...,1.75,117.93,38.39,Yes,,No,No,,No,Yes
87550,Hawaii,Male,Good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,6.0,None of them,No,...,1.70,79.38,27.41,Yes,Yes,Yes,No,"Yes, received Tdap",No,No


In [8]:
# Keep only the rows that have a value in every column.
# NOTE(review): the NaN summary above reports columns with ~20% missing
# values, so this listwise deletion removes a large share of the rows —
# confirm that this is intended rather than dropping/imputing per column.
df1 = df1.dropna(axis=0, how="any")

# Confirm that no column has missing entries left (all counts should be 0)
df1.isna().sum()

State                        0
Sex                          0
GenHealth                    0
PhysicalHealth               0
MentalHealth                 0
LastCheckupTime              0
PhysicalActivity             0
SleepHours                   0
RemovedTeeth                 0
HeartAttack                  0
Angina                       0
Stroke                       0
Asthma                       0
SkinCancer                   0
HadCOPD                      0
Depressed                    0
KidneyDisease                0
HadArthritis                 0
Diabetes                     0
DeafOrHardOfHearing          0
BlindOrVisionDifficulty      0
DifficultyConcentrating      0
DiffWalking                  0
DifficultyDressingBathing    0
DifficultyErrands            0
Smoking                      0
ECigaretteUsage              0
ChestScan                    0
Race                         0
Age                          0
HeightInMeters               0
WeightInKilograms            0
BMI     