In [1]:
import pandas as pd 

In [2]:
# Read the cleaned dataset CSV into a DataFrame and preview its first five rows
dataset_path = "Cleaned_dataset.csv"
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,ChestScan,RaceEthnicityCategory,AgeCategory,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,FluVaxLast12,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,4.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,No,"White only, Non-Hispanic",Age 65 to 69,1.6,71.67,27.99,No,Yes,No,No
1,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,6.0,None of them,No,...,No,"White only, Non-Hispanic",Age 70 to 74,1.78,95.25,30.13,No,Yes,No,No
2,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,"6 or more, but not all",No,...,Yes,"White only, Non-Hispanic",Age 75 to 79,1.85,108.86,31.66,Yes,No,No,Yes
3,Alabama,Female,Fair,5.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,No,"White only, Non-Hispanic",Age 80 or older,1.7,90.72,31.32,No,Yes,No,Yes
4,Alabama,Female,Good,3.0,15.0,Within past year (anytime less than 12 months ...,Yes,5.0,1 to 5,No,...,No,"White only, Non-Hispanic",Age 80 or older,1.55,79.38,33.07,No,Yes,No,No


In [3]:
# Check the columns: print each column's name, non-null count and dtype,
# which shows at a glance which columns are text (object) and which are numeric
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 296352 entries, 0 to 296351
Data columns (total 37 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   State                      296352 non-null  object 
 1   Sex                        296352 non-null  object 
 2   GeneralHealth              296352 non-null  object 
 3   PhysicalHealthDays         296352 non-null  float64
 4   MentalHealthDays           296352 non-null  float64
 5   LastCheckupTime            296352 non-null  object 
 6   PhysicalActivities         296352 non-null  object 
 7   SleepHours                 296352 non-null  float64
 8   RemovedTeeth               296352 non-null  object 
 9   HadHeartAttack             296352 non-null  object 
 10  HadAngina                  296352 non-null  object 
 11  HadStroke                  296352 non-null  object 
 12  HadAsthma                  296352 non-null  object 
 13  HadSkinCancer              29

In [4]:
# Columns with exactly two distinct values are the candidates for 0/1 encoding
binary_columns = [name for name in df.columns if df[name].nunique() == 2]

# Recode only the columns whose two values are exactly "Yes" and "No"
# (Yes -> 1, No -> 0); any other two-valued column is left as it is
yes_no_codes = {"Yes": 1, "No": 0}
for name in binary_columns:
    if set(df[name].unique()) == set(yes_no_codes):
        df[name] = df[name].map(yes_no_codes)

# Print the distinct values now present in each binary column to verify the result
for name in binary_columns:
    print(df[name].unique())

['Female' 'Male']
[1 0]
[0 1]
[0 1]
[0 1]
[0 1]
[0 1]
[0 1]
[0 1]
[0 1]
[1 0]
[0 1]
[0 1]
[0 1]
[0 1]
[0 1]
[0 1]
[0 1]
[0 1]
[1 0]
[0 1]


In [5]:
# Columns handed to the one-hot encoder
catogorical_columns = [
    'Sex',
    'GeneralHealth',
    'LastCheckupTime',
    'RemovedTeeth',
    'HadDiabetes',
    'SmokerStatus',
    'ECigaretteUsage',
    'RaceEthnicityCategory',
    'AgeCategory',
    'HighRiskLastYear',
    'CovidPos',
]

# Pull just those columns out of the main frame
Catogorical_col = df.loc[:, catogorical_columns]

# One-hot encode them as integer 0/1 indicator columns. get_dummies only expands
# object/category columns, so a column that is already numeric at this point
# (HighRiskLastYear, per the output header) is carried through unchanged.
Transform_df = pd.get_dummies(Catogorical_col, dtype=int)

# Preview the encoded frame
Transform_df.head()

Unnamed: 0,HighRiskLastYear,Sex_Female,Sex_Male,GeneralHealth_Excellent,GeneralHealth_Fair,GeneralHealth_Good,GeneralHealth_Poor,GeneralHealth_Very good,LastCheckupTime_5 or more years ago,LastCheckupTime_Within past 2 years (1 year but less than 2 years ago),...,AgeCategory_Age 50 to 54,AgeCategory_Age 55 to 59,AgeCategory_Age 60 to 64,AgeCategory_Age 65 to 69,AgeCategory_Age 70 to 74,AgeCategory_Age 75 to 79,AgeCategory_Age 80 or older,CovidPos_No,CovidPos_Tested positive using home test without a health professional,CovidPos_Yes
0,0,1,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,1,0,0
1,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,1,0,0
2,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
3,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
4,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0


In [6]:
from sklearn.preprocessing import StandardScaler

# Select the columns to standardise. Only float64 columns are picked, so the
# integer 0/1 columns produced by the earlier Yes/No mapping are not selected.
numeric_columns = df.select_dtypes(include=["float64"]).columns
df_numeric = df[numeric_columns]

# Create the standard scaler instance, fit it, and scale the numeric columns.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split performed later in the notebook, so test-row statistics influence the
# scaled features. Fitting on the training rows only would need the split to
# happen first -- confirm whether this is acceptable for this analysis.
scaler = StandardScaler()
scaled_numeric_df = pd.DataFrame(
    scaler.fit_transform(df_numeric),
    columns=numeric_columns,
    # fit_transform returns a bare array; reuse the source row labels so the
    # later column-wise concat aligns row-for-row even if df's index is not 0..n-1
    index=df_numeric.index,
)

In [7]:
# Columns to leave out of the passthrough part of the frame: the categorical and
# numeric columns are re-added in encoded/scaled form below, and State is dropped
# outright without being encoded.
columns_to_drop = catogorical_columns + numeric_columns.tolist() + ['State']

# Drop into a new name instead of overwriting df, so this cell (and the cells
# above that read df) can be re-run without a KeyError on already-removed columns
passthrough_df = df.drop(columns=columns_to_drop)

# Combine the untouched columns, the dummy variables and the scaled numeric columns
merged_df = pd.concat([passthrough_df, Transform_df, scaled_numeric_df], axis=1)

# concat(axis=1) aligns on the index; a row-count change means the pieces were misaligned
assert len(merged_df) == len(df), "row count changed while combining feature frames"

# Display the first few rows of the merged DataFrame
merged_df.head()

Unnamed: 0,PhysicalActivities,HadHeartAttack,HadAngina,HadStroke,HadAsthma,HadSkinCancer,HadCOPD,HadDepressiveDisorder,HadKidneyDisease,HadArthritis,...,AgeCategory_Age 80 or older,CovidPos_No,CovidPos_Tested positive using home test without a health professional,CovidPos_Yes,PhysicalHealthDays,MentalHealthDays,SleepHours,HeightInMeters,WeightInKilograms,BMI
0,1,0,0,0,0,0,0,0,0,1,...,0,1,0,0,-0.008726,-0.523179,1.382336,-0.99559,-0.554117,-0.092198
1,1,0,0,0,0,0,0,0,0,1,...,0,1,0,0,-0.488297,-0.523179,-0.71567,0.692103,0.551293,0.236631
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,1,-0.488297,-0.523179,0.683001,1.348428,1.189318,0.47173
3,1,0,0,0,0,1,0,1,0,1,...,1,0,0,1,0.111167,-0.523179,1.382336,-0.057983,0.338931,0.419486
4,1,0,0,0,0,0,0,0,0,1,...,1,1,0,0,-0.128618,1.316711,-1.415005,-1.464394,-0.192679,0.688389


In [12]:
from sklearn.model_selection import train_test_split

# Separate the target column from the feature columns
target_column = "HadHeartAttack"
y = merged_df[target_column]
X = merged_df.drop(columns=target_column)

# Hold out 30% of the rows for testing; stratify=y keeps the target's class
# proportions the same in both partitions, and random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Show the shapes of the four resulting pieces
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(207446, 73) (88906, 73) (207446,) (88906,)


In [14]:
from sklearn.linear_model import LogisticRegression
# Logistic regression settings: lbfgs solver, at most 200 iterations, fixed seed
logreg_params = {"solver": 'lbfgs', "max_iter": 200, "random_state": 42}
classifier = LogisticRegression(**logreg_params)

# Train the model on the training partition
classifier.fit(X_train, y_train)

In [15]:
# Score the model on the training data (mean accuracy over X_train / y_train);
# compare with the test-data score to check for overfitting
print(f"Training Data Score: {classifier.score(X_train, y_train)}")

Training Data Score: 0.9508016544064479


In [16]:
# Predict a label for every row of the test partition
predictions = classifier.predict(X_test)

# Table of predicted vs. actual labels, with the row index renumbered from 0
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
results = results.reset_index(drop=True)

# Mean accuracy on the test partition
test_score = classifier.score(X_test, y_test)
print(f"Testing Data Score: {test_score}")

Testing Data Score: 0.9507007401075293


In [21]:
# Evaluate the model on the test partition
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Accuracy alone is a weak signal for this target: the classification report's
# support column shows far fewer positive rows than negative ones, so the
# confusion matrix and per-class precision/recall are printed alongside it.
print("Test Set Evaluation:")
print("Accuracy:", accuracy_score(y_test, predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))


Test Set Evaluation:
Accuracy: 0.9507007401075293
Confusion Matrix:
 [[83363   891]
 [ 3492  1160]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.97     84254
           1       0.57      0.25      0.35      4652

    accuracy                           0.95     88906
   macro avg       0.76      0.62      0.66     88906
weighted avg       0.94      0.95      0.94     88906

