In [1]:
import os
from collections import Counter

import joblib
import matplotlib.pyplot as plt
import numpy as np
import optuna as op
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.preprocessing import LabelEncoder

In [12]:
# Load data
# The CSV location can be overridden with the HEART_ATTACK_DATA_CSV environment
# variable so the notebook can run on machines other than the one it was
# written on. When the variable is unset (or empty) the original absolute path
# is used unchanged.
DEFAULT_DATA_PATH = r"C:\Users\HP\Documents\datascience projects\heart attack prediction\heart_attack_youngsters_india.csv"
data_path = os.environ.get("HEART_ATTACK_DATA_CSV") or DEFAULT_DATA_PATH
df = pd.read_csv(data_path)

# Normalise column names: remove any parenthesised text, trim surrounding
# whitespace, then turn the remaining spaces into underscores.
df.columns = (
    df.columns
    .str.replace(r"\(.*?\)", "", regex=True)
    .str.strip()
    .str.replace(" ", "_")
)

In [None]:
# Reduced-feature model: the same SMOTE -> scale -> polynomial -> logistic
# regression pipeline, trained without a hand-picked set of columns.
#
# The cell prepares its own features and target from `df`, working on a copy,
# so it does not rely on `X` / `y` created by other cells and leaves `df`
# untouched for them.

# Drop unnecessary columns
columns_to_drop = [
    'Region', 'Screen_Time', 'Diet_Type', 'ECG_Results',
    'Family_History_of_Heart_Disease', 'Resting_Heart_Rate',
    'Blood_Oxygen_Levels', 'Triglyceride_Levels'
]
target_column = 'Heart_Attack_Likelihood'

reduced_df = df.copy()

# Blood_Pressure holds "<systolic>/<diastolic>" text. Split it here only if the
# column is still present, so the cell works both before and after the
# dedicated preprocessing cell has removed it.
if "Blood_Pressure" in reduced_df.columns:
    bp_parts = reduced_df["Blood_Pressure"].str.split("/", expand=True)
    reduced_df["Systolic"] = bp_parts[0].astype("float64")
    reduced_df["Diastolic"] = bp_parts[1].astype("float64")
    reduced_df = reduced_df.drop(columns=["Blood_Pressure"])

# Encode the target on its own and keep it out of the feature matrix;
# leaving it in X would leak the label into the model inputs.
reduced_target_encoder = LabelEncoder()
y = reduced_target_encoder.fit_transform(reduced_df[target_column])

# Drop columns in the dataset (raises KeyError if any listed column is absent)
X = reduced_df.drop(columns=columns_to_drop + [target_column])

# SMOTE and StandardScaler need numeric input, so label-encode every remaining
# text column. One encoder per column keeps each fitted mapping available.
reduced_feature_encoders = {}
for col in X.select_dtypes(include='object').columns:
    reduced_feature_encoders[col] = LabelEncoder()
    X[col] = reduced_feature_encoders[col].fit_transform(X[col])

# Stratified 70/30 split on the reduced feature set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Apply SMOTE to the training set only; the test set keeps its original class balance
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Verify class distribution after SMOTE
print("After SMOTE class distribution:", Counter(y_train_smote))

# Normalize the features (statistics come from the resampled training data)
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train_smote)
X_test_std = scaler.transform(X_test)

# Apply polynomial transformation
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train_std)
X_test_poly = poly.transform(X_test_std)

# Logistic Regression Model
# NOTE(review): C looks like the result of a tuning run — confirm its origin.
model = LogisticRegression(
    penalty='l2',
    C=0.4347658976229592,
    random_state=42,
    solver='lbfgs',
    max_iter=1000,
    n_jobs=-1,
)
lor_model = model.fit(X_train_poly, y_train_smote)
y_cap = lor_model.predict(X_test_poly)

# Save the model and every fitted transformer it depends on. The scaler is
# saved too because the polynomial transformer and the model were fitted on
# standardized inputs, so new data must be scaled the same way first.
joblib.dump(lor_model, "Heart_Attack_Prediction_model.pkl")
joblib.dump(poly, "poly_transformer.pkl")
joblib.dump(scaler, "scaler.pkl")


In [13]:
# Data preprocessing
# Blood_Pressure holds "<systolic>/<diastolic>" text; break it into two float
# columns and remove the original. The guard keeps the cell safe to re-run:
# once Blood_Pressure has been removed there is nothing left to split, so the
# cell becomes a no-op instead of raising KeyError.
if "Blood_Pressure" in df.columns:
    bp_parts = df["Blood_Pressure"].str.split("/", expand=True)  # split once, reuse both halves
    df["Systolic"] = bp_parts[0].astype("float64")
    df["Diastolic"] = bp_parts[1].astype("float64")
    df = df.drop(columns=["Blood_Pressure"])

In [14]:
# Separate features and target variable
# `y` is the label column; `X` is every other column of `df`.
target_name = 'Heart_Attack_Likelihood'
y = df[target_name]
X = df.drop(columns=[target_name])

In [15]:
# Encode categorical columns
# Every text column gets its own LabelEncoder, stored in `feature_encoders`
# under the column name, so each fitted category -> integer mapping stays
# available (e.g. to encode new samples identically or to decode values).
# A single shared encoder would be re-fitted on each column and retain only
# the last mapping.
cat_cols = X.select_dtypes(include='object').columns
feature_encoders = {}
for col in cat_cols:
    feature_encoders[col] = LabelEncoder()
    X[col] = feature_encoders[col].fit_transform(X[col])

# Encode target variable
# `encoder` is bound to the encoder fitted on the target, so encoder.classes_
# and encoder.inverse_transform describe the target labels.
encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [16]:
# Split the data into training and testing sets
# 70/30 split, stratified on the target, with a fixed seed.
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Apply SMOTE to the training set
# Only the training portion is resampled; the test portion is left as split.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [17]:
# Standardize the features
# Fit the scaler on the resampled training data, then apply that same fitted
# scaler to both the training and the test features.
scaler = StandardScaler().fit(X_train_smote)
X_train_std = scaler.transform(X_train_smote)
X_test_std = scaler.transform(X_test)

In [18]:
# Apply PolynomialFeatures
# Degree-2 expansion fitted on the standardized training features and then
# applied unchanged to the standardized test features.
poly = PolynomialFeatures(degree=2).fit(X_train_std)
X_train_poly = poly.transform(X_train_std)
X_test_poly = poly.transform(X_test_std)

In [19]:
# Train Logistic Regression model
# L2-penalised logistic regression with fixed hyper-parameters.
# NOTE(review): C looks like the result of a tuning run — confirm its origin.
model = LogisticRegression(
    penalty='l2',
    C=0.4347658976229592,
    random_state=42,
    solver='lbfgs',
    max_iter=1000,
    n_jobs=-1,
)
model.fit(X_train_poly, y_train_smote)
lor_model = model  # fit() returns the estimator itself, so both names point at the fitted model

In [20]:
# Predict on the test set
# Class predictions for the polynomial-expanded test features.
y_pred = lor_model.predict(X=X_test_poly)

In [21]:
# Evaluate the model
# Compute each metric for the test predictions, then print them in order under
# their headings.
evaluation = (
    ("Accuracy Score:", accuracy_score(y_test, y_pred)),
    ("Classification Report:\n", classification_report(y_test, y_pred)),
    ("Confusion Matrix:\n", confusion_matrix(y_test, y_pred)),
)
for heading, result in evaluation:
    print(heading, result)

Accuracy Score: 0.656
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.78      0.78      2389
           1       0.17      0.18      0.18       611

    accuracy                           0.66      3000
   macro avg       0.48      0.48      0.48      3000
weighted avg       0.66      0.66      0.66      3000

Confusion Matrix:
 [[1856  533]
 [ 499  112]]


In [22]:
# Save the model and the transformer
# Each fitted object is written to its own file in the working directory, in
# the order listed: model, polynomial transformer, scaler.
artifacts = (
    (lor_model, "Heart_Attack_Prediction_model1.pkl"),
    (poly, "poly_transformer1.pkl"),
    (scaler, "scaler1.pkl"),
)
written = [joblib.dump(obj, filename) for obj, filename in artifacts]
written[-1]  # last expression, so the cell still displays the result of the final dump


['scaler1.pkl']