<h1 style="color:#00FFFF;">Heart Risk Prediction</h1>


Heart disease is one of the primary reasons for health complications, and early detection is essential. This project uses machine learning to forecast heart attack risk from important medical parameters like blood pressure, cholesterol levels, age, and lifestyle factors.

Using methods such as undersampling, SMOTE, ensemble learning, threshold tuning, and hyperparameter tuning, the model aims to improve precision and recall for high-risk patients. The final model, deployed through Streamlit, offers a user-friendly interface for risk evaluation.

In [2]:
import os
os.environ["LOKY_MAX_CPU_COUNT"] = "4"


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, recall_score, precision_score

In [4]:
df=pd.read_csv('heart_2022_with_nans UPDATED.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'heart_2022_with_nans UPDATED.csv'

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.shape

In [None]:
df.columns

In [None]:
missing_percentage = (df.isnull().sum() / len(df)) * 100
plt.figure(figsize=(12, 6))
sns.barplot(x=missing_percentage.index, y=missing_percentage.values)
plt.xticks(rotation=90)
plt.title('Percentage of Missing Values in Each Column')
plt.ylabel('Percentage Missing')
plt.show()

In [None]:
df = df.dropna( subset=['HadHeartAttack'] )
df.shape
df

In [None]:
duplicate_rows = df[df.duplicated()]
print(f"Total duplicate rows: {len(duplicate_rows)}")


In [None]:
duplicate_counts = {col: df[col].duplicated().sum() for col in df.columns}
print(duplicate_counts)


In [None]:
for col in df.columns:
    print(f"\nColumn: {col}")
    print(df[col].value_counts())


In [None]:
df.describe(include='object')

In [None]:
for col in df.describe(include='object').columns:
    print('Column Name: ',col)
    print(df[col].unique())
    print('-------------------------------------------------')

In [None]:
df['HadDiabetes'] = df['HadDiabetes'].replace({
    'No, pre-diabetes or borderline diabetes': 'Borderline',
    'Yes, but only during pregnancy (female)': 'During Pregnancy'
})


df['HadDiabetes'].unique()

In [None]:
df.HadDiabetes

In [None]:
df['SmokerStatus'] = df['SmokerStatus'].replace({
    'Current smoker - now smokes some days': 'Current smoker (Some days)',
    'Current smoker - now smokes every day': 'Current smoker (Every day)'
})

df['SmokerStatus'].unique()  # Check unique values


In [None]:
df['ECigaretteUsage'] = df['ECigaretteUsage'].replace({
    'Not at all (right now)': 'Not at all',
    'Never used e-cigarettes in my entire life': 'Never',
    'Use them every day': 'Everyday',
    'Use them some days': 'Somedays'
})

df['ECigaretteUsage'].unique()  # Check unique values


In [None]:
df

In [None]:
df['AgeCategory'] = df['AgeCategory'].astype(str)  # Convert NaNs to strings
df['AgeCategory'] = (df['AgeCategory']
                     .str.replace("Age ", "")
                     .str.replace(" or older", "+")
                     .str.replace(" to ", "-")
                     .str.strip())  # Remove spaces


In [None]:
df.AgeCategory.unique()

In [None]:
def age_group(age):
    """Collapse a normalised age-band label into a coarse life-stage bucket.

    Returns "Young", "Middle-aged" or "Old" when ``age`` is one of the known
    band labels, and "Unknown" for every other value.
    """
    buckets = (
        ("Young", ('18-24', '25-29', '30-34', '35-39')),
        ("Middle-aged", ('40-44', '45-49', '50-54', '55-59')),
        ("Old", ('60-64', '65-69', '70-74', '75-79', '80+')),
    )
    for label, bands in buckets:
        if age in bands:
            return label
    # Not a recognised band (e.g. NaN or an unexpected value).
    return "Unknown"

In [None]:
df['AgeCategory'] = df['AgeCategory'].apply(age_group)

# 4. Verify the unique values
print(df['AgeCategory'].unique())


In [None]:
df.describe(include='object')

In [None]:
df[df.duplicated()]

In [None]:
df.drop_duplicates(inplace=True)

In [None]:
df.describe()



# **Visualizations (histplot(), distplot(), countplot(), barplot())**


---



In [None]:
gender_count = df['Sex'].value_counts()
plt.title("Gender Distribution", fontsize = 12, weight='bold')
plt.pie(gender_count,labels=gender_count.index,radius=1, autopct='%.2f%%')
plt.show()

In [None]:
plt.figure(figsize=(10, 5))
sns.set(style='darkgrid')
x = sns.countplot(data=df, x='AgeCategory')
plt.title("Distribution of Age Category", fontsize=12)
plt.xlabel("Age Category", fontsize=10)
plt.ylabel("Individuals", fontsize=10)
for c in x.containers:
    x.bar_label(c)

plt.show()


In [None]:
plt.figure(figsize=(5, 5))
sns.set(style='darkgrid')
x = sns.countplot(data=df, x='HadHeartAttack', hue='Sex', palette='viridis')
plt.title("Prevalence of Heart Attacks Among Different Genders", fontsize=12)
plt.xlabel("Had Heart Attack", fontsize=10)
plt.ylabel("Individuals", fontsize=10)
for c in x.containers:
    x.bar_label(c)

plt.show()

In [None]:
# Count of individuals per age group, split by heart-attack status.
plt.figure(figsize=(12, 6))
sns.set(style='darkgrid')
x = sns.countplot(data=df, x='AgeCategory', hue='HadHeartAttack', palette='viridis')
plt.title("Prevalence of Heart Attacks Among Different Age Groups", fontsize=12)
# The x-axis carries AgeCategory; HadHeartAttack is encoded by the hue (legend).
plt.xlabel("Age Category", fontsize=10)
plt.ylabel("Individuals", fontsize=10)
# Annotate every bar with its count.
for c in x.containers:
    x.bar_label(c)

plt.show()

In [None]:
# Flag BMI >= 30 as 'Yes'; a missing BMI compares False and is therefore labelled 'No'.
df['HadObesity'] = df['BMI'].ge(30).map({True: 'Yes', False: 'No'})

In [None]:
plt.figure(figsize=(15, 12))
sns.set(style='darkgrid')
risk_factors = ['PhysicalActivities', 'SmokerStatus', 'ECigaretteUsage', 'HadDiabetes', 'HadObesity']
for i, risk_factor in enumerate(risk_factors, 1):
    plt.subplot(3, 2, i)
    x = sns.countplot(data=df, x=risk_factor, hue='Sex', palette='viridis')
    plt.title(f"{risk_factor} Among Different Genders", fontsize=15)
    plt.xlabel(risk_factor, fontsize=12)
    plt.ylabel("Individuals", fontsize=12)
    for c in x.containers:
        x.bar_label(c)
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(13, 25))
sns.set(style='darkgrid')
risk_factors = ['PhysicalActivities', 'SmokerStatus', 'ECigaretteUsage', 'HadDiabetes', 'HadObesity']
for i, risk_factor in enumerate(risk_factors, 1):
    plt.subplot(5, 1, i)
    x = sns.countplot(data=df, x='AgeCategory', hue=risk_factor)
    plt.title(f"{risk_factor} Among Different Age Groups", fontsize=15)
    plt.xlabel("Age Category", fontsize=12)
    plt.ylabel("Individuals", fontsize=12)
    for c in x.containers:
        x.bar_label(c)
plt.tight_layout()
plt.show()


# **Handle outliers (IQR method, Z-score, boxplot())**

---



In [None]:
for col in df.describe().columns:
    sns.set_style('ticks')
    plt.figure(figsize=(16, 2))
    sns.boxplot(data=df, x=col)
    plt.show()

In [None]:
# Helper that extracts the IQR-based outliers of one DataFrame column
def get_outliers(df, column):
    """Return the rows of ``df`` whose ``column`` value lies outside the 1.5*IQR fences.

    Returns a tuple ``(outliers, lower_limit, upper_limit)`` where ``outliers``
    is the filtered DataFrame and the limits are the fences that were used.
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    lower_limit = first_quartile - (spread * 1.5)
    upper_limit = third_quartile + (spread * 1.5)

    too_low = values < lower_limit
    too_high = values > upper_limit
    outliers = df[too_low | too_high]

    return outliers, lower_limit, upper_limit

In [None]:
sleep_hours_outliers, lower_sleep, upper_sleep = get_outliers(df, 'SleepHours')
sleep_hours_outliers

In [None]:
# Report the IQR fences computed for SleepHours (stray ')' characters removed from the message).
print(f"Lower Limit:{lower_sleep}\nUpper Limit:{upper_sleep}")

In [None]:
# Dropping records with sleep less than 3 hours
df = df.drop(df[df['SleepHours'] < 3].index)
df.reset_index(drop=True, inplace=True)
df.shape

In [None]:
df.info()

In [None]:
df.drop(columns=['LastCheckupTime'], inplace=True)


In [None]:
columns_to_drop_na = ['GeneralHealth','PhysicalActivities','HadAngina','HadStroke','HadAsthma','HadSkinCancer',
  'HadCOPD','HadDepressiveDisorder','HadKidneyDisease','HadArthritis','HadDiabetes','AgeCategory',
  'MentalHealthDays','PhysicalHealthDays','SleepHours']

df.dropna(subset=columns_to_drop_na, inplace=True)
df.isna().sum()/len(df)*100


In [None]:
columns_with_nulls = df.columns[df.isnull().any()]
columns_with_nulls

In [None]:
df.head()

In [None]:
columns_to_check = ['SmokerStatus', 'ECigaretteUsage', 'BMI', 'AlcoholDrinkers']

# Display unique values for each column
for col in columns_to_check:
    if col in df.columns:
        print(f"Unique values in {col}: {df[col].unique()}\n")

In [None]:
missing_percentage = (df.isnull().sum() / len(df)) * 100
plt.figure(figsize=(12, 6))
sns.barplot(x=missing_percentage.index, y=missing_percentage.values)
plt.xticks(rotation=90)
plt.title('Percentage of Missing Values in Each Column')
plt.ylabel('Percentage Missing')
plt.show()

In [None]:
print(df.isna().sum())  # Nulls can still appear here: SmokerStatus/ECigaretteUsage/AlcoholDrinkers and BMI are only filled in later cells

In [None]:
columns_to_fill = ['SmokerStatus', 'ECigaretteUsage', 'AlcoholDrinkers']
df[columns_to_fill] = df[columns_to_fill].fillna("Unknown")


In [None]:
print(df.groupby('HadHeartAttack')[['BMI']].mean())


In [None]:
import matplotlib.pyplot as plt

# Plot histograms
df[[ 'BMI']].hist(bins=30, figsize=(10, 4))

plt.show()


In [None]:
print(df[[ 'BMI']].skew())


In [None]:
df['BMI'] = np.log1p(df['BMI'])  # Log transformation


In [None]:
Q1 = df['BMI'].quantile(0.25)
Q3 = df['BMI'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers = df[(df['BMI'] < lower_bound) | (df['BMI'] > upper_bound)]
print(outliers)


In [None]:
df['BMI'] = np.clip(df['BMI'], lower_bound, upper_bound)


In [None]:
import seaborn as sns

# Plot boxplots
plt.figure(figsize=(10, 4))
sns.boxplot(data=df[[ 'BMI']])
plt.show()


In [None]:
df['BMI'] = df['BMI'].fillna(df['BMI'].median())


In [None]:
print(df.isna().sum())  # Should print 0 for all columns


# **Feature correlation analysis (heatmap())**


---



In [None]:



# Filter for numeric columns (int and float)
numeric_df = df.select_dtypes(include=[np.number])

# Calculate the correlation matrix
correlation_matrix = numeric_df.corr()

# Visualize the correlation matrix using a heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix of Numeric Features')
plt.show()


In [None]:

for col in df.describe(include='object').columns:
    print('Column Name: ',col)
    print(df[col].unique())
    print('-------------------------------------------------')


In [None]:
df.info()

# **Feature engineering (One-hot encoding, Label encoding)**


---



In [None]:

from sklearn.preprocessing import LabelEncoder


# Initialize LabelEncoders for binary categorical columns
sex_label = LabelEncoder()
physical_activities_label = LabelEncoder()
had_heart_attack_label = LabelEncoder()
had_angina_label = LabelEncoder()
had_stroke_label = LabelEncoder()
had_asthma_label = LabelEncoder()
had_skin_cancer_label = LabelEncoder()
had_copd_label = LabelEncoder()
had_depressive_disorder_label = LabelEncoder()
had_kidney_disease_label = LabelEncoder()
had_arthritis_label = LabelEncoder()
had_obesity_label = LabelEncoder()

# Fit the encoders
sex_label.fit(df["Sex"])
physical_activities_label.fit(df["PhysicalActivities"].astype(str))
had_heart_attack_label.fit(df["HadHeartAttack"].astype(str))
had_angina_label.fit(df["HadAngina"].astype(str))
had_stroke_label.fit(df["HadStroke"].astype(str))
had_asthma_label.fit(df["HadAsthma"].astype(str))
had_skin_cancer_label.fit(df["HadSkinCancer"].astype(str))
had_copd_label.fit(df["HadCOPD"].astype(str))
had_depressive_disorder_label.fit(df["HadDepressiveDisorder"].astype(str))
had_kidney_disease_label.fit(df["HadKidneyDisease"].astype(str))
had_arthritis_label.fit(df["HadArthritis"].astype(str))
had_obesity_label.fit(df["HadObesity"].astype(str))

# Transform the data
df["Sex"] = sex_label.transform(df["Sex"])
df["PhysicalActivities"] = physical_activities_label.transform(df["PhysicalActivities"].astype(str))
df["HadHeartAttack"] = had_heart_attack_label.transform(df["HadHeartAttack"].astype(str))
df["HadAngina"] = had_angina_label.transform(df["HadAngina"].astype(str))
df["HadStroke"] = had_stroke_label.transform(df["HadStroke"].astype(str))
df["HadAsthma"] = had_asthma_label.transform(df["HadAsthma"].astype(str))
df["HadSkinCancer"] = had_skin_cancer_label.transform(df["HadSkinCancer"].astype(str))
df["HadCOPD"] = had_copd_label.transform(df["HadCOPD"].astype(str))
df["HadDepressiveDisorder"] = had_depressive_disorder_label.transform(df["HadDepressiveDisorder"].astype(str))
df["HadKidneyDisease"] = had_kidney_disease_label.transform(df["HadKidneyDisease"].astype(str))
df["HadArthritis"] = had_arthritis_label.transform(df["HadArthritis"].astype(str))
df["HadObesity"] = had_obesity_label.transform(df["HadObesity"].astype(str))



# Display sample data
df.head()

In [None]:
df.info()

In [None]:
from sklearn.preprocessing import OneHotEncoder




# Initialize OneHotEncoders for multi-category categorical features
general_health_ohe = OneHotEncoder(sparse_output=False, drop="first")
had_diabetes_ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
smoker_status_ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
e_cigarette_usage_ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
age_category_ohe = OneHotEncoder(sparse_output=False, drop="first")
alcohol_drinkers_ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

# Apply OneHotEncoding to each feature
general_health_encoded = general_health_ohe.fit_transform(df[["GeneralHealth"]])
had_diabetes_encoded = had_diabetes_ohe.fit_transform(df[["HadDiabetes"]])
smoker_status_encoded = smoker_status_ohe.fit_transform(df[["SmokerStatus"]])
e_cigarette_usage_encoded = e_cigarette_usage_ohe.fit_transform(df[["ECigaretteUsage"]])
age_category_encoded = age_category_ohe.fit_transform(df[["AgeCategory"]])
alcohol_drinkers_encoded = alcohol_drinkers_ohe.fit_transform(df[["AlcoholDrinkers"]])

# Convert encoded arrays into DataFrames with proper column names
general_health_df = pd.DataFrame(general_health_encoded, columns=general_health_ohe.get_feature_names_out(["GeneralHealth"]))
had_diabetes_df = pd.DataFrame(had_diabetes_encoded, columns=had_diabetes_ohe.get_feature_names_out(["HadDiabetes"]))
smoker_status_df = pd.DataFrame(smoker_status_encoded, columns=smoker_status_ohe.get_feature_names_out(["SmokerStatus"]))
e_cigarette_usage_df = pd.DataFrame(e_cigarette_usage_encoded, columns=e_cigarette_usage_ohe.get_feature_names_out(["ECigaretteUsage"]))
age_category_df = pd.DataFrame(age_category_encoded, columns=age_category_ohe.get_feature_names_out(["AgeCategory"]))
alcohol_drinkers_df = pd.DataFrame(alcohol_drinkers_encoded, columns=alcohol_drinkers_ohe.get_feature_names_out(["AlcoholDrinkers"]))

# Reset index before concatenation to ensure alignment
df = df.reset_index(drop=True)
general_health_df = general_health_df.reset_index(drop=True)
had_diabetes_df = had_diabetes_df.reset_index(drop=True)
smoker_status_df = smoker_status_df.reset_index(drop=True)
e_cigarette_usage_df = e_cigarette_usage_df.reset_index(drop=True)
age_category_df = age_category_df.reset_index(drop=True)
alcohol_drinkers_df = alcohol_drinkers_df.reset_index(drop=True)

# Concatenate original DataFrame with encoded features
df = pd.concat([
    df, general_health_df, had_diabetes_df, smoker_status_df,
    e_cigarette_usage_df, age_category_df,
    alcohol_drinkers_df
], axis=1)

# Drop original categorical columns
df.drop(columns=["GeneralHealth", "HadDiabetes", "SmokerStatus", "ECigaretteUsage",
                  "AgeCategory", "AlcoholDrinkers"], inplace=True)



# Display first few rows
df.head()


In [None]:

corr_matrix=df.corr()

target_corr=corr_matrix["HadHeartAttack"].sort_values(ascending=False)

print(target_corr)

x = df.drop(['HadHeartAttack'], axis=1)
y = df['HadHeartAttack']

In [None]:
# Find features with low absolute correlation (< 0.02)
low_corr_features = df.corr()["HadHeartAttack"].abs().sort_values()
low_corr_features = low_corr_features[low_corr_features < 0.02].index.tolist()

print("Features with very low correlation:", low_corr_features)


In [None]:
# Hand-curated list of low-correlation features to remove (important ones are deliberately kept)
low_corr_features = [
    'CovidPos_Unknown', 'ECigaretteUsage_Not at all',
    'AlcoholDrinkers_Unknown', 'SmokerStatus_Unknown', 'AgeCategory_Unknown',
    # Must match the one-hot column name exactly: the SmokerStatus category is
    # 'Current smoker (Some days)', with a space before the parenthesis.
    'SmokerStatus_Current smoker (Some days)', 'ECigaretteUsage_Unknown', 'GeneralHealth_Good', 'ECigaretteUsage_Never',
    'ECigaretteUsage_Somedays', 'ECigaretteUsage_Everyday'
]

# Drop only the columns that exist in the DataFrame (avoids KeyErrors), but report
# the names that were not found so a misspelled entry is not silently ignored.
missing_features = [col for col in low_corr_features if col not in df.columns]
if missing_features:
    print("Requested low-correlation features not found (skipped):", missing_features)
df = df.drop(columns=[col for col in low_corr_features if col in df.columns])

# Print remaining columns to verify
print("Remaining Features after dropping low-correlation ones:", df.columns)


In [None]:

for col in df.describe().columns:
    print('Column Name: ',col)
    print(df[col].unique())
    print('-------------------------------------------------')


In [None]:
# Compute correlation of all features with the target
correlation = df.corr()['HadHeartAttack'].abs().sort_values(ascending=False)

# Display top correlated features
print("📊 Features with Highest Correlation:\n", correlation.head(20))


# **Sampling**


---



In [None]:
import pandas as pd

# Undersample class 0 to at most MAJORITY_SAMPLE_SIZE rows and keep every class 1 row.
# NOTE(review): the result is only balanced if class 1 has roughly this many rows — confirm
# with the value_counts() printed below.
MAJORITY_SAMPLE_SIZE = 30000
majority = df[df["HadHeartAttack"] == 0]
# Cap the request at the rows actually available so .sample() cannot raise ValueError.
class_0 = majority.sample(n=min(MAJORITY_SAMPLE_SIZE, len(majority)), random_state=42)
class_1 = df[df["HadHeartAttack"] == 1]  # Keep all minority class samples

# Combine both classes and shuffle the rows
df_balanced = pd.concat([class_0, class_1]).sample(frac=1, random_state=42).reset_index(drop=True)

# Check new class distribution
print(df_balanced["HadHeartAttack"].value_counts())



# **Train-Test Split (train_test_split())**


---



In [None]:
from sklearn.model_selection import train_test_split

# ✅ Separate features and target
x = df_balanced.drop(columns=["HadHeartAttack"])  # Features
y = df_balanced["HadHeartAttack"]  # Target variable

# ✅ Train-test split (80-20 split, stratified to maintain class balance)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

# 🔹 Check new class distribution in train & test sets
print("Train class distribution:\n", y_train.value_counts())
print("Test class distribution:\n", y_test.value_counts())

# 🔹 Print shapes
print(f"X_train shape: {x_train.shape}, X_test shape: {x_test.shape}")
print(f"y_train shape: {y_train.shape}, y_test shape: {y_test.shape}")


# **Handling Imbalanced Data (SMOTE) and Feature selection**


---



In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# ✅ Apply SMOTE only on training data (the test split is left untouched)
# NOTE(review): plain SMOTE interpolates between rows, so encoded 0/1 columns may receive
# fractional values in the synthetic samples — confirm this is acceptable.
smote = SMOTE(random_state=42)
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

# ✅ Train a Random Forest model on SMOTE-applied data (used here only to rank features)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(x_train_smote, y_train_smote)

# ✅ Get feature importances, labelled with the training column names
feature_importances = pd.Series(rf_model.feature_importances_, index=x_train.columns)

# ✅ Select the top 10 most important features (later cells scale and train on these only)
top_10_features = feature_importances.nlargest(10).index.tolist()

print("Top 10 Features:", top_10_features)


# **Standardization (StandardScaler)**


---



In [None]:
from sklearn.preprocessing import StandardScaler

# ✅ Initialize scaler
scaler = StandardScaler()

# ✅ Select only top 10 features from SMOTE-applied data
x_train_selected = x_train_smote[top_10_features]
x_test_selected = x_test[top_10_features]

# ✅ Fit scaler on training data and transform both train & test
x_train_scaled = scaler.fit_transform(x_train_selected)
x_test_scaled = scaler.transform(x_test_selected)

# ✅ Convert back to DataFrame
import pandas as pd
x_train_scaled = pd.DataFrame(x_train_scaled, columns=top_10_features)
x_test_scaled = pd.DataFrame(x_test_scaled, columns=top_10_features)

print("Scaling complete! Your data is ready for model training. 🚀")


# **Hyperparameter Tuning (Random Search with RandomizedSearchCV)**


---



In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# ✅ Define models
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    'AdaBoost': AdaBoostClassifier(),
    'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

# ✅ Define hyperparameter grids
# A model without an entry here (Naive Bayes) is fitted once with its default settings.
param_grids = {
    'Logistic Regression': {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'solver': ['liblinear', 'saga'], 'penalty': ['l1', 'l2']},
    'Random Forest': {'n_estimators': [100, 200, 300], 'max_depth': [10, 20, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]},
    'Gradient Boosting': {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    'SVM': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']},
    'KNN': {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance'], 'metric': ['euclidean', 'manhattan']},
    'AdaBoost': {'n_estimators': [50, 100, 150], 'learning_rate': [0.5, 1.0]},
    'XGBoost': {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7], 'subsample': [0.8, 1.0]}
}

# ✅ Store results (one dict of metrics per model)
results = []

# ✅ Loop through models
for model_name, model in models.items():
    print(f"🔍 Training {model_name}...")

    if model_name in param_grids:
        # Tune with a randomized search, then keep the refitted best estimator.
        randomized_search = RandomizedSearchCV(
            estimator=model,
            param_distributions=param_grids[model_name],
            n_iter=10, cv=3, scoring='f1', random_state=42, n_jobs=-1, verbose=1
        )
        randomized_search.fit(x_train_scaled, y_train_smote)
        best_model = randomized_search.best_estimator_
        best_params = randomized_search.best_params_
    else:
        # No grid to search: fit THIS model directly. best_model must be rebound here,
        # otherwise the estimator left over from the previous iteration would be
        # refitted and reported under this model's name.
        best_model = model
        best_model.fit(x_train_scaled, y_train_smote)
        best_params = "N/A"

    # ✅ Predictions
    y_train_pred = best_model.predict(x_train_scaled)
    y_test_pred = best_model.predict(x_test_scaled)

    # ✅ Store metrics (train metrics use the SMOTE-resampled labels, test metrics the held-out labels)
    metrics = {
        'Model': model_name,
        'Best Params': best_params,
        'Train Accuracy': accuracy_score(y_train_smote, y_train_pred),
        'Test Accuracy': accuracy_score(y_test, y_test_pred),
        'Train Precision': precision_score(y_train_smote, y_train_pred),
        'Test Precision': precision_score(y_test, y_test_pred),
        'Train Recall': recall_score(y_train_smote, y_train_pred),
        'Test Recall': recall_score(y_test, y_test_pred),
        'Train F1': f1_score(y_train_smote, y_train_pred),
        'Test F1': f1_score(y_test, y_test_pred),
        'Confusion Matrix': confusion_matrix(y_test, y_test_pred)
    }

    results.append(metrics)

# ✅ Convert to DataFrame
results_df = pd.DataFrame(results)
print("\n✅ Model training & tuning complete! Here's the summary:\n")
print(results_df)


**1.LogisticRegression**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize model
lr = LogisticRegression()

# Define hyperparameter grid
lr_param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
    'solver': ['liblinear', 'saga'],
    'penalty': ['l1', 'l2']
}
#lr_param_grids = {
#    'C': np.logspace(-4, 4, 20),  # More granular values
#    'solver': ['liblinear', 'saga', 'lbfgs', 'newton-cg'],
#'penalty': ['l1', 'l2', 'elasticnet'],
#'l1_ratio': np.linspace(0, 1, 5)  # Only used for elasticnet
#



# Hyperparameter tuning
randomized_search_lr = RandomizedSearchCV(
    estimator=lr,
    param_distributions=lr_param_grid,
    n_iter=10,
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Fit model
randomized_search_lr.fit(x_train_scaled, y_train_smote)

# Get best model
lr_best_model = randomized_search_lr.best_estimator_

# Predictions
y_train_pred_log = lr_best_model.predict(x_train_scaled)
y_test_pred_log = lr_best_model.predict(x_test_scaled)



In [None]:
print('Training :\n',classification_report(y_train_smote, y_train_pred_log))

In [None]:
print('Testing :\n',classification_report(y_test, y_test_pred_log))

In [None]:
print(confusion_matrix(y_test, y_test_pred_log))

In [None]:
from sklearn.metrics import precision_recall_curve

# Threshold tuning for the logistic-regression model fitted in this section
# (lr_best_model); the XGBoost model is not defined until a later cell.
# NOTE(review): the threshold is chosen on the test set, so the scores printed below
# are optimistic — consider selecting it on a validation split instead.
y_probs = lr_best_model.predict_proba(x_test_scaled)[:, 1]  # Get probability for class 1
precision, recall, thresholds = precision_recall_curve(y_test, y_probs)

# Compute F1-score for each threshold; the small epsilon avoids 0/0 -> NaN
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-9)
# precision/recall carry one extra trailing entry with no matching threshold,
# so it is excluded before looking up the best threshold.
best_threshold = thresholds[np.argmax(f1_scores[:-1])]  # Choose threshold with best F1-score

# Apply new threshold
y_test_pred_adjusted = (y_probs >= best_threshold).astype(int)

# Evaluate the model
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_test_pred_adjusted))
print(confusion_matrix(y_test, y_test_pred_adjusted))


**2.RandomForestClassifier**

In [None]:
rf = RandomForestClassifier()
rf_param_grids = {
                    'n_estimators': [50, 100, 200, 300],
                    'max_depth': [10, 20, 30, None],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4],
                    'bootstrap': [True, False]
               }

randomized_search_rf = RandomizedSearchCV(
    estimator=rf,
    param_distributions=rf_param_grids,
    n_iter=10,
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1)

randomized_search_rf.fit(x_train_scaled, y_train_smote)

rf_best_model = randomized_search_rf.best_estimator_

y_train_pred_rf = rf_best_model.predict(x_train_scaled)
y_test_pred_rf = rf_best_model.predict(x_test_scaled)

In [None]:
print('Training :\n',classification_report(y_train_smote, y_train_pred_rf))

In [None]:
print('Testing :\n',classification_report(y_test, y_test_pred_rf))

In [None]:
print(confusion_matrix(y_test, y_test_pred_rf))

**3.Gradient boosting**

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize model
gb = GradientBoostingClassifier()

# Define hyperparameter grid
gb_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'min_samples_split': [10, 20, 50],
    'min_samples_leaf': [5, 10],
    'subsample': [0.7, 0.8, 0.9]
}



# Hyperparameter tuning
randomized_search_gb = RandomizedSearchCV(
    estimator=gb,
    param_distributions=gb_param_grid,
    n_iter=30,  # More iterations for better tuning
    cv=5,  # Increase cross-validation folds for better generalization
    scoring='f1',  # Focus on F1-score for imbalanced data
    verbose=2,
    random_state=42,
    n_jobs=-1
)


# Fit model
randomized_search_gb.fit(x_train_scaled, y_train_smote)

# Get best model
gb_best_model = randomized_search_gb.best_estimator_

# Predictions
y_train_pred_gb = gb_best_model.predict(x_train_scaled)
y_test_pred_gb = gb_best_model.predict(x_test_scaled)




In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, classification_report, confusion_matrix

# Get predicted probabilities for the positive class
# NOTE(review): the threshold below is selected on the test set, so the "updated"
# metrics are optimistic — consider tuning it on a validation split instead.
y_probs = gb_best_model.predict_proba(x_test_scaled)[:, 1]

# Compute precision-recall curve
precision, recall, thresholds = precision_recall_curve(y_test, y_probs)

# Compute F1-score for each threshold
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-9)  # Avoid division by zero

# Find the best threshold based on the highest F1-score.
# precision/recall (and so f1_scores) have one more entry than thresholds; the trailing
# entry has no threshold, so it is excluded to keep the argmax index valid.
best_threshold = thresholds[np.argmax(f1_scores[:-1])]

# Apply the new threshold
y_test_pred_adjusted = (y_probs >= best_threshold).astype(int)

# Plot Precision-Recall vs. Threshold Curve
plt.figure(figsize=(8,6))
plt.plot(thresholds, precision[:-1], label='Precision', linestyle='--', marker='o')
plt.plot(thresholds, recall[:-1], label='Recall', linestyle='--', marker='o')
plt.plot(thresholds, f1_scores[:-1], label='F1-score', linestyle='-', marker='x', color='green')
plt.axvline(best_threshold, color='red', linestyle='dashed', label=f'Best Threshold = {best_threshold:.2f}')
plt.xlabel("Threshold")
plt.ylabel("Score")
plt.title("Precision, Recall & F1-score vs Threshold")
plt.legend()
plt.grid()
plt.show()

# Evaluate results
print(f"Best Threshold: {best_threshold:.2f}\n")
print("Updated Classification Report:\n", classification_report(y_test, y_test_pred_adjusted))
print("Updated Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred_adjusted))


In [None]:
print('Training :\n',classification_report(y_train_smote, y_train_pred_gb))

In [None]:
print('Testing :\n',classification_report(y_test, y_test_pred_gb))

In [None]:
print(confusion_matrix(y_test, y_test_pred_gb))

**4.GaussianNB**

In [None]:
gnb = GaussianNB()
# Search over var_smoothing so the randomized search has a real space to sample.
# Ten log-spaced values match n_iter=10, so every candidate is evaluated once.
gnb_param_grids = {
    'var_smoothing': np.logspace(-9, 0, 10)
}

randomized_search_gnb = RandomizedSearchCV(
    estimator=gnb,
    param_distributions=gnb_param_grids,
    n_iter=10,
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1)

randomized_search_gnb.fit(x_train_scaled, y_train_smote)

# Best estimator, refitted on the full (SMOTE-resampled) training data
gnb_best_model = randomized_search_gnb.best_estimator_

y_train_pred_gnb = gnb_best_model.predict(x_train_scaled)
y_test_pred_gnb = gnb_best_model.predict(x_test_scaled)

In [None]:
print('Training :\n',classification_report(y_train_smote, y_train_pred_gnb))

In [None]:
print('Testing :\n',classification_report(y_test, y_test_pred_gnb))

In [None]:
print(confusion_matrix(y_test, y_test_pred_gnb))

**5.KNN**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize model
knn = KNeighborsClassifier()

# Define hyperparameter grid
knn_param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

# Hyperparameter tuning
randomized_search_knn = RandomizedSearchCV(
    estimator=knn,
    param_distributions=knn_param_grid,
    n_iter=10,
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Fit model
randomized_search_knn.fit(x_train_scaled, y_train_smote)

# Get best model
knn_best_model = randomized_search_knn.best_estimator_

y_train_pred_knn = knn_best_model.predict(x_train_scaled)
y_test_pred_knn = knn_best_model.predict(x_test_scaled)


In [None]:
print('Training :\n',classification_report(y_train_smote, y_train_pred_knn))

In [None]:
print('Testing :\n',classification_report(y_test, y_test_pred_knn))

In [None]:
print(confusion_matrix(y_test, y_test_pred_knn))

**6. AdaBoost**

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize AdaBoost classifier
abc = AdaBoostClassifier()

# Define hyperparameter grid (Removed 'algorithm')
abc_param_grids = {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1.0, 1.5]
}

# Hyperparameter tuning with RandomizedSearchCV
randomized_search_abc = RandomizedSearchCV(
    estimator=abc,
    param_distributions=abc_param_grids,
    n_iter=10,
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1,
    error_score='raise'  # Raises errors instead of setting NaN scores
)

# Fit the model
randomized_search_abc.fit(x_train_scaled, y_train_smote)

# Get best model
abc_best_model = randomized_search_abc.best_estimator_

# Predictions
y_train_pred_ada = abc_best_model.predict(x_train_scaled)
y_test_pred_ada = abc_best_model.predict(x_test_scaled)



In [None]:
print('Training :\n',classification_report(y_train_smote, y_train_pred_ada))

In [None]:
print('Testing :\n',classification_report(y_test, y_test_pred_ada))

In [None]:
print(confusion_matrix(y_test, y_test_pred_ada))

**7. XGBoost**

In [None]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

# Initialize XGBoost classifier
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Define hyperparameter grid
xgb_param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3],
    'reg_alpha': [0, 0.01, 0.1, 1],
    'reg_lambda': [0, 0.01, 0.1, 1]
}

# Perform Randomized Search CV
randomized_search_xgb = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=xgb_param_grid,
    n_iter=10,  # Number of random combinations to try
    cv=3,       # 3-fold cross-validation
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Fit model on resampled training data
randomized_search_xgb.fit(x_train_scaled, y_train_smote)

# Best model after hyperparameter tuning
xgb_best_model = randomized_search_xgb.best_estimator_

# Predictions
y_train_pred_xgb = xgb_best_model.predict(x_train_scaled)
y_test_pred_xgb = xgb_best_model.predict(x_test_scaled)


In [None]:
print('Training :\n',classification_report(y_train_smote, y_train_pred_xgb))

In [None]:
print('Testing :\n',classification_report(y_test, y_test_pred_xgb))

In [None]:
print(confusion_matrix(y_test, y_test_pred_xgb))

In [None]:
from sklearn.metrics import precision_recall_curve, classification_report, confusion_matrix
import numpy as np

# Get probability scores for the positive class
# NOTE(review): the threshold is selected on the test set, so the results printed
# below are optimistic — consider tuning it on a validation split instead.
y_probs = xgb_best_model.predict_proba(x_test_scaled)[:, 1]

# Compute precision-recall curve
precision, recall, thresholds = precision_recall_curve(y_test, y_probs)

# Find the best threshold using F1-score
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-9)  # Avoid division by zero
# f1_scores has one more entry than thresholds (the trailing precision/recall pair has
# no threshold), so that entry is excluded to keep the argmax index valid.
best_threshold = thresholds[np.argmax(f1_scores[:-1])]  # Select threshold that maximizes F1-score

# Apply new threshold
y_test_pred_adjusted = (y_probs >= best_threshold).astype(int)

# Evaluate
print("Testing Results After Threshold Adjustment:")
print(classification_report(y_test, y_test_pred_adjusted))
print(confusion_matrix(y_test, y_test_pred_adjusted))


In [None]:
import pickle


In [None]:
# Persist the tuned XGBoost model; the context manager closes the file handle once written.
with open('xgb_best_model.sav', 'wb') as model_file:
    pickle.dump(xgb_best_model, model_file)


In [None]:
# Persist the fitted StandardScaler; the context manager closes the file handle once written.
with open('scaler.sav', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [None]:
# Persist the HadAngina label encoder; the context manager closes the file handle once written.
with open('had_angina_label.sav', 'wb') as encoder_file:
    pickle.dump(had_angina_label, encoder_file)  # Label encoded


In [None]:
# Persist the AgeCategory one-hot encoder; the context manager closes the file handle once written.
with open('age_category_ohe.sav', 'wb') as encoder_file:
    pickle.dump(age_category_ohe, encoder_file)  # One-hot encoder


In [None]:
# Persist the HadDiabetes one-hot encoder; the context manager closes the file handle once written.
with open('had_diabetes_ohe.sav', 'wb') as encoder_file:
    pickle.dump(had_diabetes_ohe, encoder_file)  # One-hot encoder


In [None]:

# Persist the Sex label encoder; the context manager closes the file handle once written.
with open('sex_label.sav', 'wb') as encoder_file:
    pickle.dump(sex_label, encoder_file)  # Label encoded

In [None]:
# Persist the HadArthritis label encoder; the context manager closes the file handle once written.
with open('had_arthritis_label.sav', 'wb') as encoder_file:
    pickle.dump(had_arthritis_label, encoder_file)  # Label encoded

In [None]:
x_train_scaled.columns

In [None]:
x_train_scaled.head()

In [None]:
pip list


In [None]:
scaler.feature_names_in_

In [None]:
!python --version
