# Columns Description


- HighBP - Binary - 0 = no high BP 1 = high BP
- HighChol - Binary - 0 = no high cholesterol 1 = high cholesterol
- CholCheck - Binary - 0 = no cholesterol check in 5 years 1 = yes cholesterol check in 5 years
- BMI - Integer - Body Mass Index
- Smoker - Binary - Have you smoked at least 100 cigarettes in your entire life? 0 = no 1 = yes
- Stroke - Binary - (Ever told) you had a stroke. 0 = no 1 = yes
- HeartDiseaseorAttack - Binary - coronary heart disease (CHD) or myocardial infarction (MI) 0 = no 1 = yes
- PhysActivity - Binary - physical activity in past 30 days - not including job 0 = no 1 = yes
- Fruits - Binary - Consume Fruit 1 or more times per day 0 = no 1 = yes
- Veggies - Binary - Consume Vegetables 1 or more times per day 0 = no 1 = yes
- HvyAlcoholConsump - Binary - Heavy drinkers (adult men having more than 14 drinks per week and adult women having more than 7 drinks per week) 0 = no 1 = yes
- AnyHealthcare - Binary - Have any kind of health care coverage, including health insurance, prepaid plans such as HMO, etc. 0 = no 1 = yes
- NoDocbcCost - Binary - Was there a time in the past 12 months when you needed to see a doctor but could not because of cost? 0 = no 1 = yes
- GenHlth - Integer - Would you say that in general your health is: scale 1-5 1 = excellent 2 = very good 3 = good 4 = fair 5 = poor
- MentHlth - Integer - Now thinking about your mental health, which includes stress, depression, and problems with emotions, for how many days during the past 30 days was your mental health not good? scale 1-30 days
- PhysHlth - Integer - Now thinking about your physical health, which includes physical illness and injury, for how many days during the past 30 days was your physical health not good? scale 1-30 days
- DiffWalk - Binary - Do you have serious difficulty walking or climbing stairs? 0 = no 1 = yes
- Gender - Binary - Female, Male


# Imports


In [80]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow

from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Input, BatchNormalization, Dropout
from keras.optimizers import Adam, AdamW
from keras.regularizers import L1, L2

from sklearn.model_selection import train_test_split
import mlflow
import shap

# Model tracking


In [81]:
# Turn on MLflow autologging for TensorFlow/Keras training in this session.
# silent=True asks MLflow to keep its own autologging messages quiet
# (per the MLflow docs for this flag).
mlflow.tensorflow.autolog(silent=True)  # type: ignore

# Record runs in a local, file-based store under ./mlruns.
mlflow.set_tracking_uri("file:./mlruns")

# Define an experiment name; runs started afterwards are filed under it.
mlflow.set_experiment("AML Competition 3")

import absl.logging

# Only let absl messages of severity ERROR through.
absl.logging.set_verbosity(absl.logging.ERROR)

In [82]:
# Column groups consumed by preprocess_data_pandas. Each name is looked up in
# the frame being preprocessed; names that are absent are skipped there.

# Nominal columns: expanded into one indicator column per category.
categorical_features = ["Gender"]

# 0/1 columns: passed through unchanged.
binary_features = [
    "HighBP", "HighChol", "CholCheck",
    "Smoker", "Stroke", "HeartDiseaseorAttack",
    "PhysActivity", "Fruits", "Veggies",
    "HvyAlcoholConsump", "AnyHealthcare", "NoDocbcCost",
    "DiffWalk",
]

# Continuous columns: quantile-binned, then one-hot encoded.
continuous_for_binning = ["BMI", "Age"]

# Ordered categories: each raw level (a float key) maps to a zero-based rank.
# For GenHlth the levels 1.0..5.0 become 0..4. Example mapping, adjust as
# needed.
ordinal_features_map = {
    "GenHlth": {float(level): level - 1 for level in range(1, 6)},
}

# Numerical columns that get standard-scaled; the engineered features are
# appended to this set at preprocessing time.
numerical_features_to_scale = ["MentHlth", "PhysHlth", "Education", "Income"]


# Custom domain-specific feature creation function (adapted from notebook)
def create_domain_features_pandas(X):
    """Derive four engineered health features from the columns of ``X``.

    The returned frame shares ``X``'s index and always has these columns, in
    this order:

    * ``health_risk_score`` - row-wise sum of whichever of HighBP, HighChol,
      Smoker, Stroke and HeartDiseaseorAttack are present (0 if none are).
    * ``lifestyle_score`` - sum of the present healthy behaviours
      (PhysActivity, Fruits, Veggies) minus the sum of the present unhealthy
      ones (Smoker, HvyAlcoholConsump); 0 if none are present.
    * ``total_health_days`` - MentHlth + PhysHlth when both exist, else 0.
    * ``has_health_issues`` - 1 where ``total_health_days`` > 0, else 0.

    Missing input columns never raise; they are simply left out of the sums.
    """

    def _present(names):
        # Keep only the requested names that X actually provides.
        return [name for name in names if name in X.columns]

    features = pd.DataFrame(index=X.index)

    # Count of major risk factors per row.
    risk_cols = _present(
        ["HighBP", "HighChol", "Smoker", "Stroke", "HeartDiseaseorAttack"]
    )
    features["health_risk_score"] = X[risk_cols].sum(axis=1) if risk_cols else 0

    # Healthy habits add to the score, unhealthy ones subtract from it.
    good_cols = _present(["PhysActivity", "Fruits", "Veggies"])
    bad_cols = _present(["Smoker", "HvyAlcoholConsump"])
    lifestyle = 0
    if good_cols:
        lifestyle = lifestyle + X[good_cols].sum(axis=1)
    if bad_cols:
        lifestyle = lifestyle - X[bad_cols].sum(axis=1)
    features["lifestyle_score"] = lifestyle

    # Combined "not good" days and a flag for having any at all.
    if all(name in X.columns for name in ("MentHlth", "PhysHlth")):
        total_days = X["MentHlth"] + X["PhysHlth"]
        features["total_health_days"] = total_days
        features["has_health_issues"] = (total_days > 0).astype(int)
    else:
        features["total_health_days"] = 0
        features["has_health_issues"] = 0

    return features


# Main preprocessing function without sklearn
def preprocess_data_pandas(df_raw):
    """Turn a raw feature frame into a model-ready frame using pandas only.

    The input is not modified. Columns are handled by group, using the
    module-level column lists; any listed column that is absent is skipped:

    1. ``categorical_features`` are one-hot encoded (all categories kept).
    2. ``continuous_for_binning`` are cut into up to 5 quantile bins and the
       bin codes are one-hot encoded. If binning a column fails, a warning is
       printed and that column is left out of the result.
    3. ``ordinal_features_map`` columns are mapped to their ranks; values
       without a mapping (including NaN) become -1.
    4. ``binary_features`` are passed through unchanged.
    5. ``numerical_features_to_scale`` plus the engineered features from
       ``create_domain_features_pandas`` are standardised.

    Note that bin edges, means and standard deviations are all computed from
    ``df_raw`` itself, so two frames preprocessed separately are not on a
    shared scale. Columns that belong to no group are not carried over.

    Returns the concatenated frame, indexed like ``df_raw``.
    """

    def _present(names, frame):
        # Keep only the requested names that `frame` actually provides.
        return [name for name in names if name in frame.columns]

    # `remaining` shrinks as each group claims its columns, so a column is
    # only ever consumed by the first group that lists it.
    remaining = df_raw.copy()
    pieces = []

    # 1. Nominal columns -> indicator columns.
    nominal_cols = _present(categorical_features, remaining)
    if nominal_cols:
        pieces.append(
            pd.get_dummies(
                remaining[nominal_cols],
                columns=nominal_cols,
                prefix=nominal_cols,
                drop_first=False,
                dummy_na=False,
            )
        )
        remaining = remaining.drop(columns=nominal_cols)

    # 2. Continuous columns -> quantile bin codes -> indicator columns.
    n_bins = 5
    for col in continuous_for_binning:
        if col not in remaining.columns:
            continue
        try:
            # duplicates="drop" merges identical quantile edges, so fewer
            # than n_bins bins may come out.
            bin_codes = pd.qcut(
                remaining[col], q=n_bins, labels=False, duplicates="drop"
            ).rename(f"{col}_binned")
            pieces.append(
                pd.get_dummies(
                    bin_codes, prefix=f"{col}_bin", drop_first=False, dummy_na=False
                )
            )
        except Exception as e:
            # Best effort: report and carry on without this column.
            print(f"Warning: Could not bin column '{col}'. Error: {e}")
        # The raw column is consumed whether or not binning succeeded.
        remaining = remaining.drop(columns=[col])

    # 3. Ordinal columns -> integer ranks, -1 for anything unmapped.
    for col, mapping in ordinal_features_map.items():
        if col not in remaining.columns:
            continue
        pieces.append(remaining[col].map(mapping).fillna(-1).rename(f"{col}_encoded"))
        remaining = remaining.drop(columns=[col])

    # 4. Binary columns are taken as they are.
    flag_cols = _present(binary_features, remaining)
    if flag_cols:
        pieces.append(remaining[flag_cols])
        remaining = remaining.drop(columns=flag_cols)

    # 5. Plain numerical columns, set aside for scaling below.
    numeric_cols = _present(numerical_features_to_scale, remaining)
    numeric_part = remaining[numeric_cols].copy()

    # Engineered features are computed from the untouched raw values.
    source_cols = _present(
        binary_features
        + numerical_features_to_scale
        + list(ordinal_features_map)
        + continuous_for_binning,
        df_raw,
    )
    custom_part = create_domain_features_pandas(df_raw[source_cols])

    combined = pd.concat([*pieces, numeric_part, custom_part], axis=1)

    # 6. Standardise the numerical and engineered columns.
    scale_cols = [
        name
        for name in [*numeric_part.columns, *custom_part.columns]
        if name in combined.columns
    ]
    if scale_cols:
        block = combined[scale_cols]
        means = block.mean()
        # The small epsilon keeps constant columns from dividing by zero.
        stds = block.std() + 1e-8
        combined[scale_cols] = (block - means) / stds

    return combined

In [83]:
from sklearn.preprocessing import StandardScaler


# Load the training features and labels; the ID column is not a feature.
X_train_raw = pd.read_csv("data/X_train.csv")
y_train = pd.read_csv("data/y_train.csv")
X_train = X_train_raw.drop("ID", axis=1)

# Extract the correct target variable and encode it to numeric (No=0, Yes=1)
y = y_train["Diabetes"].map({"No": 0, "Yes": 1})
# .map() turns any label outside the mapping into NaN, which would silently
# poison the loss during training, so fail loudly instead.
if y.isna().any():
    raise ValueError(
        "Column 'Diabetes' contains values other than 'No'/'Yes'; "
        "cannot encode the target."
    )

# Gender is excluded from this model's inputs (presumably because it is
# non-numeric and StandardScaler needs numeric data - confirm).
X_train.drop(columns=["Gender"], inplace=True)
# Save feature names before converting to numpy so the array columns can
# still be matched back to their names later.
feature_names = X_train.columns.tolist()

# Split BEFORE scaling: the scaler must learn its mean/std from the training
# rows only, otherwise statistics of the validation rows leak into training
# and the validation score is optimistically biased.
test_size = 0.2
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y, test_size=test_size, random_state=42
)

scaler = StandardScaler()
X_train_split = scaler.fit_transform(X_train_split)
X_val_split = scaler.transform(X_val_split)
# Keep X_train as a scaled numpy array of all rows, as before, but scaled
# with the training-split statistics.
X_train = scaler.transform(X_train)
# X_train = preprocess_data_pandas(X_train)

# Small fully connected binary classifier: two hidden layers and a sigmoid
# output giving the probability of the positive class.
regularizer = None
act = "relu"
model = Sequential(
    [
        Input(shape=(X_train_split.shape[1],)),
        # Dense(1024, activation=act, kernel_regularizer=regularizer),
        # Dense(512, activation=act, kernel_regularizer=regularizer),
        # Dense(256, activation=act, kernel_regularizer=regularizer),
        Dense(128, activation=act, kernel_regularizer=regularizer),
        Dense(64, activation=act, kernel_regularizer=regularizer),
        Dense(1, activation="sigmoid"),
    ]
)

compile_loss = "binary_crossentropy"
compile_metric = "accuracy"
model.compile(
    optimizer=Adam(),  # type: ignore
    loss=compile_loss,
    metrics=[compile_metric],
)

model_name = ""
with mlflow.start_run() as run:
    # Stop once val_loss has not improved for 10 epochs and roll back to the
    # best weights seen.
    history = model.fit(
        X_train_split,
        y_train_split,
        epochs=40,
        batch_size=64,
        callbacks=[
            EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)
        ],
        validation_data=(X_val_split, y_val_split),
    )

    # NOTE: these "test" figures are measured on the validation split; no
    # separate held-out test set is evaluated here.
    test_loss, test_acc = model.evaluate(X_val_split, y_val_split)
    print(f"Test accuracy: {test_acc:.3f}")
    print(f"Test loss: {test_loss:.3f}")

    # Record the settings that autologging does not capture on its own.
    mlflow.log_param("test_size", test_size)
    mlflow.log_param("compiler_loss", compile_loss)
    mlflow.log_param("compiler_metric", compile_metric)

Epoch 1/40
[1m663/663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7492 - loss: 0.5109 - val_accuracy: 0.7486 - val_loss: 0.5110
Epoch 2/40
[1m663/663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7492 - loss: 0.5109 - val_accuracy: 0.7486 - val_loss: 0.5110
Epoch 2/40
[1m663/663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 999us/step - accuracy: 0.7546 - loss: 0.4990 - val_accuracy: 0.7520 - val_loss: 0.5052
Epoch 3/40
[1m663/663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 999us/step - accuracy: 0.7546 - loss: 0.4990 - val_accuracy: 0.7520 - val_loss: 0.5052
Epoch 3/40
[1m663/663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 984us/step - accuracy: 0.7562 - loss: 0.4965 - val_accuracy: 0.7513 - val_loss: 0.5050
Epoch 4/40
[1m663/663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 984us/step - accuracy: 0.7562 - loss: 0.4965 - val_accuracy: 0.7513 - val_loss: 0.5050
Epoch 4/40
[1m663/6



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 503us/step - accuracy: 0.7521 - loss: 0.5047
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 503us/step - accuracy: 0.7521 - loss: 0.5047
Test accuracy: 0.752
Test loss: 0.505
Test accuracy: 0.752
Test loss: 0.505
