In [5]:
# 📦 Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from lightgbm import LGBMClassifier

# 📥 Load datasets
train_df = pd.read_csv("/kaggle/input/notebook/Train_Data.csv")
test_df = pd.read_csv("/kaggle/input/notebook/Test_Data.csv")

# 🧾 Save Test IDs and drop SEQN
test_ids = test_df["SEQN"].copy() if "SEQN" in test_df.columns else pd.Series(range(len(test_df)))
train_df.drop(columns=["SEQN"], inplace=True, errors='ignore')
test_df.drop(columns=["SEQN"], inplace=True, errors='ignore')

# ❌ Drop missing targets
train_df = train_df.dropna(subset=["age_group"])

# 🎯 Encode target: Adult → 0, Senior → 1
train_df["age_group"] = train_df["age_group"].map({"Adult": 0, "Senior": 1})

# 🧪 Feature Engineering
def feature_engineer(df):
    df = df.copy()
    df["bmi_glucose_ratio"] = df["BMXBMI"] / (df["LBXGLU"] + 1e-6)
    df["insulin_resistance"] = df["LBXIN"] * df["LBXGLU"] / 405
    df["bmi_category"] = pd.cut(df["BMXBMI"], bins=[0, 18.5, 25, 30, 100], labels=[0, 1, 2, 3])
    df["is_female"] = (df["RIAGENDR"] == 2).astype(int)
    df["PAQ605"] = df["PAQ605"].replace(7.0, 2.0)
    return df.drop(columns=["RIAGENDR"])

train_df = feature_engineer(train_df)
test_df = feature_engineer(test_df)

# 🔢 Prepare X, y
X = train_df.drop("age_group", axis=1)
y = train_df["age_group"]
X_test = test_df.copy()

# 🧼 Imputation
num_cols = X.select_dtypes(include="number").columns
num_imputer = SimpleImputer(strategy="median")
X[num_cols] = num_imputer.fit_transform(X[num_cols])
X_test[num_cols] = num_imputer.transform(X_test[num_cols])

# 🔄 Scaling
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# 🚫 Remove low-variance features
selector = VarianceThreshold(threshold=0.01)
X = selector.fit_transform(X)
X_test = selector.transform(X_test)

# 📊 Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# 🚀 Train LightGBM model
model = LGBMClassifier(n_estimators=200, max_depth=10, learning_rate=0.05, random_state=42)
model.fit(X_train, y_train)

# 📈 Evaluate
train_acc = accuracy_score(y_train, model.predict(X_train))
val_preds = model.predict(X_val)
val_acc = accuracy_score(y_val, val_preds)
val_f1 = f1_score(y_val, val_preds, average="weighted")

print(f"✅ Training Accuracy: {train_acc:.4f}")
print(f"✅ Validation Accuracy: {val_acc:.4f}")
print(f"✅ Validation F1 Score: {val_f1:.4f}")
print("\n📋 Classification Report:\n", classification_report(y_val, val_preds))

# 🔁 Cross-validation
cv_scores = cross_val_score(model, X, y, cv=5, scoring="f1_weighted")
print(f"✅ CV F1 Score: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# 🔮 Predict on test set
test_preds = model.predict(X_test)

# 💾 Save submission
submission = pd.DataFrame({
    "ID": test_ids,
    "age_group": test_preds
})
submission.to_csv("submission.csv", index=False)
print("\n✅ submission.csv saved successfully!")
print(submission.head())



[LightGBM] [Info] Number of positive: 251, number of negative: 1310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003483 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1259
[LightGBM] [Info] Number of data points in the train set: 1561, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.160794 -> initscore=-1.652329
[LightGBM] [Info] Start training from score -1.652329
✅ Training Accuracy: 0.9917
✅ Validation Accuracy: 0.8261
✅ Validation F1 Score: 0.7933

📋 Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.95      0.90       328
           1       0.40      0.16      0.23        63

    accuracy                           0.83       391
   macro avg       0.63      0.56      0.56       391
weighted avg       0.78      0.83      0.79       391

[Lig