In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import f1_score, classification_report
from xgboost import XGBClassifier

In [2]:
# Load datasets
train_df = pd.read_csv("/kaggle/input/health-dataset/Train_Data.csv")
test_df = pd.read_csv("/kaggle/input/health-dataset/Test_Data.csv")


In [3]:
# Drop rows without target label
train_df = train_df.dropna(subset=['age_group']).reset_index(drop=True)

# Binary target: Adult=0, Senior=1
train_df['senior'] = train_df['age_group'].map({'Adult': 0, 'Senior': 1})

In [4]:
# Feature engineering: log-transform lab values and HOMA-IR
for df in [train_df, test_df]:
    # Create log features (add small constant for zeros)
    df['LBXGLU_log'] = np.log1p(df['LBXGLU'])
    df['LBXIN_log'] = np.log1p(df['LBXIN'])
    # HOMA-IR from logged values
    df['HOMA_IR'] = (np.expm1(df['LBXGLU_log']) * np.expm1(df['LBXIN_log'])) / 405
    # Missing indicators
    df['GLU_missing'] = df['LBXGLU'].isna().astype(int)
    df['IN_missing'] = df['LBXIN'].isna().astype(int)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [5]:
# Categorical encoding
for df in [train_df, test_df]:
    # DIQ010: 1=Yes, 2=No, 3=Borderline -> one-hot
    df['DIQ010'] = df['DIQ010'].replace({1: 'Yes', 2: 'No', 3: 'Borderline'})
    # RIAGENDR: 1=Male, 2=Female
    df['RIAGENDR'] = df['RIAGENDR'].map({1: 'Male', 2: 'Female'})
    # PAQ605: physical activity, treat 7/refused as 'Refused'
    df['PAQ605'] = df['PAQ605'].replace({7: 'Refused'})
    df['PAQ605'] = df['PAQ605'].astype('category')

# Select features
num_feats = ['BMXBMI', 'LBXGLU_log', 'LBXIN_log', 'HOMA_IR', 'GLU_missing', 'IN_missing']
cat_feats = ['DIQ010', 'RIAGENDR', 'PAQ605']

# Impute numeric
num_imputer = SimpleImputer(strategy='median')
train_df[num_feats] = num_imputer.fit_transform(train_df[num_feats])
test_df[num_feats] = num_imputer.transform(test_df[num_feats])

In [6]:
# One-hot encode categoricals
train_enc = pd.get_dummies(train_df[cat_feats], drop_first=False)
test_enc  = pd.get_dummies(test_df[cat_feats], drop_first=False)

# Align columns
train_enc, test_enc = train_enc.align(test_enc, join='left', axis=1, fill_value=0)

# Prepare final train and test matrices
X = pd.concat([train_df[num_feats], train_enc], axis=1)
y = train_df['senior']
X_test = pd.concat([test_df[num_feats], test_enc], axis=1)

In [7]:
# XGBoost with parameter tuning defaults
pos_ratio = y.value_counts()[0] / y.value_counts()[1]
model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    scale_pos_weight=pos_ratio,
    learning_rate=0.05,
    n_estimators=500,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# Stratified CV probability estimates
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_proba = cross_val_predict(model, X, y, cv=skf, method='predict_proba')[:, 1]

# Find optimal threshold for F1
thresholds = np.linspace(0.1, 0.9, 81)
f1_scores = [f1_score(y, (y_proba >= t).astype(int)) for t in thresholds]
best_idx = np.argmax(f1_scores)
best_thresh = thresholds[best_idx]
best_f1 = f1_scores[best_idx]

print(f"Best CV F1: {best_f1:.4f} at threshold {best_thresh:.2f}")

Best CV F1: 0.3365 at threshold 0.10


In [8]:
# Train final model on full data
model.fit(X, y)

# Predict on test with threshold
test_proba = model.predict_proba(X_test)[:, 1]
test_pred = (test_proba >= best_thresh).astype(int)

# Build and save submission
submission = pd.DataFrame({'age_group': test_pred})
submission.to_csv("submission.csv", index=False)

# Report classification metrics on CV
y_pred_cv = (y_proba >= best_thresh).astype(int)
print("\nClassification Report on CV:")
print(classification_report(y, y_pred_cv, target_names=['Adult', 'Senior']))


Classification Report on CV:
              precision    recall  f1-score   support

       Adult       0.90      0.56      0.69      1638
      Senior       0.22      0.67      0.34       314

    accuracy                           0.58      1952
   macro avg       0.56      0.61      0.51      1952
weighted avg       0.79      0.58      0.63      1952

