In [1]:
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, OneHotEncoder
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import precision_recall_curve, roc_auc_score
from xgboost import XGBClassifier

In [2]:
# -----------------------------------
# Load training features and labels
# -----------------------------------
# Let pandas display up to 100 columns when a frame is shown.
pd.set_option("display.max_columns", 100)

# Both CSVs use respondent_id as the index.
X = pd.read_csv(
    'training_set_features.csv',
    index_col='respondent_id',
)
y = pd.read_csv(
    'training_set_labels.csv',
    index_col='respondent_id',
)

In [3]:
# -----------------------------------
# Feature groups, by how each column is preprocessed
# -----------------------------------
# Columns handled by the numeric pipeline. Order is significant: it fixes the
# column order of the transformed matrix.
numeric_features = [
    # h1n1_* columns
    'h1n1_concern',
    'h1n1_knowledge',
    # behavioral_* columns
    'behavioral_antiviral_meds',
    'behavioral_avoidance',
    'behavioral_face_mask',
    'behavioral_wash_hands',
    'behavioral_large_gatherings',
    'behavioral_outside_home',
    'behavioral_touch_face',
    # doctor_recc_* columns
    'doctor_recc_h1n1',
    'doctor_recc_seasonal',
    # health-related columns
    'chronic_med_condition',
    'child_under_6_months',
    'health_worker',
    'health_insurance',
    # opinion_* columns
    'opinion_h1n1_vacc_effective',
    'opinion_h1n1_risk',
    'opinion_h1n1_sick_from_vacc',
    'opinion_seas_vacc_effective',
    'opinion_seas_risk',
    'opinion_seas_sick_from_vacc',
    # household_* columns
    'household_adults',
    'household_children',
]

# Ordered categoricals: each column maps to its levels, listed lowest to highest.
# Keeping both in one mapping guarantees the feature list and the category list
# stay aligned position by position.
ordinal_levels = {
    'education': ['< 12 Years', '12 Years', 'Some College', 'College Graduate'],
    'income_poverty': ['Below Poverty', '<= $75,000, Above Poverty', '> $75,000'],
}
ordinal_features = list(ordinal_levels)
ordinal_categories = list(ordinal_levels.values())

# Unordered categoricals, one-hot encoded downstream.
nominal_features = [
    'age_group',
    'race',
    'sex',
    'marital_status',
    'rent_or_own',
    'employment_status',
    'hhs_geo_region',
    'census_msa',
    'employment_industry',
    'employment_occupation',
]

In [4]:
# -----------------------------------
# Preprocessing pipeline
# -----------------------------------
# Numeric columns: fill missing values with the column mean, then min-max scale.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
])

# Ordered categoricals: fill missing values with the most frequent level, then
# encode using the explicit level order in ordinal_categories.
ordinal_encoder = OrdinalEncoder(categories=ordinal_categories)
ordinal_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', ordinal_encoder),
])

# Unordered categoricals: same imputation, then one-hot encode.
# handle_unknown='ignore' avoids an error on categories not seen during fit.
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
nominal_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', onehot_encoder),
])

# Route each feature group to its sub-pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numeric_features),
    ('ord', ordinal_pipeline, ordinal_features),
    ('nom', nominal_pipeline, nominal_features),
])

In [5]:
# -----------------------------------
# Train/validation split
# -----------------------------------
# Hold out 20% of the rows for validation. Stratification uses the
# h1n1_vaccine column only; the seasonal_vaccine balance is not enforced.
stratify_labels = y['h1n1_vaccine']
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=stratify_labels,
)

In [6]:
# -----------------------------------
# Optuna objective: cross-validated ROC AUC averaged over both targets
# -----------------------------------
def objective(trial):
    """Score one Optuna trial.

    Samples XGBoost hyperparameters from ``trial`` and cross-validates a
    preprocessing + ``XGBClassifier`` pipeline on ``X_train`` once per label
    column, using the same 5-fold stratified splitter configuration for both.

    Args:
        trial: Optuna trial used to draw the hyperparameters.

    Returns:
        The unweighted mean of the two targets' mean fold ROC AUC.
    """
    # Keyword arguments are evaluated left to right, so the suggest_* calls
    # happen in the order written here.
    params = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 300),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
        subsample=trial.suggest_float('subsample', 0.6, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.6, 1.0),
        gamma=trial.suggest_float('gamma', 0, 5),
        reg_lambda=trial.suggest_float('reg_lambda', 0, 5),
        reg_alpha=trial.suggest_float('reg_alpha', 0, 5),
        # Fixed (non-tuned) settings.
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42,
    )

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    mean_aucs = []
    for target in ('h1n1_vaccine', 'seasonal_vaccine'):
        # Preprocessing sits inside the pipeline so it is re-fit on each
        # training fold by cross_val_score.
        candidate = Pipeline([
            ('preprocessing', preprocessor),
            ('clf', XGBClassifier(**params)),
        ])
        fold_aucs = cross_val_score(
            candidate, X_train, y_train[target], cv=folds, scoring='roc_auc'
        )
        mean_aucs.append(fold_aucs.mean())

    auc_h1, auc_se = mean_aucs
    return (auc_h1 + auc_se) / 2

In [7]:
# -----------------------------------
# Run Optuna search
# -----------------------------------
# Seed the sampler explicitly: without it the sampler is seeded differently on
# every run, so the trial sequence and best_params would not be reproducible
# on Restart & Run All. TPESampler is Optuna's default for single-objective
# studies, so the search strategy itself is unchanged.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)

# Best hyperparameters found; reused by the cells that build the final models.
best_params = study.best_params
print("Best params:", best_params)

[I 2025-06-21 19:33:56,236] A new study created in memory with name: no-name-9e9d1720-e215-42ed-81aa-7adb113140e1
[I 2025-06-21 19:35:20,810] Trial 0 finished with value: 0.86111583260454 and parameters: {'n_estimators': 263, 'max_depth': 4, 'learning_rate': 0.014612828424548438, 'subsample': 0.9312047917128816, 'colsample_bytree': 0.6237667885557325, 'gamma': 3.525928073436158, 'reg_lambda': 0.31683478882008786, 'reg_alpha': 0.37477836011063026}. Best is trial 0 with value: 0.86111583260454.
[I 2025-06-21 19:38:53,605] Trial 1 finished with value: 0.8550177961029899 and parameters: {'n_estimators': 225, 'max_depth': 10, 'learning_rate': 0.28915014401018646, 'subsample': 0.6282126455892263, 'colsample_bytree': 0.8174092773188386, 'gamma': 3.704070061648582, 'reg_lambda': 4.814561918972462, 'reg_alpha': 1.8257859780974517}. Best is trial 0 with value: 0.86111583260454.
[I 2025-06-21 19:40:07,812] Trial 2 finished with value: 0.8659815669100088 and parameters: {'n_estimators': 104, 'max_

[I 2025-06-21 20:15:45,017] Trial 22 finished with value: 0.8659847890294456 and parameters: {'n_estimators': 122, 'max_depth': 6, 'learning_rate': 0.04640302378044879, 'subsample': 0.6892493480552125, 'colsample_bytree': 0.8502789368352034, 'gamma': 2.9540019680053864, 'reg_lambda': 3.0941198473884506, 'reg_alpha': 2.6769442200023406}. Best is trial 12 with value: 0.866927230492754.
[I 2025-06-21 20:17:24,847] Trial 23 finished with value: 0.8663208444305232 and parameters: {'n_estimators': 162, 'max_depth': 7, 'learning_rate': 0.09474759315705908, 'subsample': 0.7421397173996406, 'colsample_bytree': 0.8013289769934355, 'gamma': 2.28276724351103, 'reg_lambda': 3.9609725522218695, 'reg_alpha': 3.5705129642771984}. Best is trial 12 with value: 0.866927230492754.
[I 2025-06-21 20:20:15,552] Trial 24 finished with value: 0.863330450167511 and parameters: {'n_estimators': 207, 'max_depth': 9, 'learning_rate': 0.16268177005790657, 'subsample': 0.8355934565044936, 'colsample_bytree': 0.94432

Best params: {'n_estimators': 147, 'max_depth': 8, 'learning_rate': 0.07951654984477956, 'subsample': 0.7089171665291802, 'colsample_bytree': 0.8575052170741511, 'gamma': 2.717372569126357, 'reg_lambda': 4.354361264346143, 'reg_alpha': 3.1318037720056555}


In [8]:
# -----------------------------------
# Final model trained on training set
# -----------------------------------
# One XGBoost configuration (tuned params plus the fixed settings) is wrapped in
# MultiOutputClassifier so both label columns are fitted from a single pipeline.
tuned_xgb = XGBClassifier(
    **best_params,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)
final_model = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('classifier', MultiOutputClassifier(tuned_xgb)),
])
# Kept as the last expression so the fitted pipeline is displayed by the cell.
final_model.fit(X_train, y_train)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   MinMaxScaler())]),
                                                  ['h1n1_concern',
                                                   'h1n1_knowledge',
                                                   'behavioral_antiviral_meds',
                                                   'behavioral_avoidance',
                                                   'behavioral_face_mask',
                                                   'behavioral_wash_hands',
                                                   'behavioral_large_gatherings',
                                                   'behavio

In [9]:
# -----------------------------------
# Threshold tuning using cross_val_predict
# -----------------------------------
def find_best_threshold(y_true, probs):
    """Pick the decision threshold with the highest F1 on the precision-recall curve.

    Args:
        y_true: Ground-truth binary labels.
        probs: Positive-class scores aligned with ``y_true``.

    Returns:
        The element of the ``thresholds`` array returned by
        ``precision_recall_curve`` at the index where F1 is largest.
    """
    precision, recall, cutoffs = precision_recall_curve(y_true, probs)
    # The 1e-8 term keeps the denominator non-zero when precision and recall are both 0.
    denominator = precision + recall + 1e-8
    f1 = 2 * (precision * recall) / denominator
    # NOTE(review): per the scikit-learn docs, precision/recall carry one more
    # point than cutoffs (a final recall=0 point with no threshold). Its F1 is 0,
    # so argmax is not expected to select it - confirm if inputs can be degenerate.
    return cutoffs[np.argmax(f1)]

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


def _build_tuned_pipeline():
    """Return an unfitted preprocessing + XGBClassifier pipeline using best_params."""
    tuned_clf = XGBClassifier(
        **best_params,
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42,
    )
    return Pipeline([
        ('preprocessing', preprocessor),
        ('clf', tuned_clf),
    ])


model_h1n1 = _build_tuned_pipeline()
model_seasonal = _build_tuned_pipeline()

# Out-of-fold class probabilities on the training split; column 1 is kept as the
# positive-class score, so thresholds are chosen on rows the fold model did not see.
h1n1_oof = cross_val_predict(
    model_h1n1, X_train, y_train['h1n1_vaccine'], cv=skf, method='predict_proba'
)
seasonal_oof = cross_val_predict(
    model_seasonal, X_train, y_train['seasonal_vaccine'], cv=skf, method='predict_proba'
)
h1n1_probs = h1n1_oof[:, 1]
seasonal_probs = seasonal_oof[:, 1]

# F1-maximising cut-off per target.
best_thresh_h1n1 = find_best_threshold(y_train['h1n1_vaccine'], h1n1_probs)
best_thresh_seasonal = find_best_threshold(y_train['seasonal_vaccine'], seasonal_probs)

print(f"Tuned thresholds → h1n1: {best_thresh_h1n1:.4f}, seasonal: {best_thresh_seasonal:.4f}")

Tuned thresholds → h1n1: 0.3237, seasonal: 0.3561


In [10]:
# -----------------------------------
# Evaluate on validation set
# -----------------------------------
# val_probs is indexed as [target][row, class]; column 1 is used as the
# positive-class score for each target.
val_probs = final_model.predict_proba(X_val)
h1n1_val_scores = val_probs[0][:, 1]
seasonal_val_scores = val_probs[1][:, 1]

# Hard 0/1 labels at the tuned thresholds (not used by the ROC AUC report below).
val_h1 = np.greater_equal(h1n1_val_scores, best_thresh_h1n1).astype(int)
val_se = np.greater_equal(seasonal_val_scores, best_thresh_seasonal).astype(int)

# ROC AUC is computed on the raw scores, so it does not depend on the thresholds.
print("Validation ROC AUC")
print("h1n1:", roc_auc_score(y_val['h1n1_vaccine'], h1n1_val_scores))
print("seasonal:", roc_auc_score(y_val['seasonal_vaccine'], seasonal_val_scores))

Validation ROC AUC
h1n1: 0.864086392618135
seasonal: 0.8645521863363819


In [12]:
# Load the test features, indexed by respondent_id like the training features
X_test = pd.read_csv(
    'test_set_features.csv',
    index_col='respondent_id',
)

In [13]:
# -----------------------------------
# Retrain on full data and predict on test
# -----------------------------------
# Refit the same pipeline on all labelled rows (train + validation).
final_model.fit(X, y)

# test_probs is indexed as [target][row, class]; column 1 is the positive-class score.
test_probs = final_model.predict_proba(X_test)
h1n1_test_scores = test_probs[0][:, 1]
seasonal_test_scores = test_probs[1][:, 1]

# NOTE(review): validation is scored with ROC AUC on probabilities, but these
# outputs are thresholded to 0/1 - confirm the submission is meant to carry hard
# labels rather than the probabilities themselves.
final_h1 = np.greater_equal(h1n1_test_scores, best_thresh_h1n1).astype(int)
final_se = np.greater_equal(seasonal_test_scores, best_thresh_seasonal).astype(int)

In [14]:
# -----------------------------------
# Load the submission template and check row alignment
# -----------------------------------

# Template frame, indexed by respondent_id
submission_df = pd.read_csv(
    "./submission_format _xgboost.csv",
    index_col="respondent_id",
)

# Predictions are written by position later, so the template must list the
# same respondent_ids in the same order as X_test; this raises otherwise.
template_ids = submission_df.index.values
np.testing.assert_array_equal(X_test.index.values, template_ids)

In [15]:
# Write each target's test predictions into its submission column
for column, labels in (
    ("h1n1_vaccine", final_h1),
    ("seasonal_vaccine", final_se),
):
    submission_df[column] = labels

# Show the first rows of the filled-in frame
submission_df.head()

Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
26707,0,0
26708,0,0
26709,0,1
26710,1,1
26711,1,1


In [16]:
# Write the submission to disk, keeping respondent_id as the first column.
# NOTE(review): this is the same path the template was loaded from, so the
# template file is overwritten - confirm that is intended.
submission_df.to_csv(
    'submission_format _xgboost.csv',
    index=True,
)

In [18]:
# Read the written file back and print its first and last five rows as a check
df = pd.read_csv('submission_format _xgboost.csv')
for preview in (df.head(5), df.tail(5)):
    print(preview)

   respondent_id  h1n1_vaccine  seasonal_vaccine
0          26707             0                 0
1          26708             0                 0
2          26709             0                 1
3          26710             1                 1
4          26711             1                 1
       respondent_id  h1n1_vaccine  seasonal_vaccine
26703          53410             0                 1
26704          53411             0                 0
26705          53412             0                 0
26706          53413             0                 0
26707          53414             1                 1
