# Health Prediction ML Pipeline


## Imports and Configuration


In [2]:
# Core libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ML libraries
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.metrics import (
    precision_recall_curve, classification_report, f1_score
)
from sklearn.utils.class_weight import compute_class_weight

# Models
from sklearn.ensemble import (
    RandomForestClassifier,
    ExtraTreesClassifier,
    HistGradientBoostingClassifier
)
import xgboost as xgb

# Configuration: constants shared by the cells below.
RANDOM_STATE = 42  # seed handed to train_test_split and to every model
TEST_SIZE = 0.3  # fraction of the training data held out as the validation split
DATA_PATH = "data/"  # prefix (trailing slash included) prepended to train.csv / test.csv


## Helper Functions


In [3]:
def load_and_clean_data(train_path, test_path):
    """Read the train/test CSV files and drop columns that carry no data.

    A column is removed when every value in the *training* frame is NaN.
    The same set of columns is then removed from the test frame, so both
    frames keep a consistent feature schema. A column that happens to be
    empty only in the test file is kept (as all-NaN), because a
    preprocessor fitted on the training frame still expects it at
    transform time.

    Parameters
    ----------
    train_path, test_path : str or path-like
        Locations of the CSV files, passed straight to ``pd.read_csv``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The cleaned ``(train, test)`` frames.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    # Decide on the training data only; `mean() == 1.0` keeps the original
    # behaviour of dropping nothing when the frame has zero rows.
    all_nan_cols = train.columns[train.isna().mean() == 1.0]

    train = train.drop(columns=all_nan_cols)
    # errors='ignore': a column absent from the test file is simply skipped.
    test = test.drop(columns=all_nan_cols, errors='ignore')

    print(f"Train: {train.shape} | Test: {test.shape}")
    print(f"Removed {len(all_nan_cols)} columns")
    return train, test


def prepare_data(df, target_col='TARGET'):
    """Separate a frame into features and target.

    The target is cast to ``int`` when its dtype is bool, or when every
    distinct value compares equal to ``True`` or ``False`` (plain 0/1
    values satisfy this as well). Any other target is returned untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``target_col``.
    target_col : str, default 'TARGET'
        Name of the label column.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        ``(X, y)`` where ``X`` is ``df`` without ``target_col``.
    """
    target = df[target_col]
    # `or` short-circuits, so the unique-value scan only runs for non-bool dtypes.
    looks_binary = target.dtype == bool or set(pd.unique(target)).issubset({True, False})
    if looks_binary:
        target = target.astype(int)
    features = df.drop(columns=[target_col])
    return features, target


def create_preprocessor(X, encoding='ordinal'):
    """Build an unfitted ColumnTransformer tailored to the columns of ``X``.

    Numeric columns (``np.number`` dtypes) are median-imputed. Every other
    column is imputed with its most frequent value and then encoded.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose dtypes decide which columns are numeric vs. categorical.
    encoding : str, default 'ordinal'
        ``'ordinal'`` selects an OrdinalEncoder that maps unseen categories
        to -1; any other value selects a OneHotEncoder that ignores unseen
        categories.

    Returns
    -------
    sklearn.compose.ColumnTransformer
        The unfitted transformer; columns not listed are dropped.
    """
    num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

    # Only the encoder differs between the two modes; the imputer is shared.
    if encoding == 'ordinal':
        encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    else:
        encoder = OneHotEncoder(handle_unknown='ignore')

    categorical_steps = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', encoder),
    ])
    numeric_steps = SimpleImputer(strategy='median')

    column_transformer = ColumnTransformer(
        transformers=[
            ('num', numeric_steps, num_cols),
            ('cat', categorical_steps, cat_cols),
        ],
        remainder='drop',
    )
    print(f"Features: {len(num_cols)} numerical, {len(cat_cols)} categorical")
    return column_transformer


def find_best_threshold(y_true, y_proba):
    """Pick the probability cut-off that maximises F1.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_proba : array-like
        Scores for the positive class.

    Returns
    -------
    tuple of (float, float)
        ``(threshold, f1)`` for the candidate threshold with the highest F1.
    """
    precision, recall, thresholds = precision_recall_curve(y_true, y_proba)

    # Drop the trailing precision/recall entry so both arrays are indexed
    # the same way as `thresholds`.
    p = precision[:-1]
    r = recall[:-1]

    # The 1e-12 term keeps the division defined when precision + recall == 0.
    f1_scores = 2 * (p * r) / (p + r + 1e-12)

    winner = int(np.nanargmax(f1_scores))
    return float(thresholds[winner]), float(f1_scores[winner])


def evaluate_model(model, X_val, y_val, model_name='Model'):
    """Score a fitted classifier on validation data at its F1-optimal threshold.

    The threshold is chosen by ``find_best_threshold`` on the same
    ``(X_val, y_val)`` data that the printed report is computed on.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict_proba``; column 1 is read as
        the positive-class probability.
    X_val, y_val
        Validation features and labels.
    model_name : str, default 'Model'
        Label used in the printed header.

    Returns
    -------
    tuple of (float, float)
        ``(best_threshold, best_f1)``.
    """
    positive_scores = model.predict_proba(X_val)[:, 1]
    best_thr, best_f1 = find_best_threshold(y_val, positive_scores)

    # Binarise at the selected cut-off for the per-class report.
    hard_labels = (positive_scores >= best_thr).astype(int)

    print(f"\n{model_name}: threshold={best_thr:.4f}, F1={best_f1:.4f}")
    print(classification_report(y_val, hard_labels, digits=3))
    return best_thr, best_f1


def export_predictions(model, X_test, test_df, threshold, filename):
    """Predict binary labels for ``X_test`` and write them to a CSV file.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict_proba``; column 1 is read as
        the positive-class probability.
    X_test
        Features passed to ``model.predict_proba``.
    test_df : pandas.DataFrame
        Frame supplying the identifier column: ``'ID'`` when present,
        otherwise the last column of the frame.
    threshold : float
        Scores greater than or equal to this value are labelled 1.
    filename : str or path-like
        Destination CSV path (written without the index).

    Returns
    -------
    pandas.DataFrame
        Two columns: the identifier column and ``'pred'``.
    """
    positive_scores = model.predict_proba(X_test)[:, 1]
    hard_labels = (positive_scores >= threshold).astype(int)

    if 'ID' in test_df.columns:
        id_col = 'ID'
    else:
        # No explicit ID column: fall back to the last column of the frame.
        id_col = test_df.columns[-1]

    output = pd.DataFrame({id_col: test_df[id_col], 'pred': hard_labels})
    output.to_csv(filename, index=False)
    print(f"Predictions saved: {filename}")
    return output


## Data Loading


In [4]:
# Load both CSV files from DATA_PATH; all-NaN columns are dropped by the helper.
train_df, test_df = load_and_clean_data(
    f"{DATA_PATH}train.csv",
    f"{DATA_PATH}test.csv",
)
# shape[1] counts every remaining column of the training frame.
print(f"Features: {train_df.shape[1]}, Samples: {train_df.shape[0]}")


Train: (225000, 322) | Test: (75000, 321)
Removed 3 columns
Features: 322, Samples: 225000


In [5]:
target_col = 'TARGET'
target_counts = train_df[target_col].value_counts()

# Count each class by value instead of indexing target_counts with [0] / [1].
# When the counts are not indexed by the integers 0 and 1 (e.g. a boolean
# target gives a False/True index), integer keys fall back to positional
# lookup, which pandas has deprecated and which returns counts in frequency
# order rather than by class. Comparing against 0 / 1 works for bool, int and
# float targets alike.
n_negative = int((train_df[target_col] == 0).sum())
n_positive = int((train_df[target_col] == 1).sum())
print(f"Target: 0={n_negative:,}, 1={n_positive:,}, ratio={n_positive/target_counts.sum():.2%}")


Target: 0=204,861, 1=20,139, ratio=8.95%


  print(f"Target: 0={target_counts[0]:,}, 1={target_counts[1]:,}, ratio={target_counts[1]/target_counts.sum():.2%}")


In [6]:
# Coerce every column to numeric; values that cannot be parsed become NaN.
df_num = train_df.apply(pd.to_numeric, errors='coerce')

# Take the target's column of the correlation matrix, drop the target's own
# entry, and rank the remaining columns by absolute correlation.
target_corr = df_num.corr()[target_col]
corr_with_target = (
    target_corr
    .drop(target_col)
    .abs()
    .sort_values(ascending=False)
)

print("Top 10 correlated features:")
print(corr_with_target.head(10))


Top 10 correlated features:
_AGE80      0.231371
_AGEG5YR    0.222521
GENHLTH     0.222041
_AGE_G      0.214506
COLGSEX1    0.206725
EMPLOY1     0.203364
_HCVU652    0.194126
_AGE65YR    0.184403
_DRDXAR2    0.169732
_PACKYRS    0.168462
Name: TARGET, dtype: float64


## Model Training


In [7]:
# NOTE(review): only the target column is removed here, so any identifier
# column (the exported predictions show an 'ID' column in test_df) presumably
# remains among the model features - confirm whether that is intended.
X_all, y_all = prepare_data(train_df, target_col)
X_test_final = test_df.drop(columns=[target_col], errors='ignore')

# Stratified hold-out split so both parts keep the same class ratio.
X_train, X_val, y_train, y_val = train_test_split(
    X_all,
    y_all,
    test_size=TEST_SIZE,
    stratify=y_all,
    random_state=RANDOM_STATE,
)
print(f"Split: {X_train.shape[0]:,} train, {X_val.shape[0]:,} validation")


Split: 157,500 train, 67,500 validation


### Setup


In [8]:
# Registry filled by the training cells: model name -> (model, threshold, F1, ...).
models_results = {}

# 'balanced' class weights computed on the training labels, then expanded to
# one weight per training row.
classes = np.array([0, 1])
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
weight_by_class = dict(zip(classes, class_weights))
sample_weights = y_train.map(weight_by_class).values


In [9]:
print("Training Random Forest...")
preprocessor_rf = create_preprocessor(X_train, encoding='onehot')

# Class imbalance is handled inside the forest via per-bootstrap class weights.
rf_classifier = RandomForestClassifier(
    n_estimators=300,
    max_depth=12,
    min_samples_leaf=2,
    max_features='sqrt',
    class_weight='balanced_subsample',
    n_jobs=-1,
    random_state=RANDOM_STATE,
)
rf_model = Pipeline([
    ('preprocessor', preprocessor_rf),
    ('classifier', rf_classifier),
])
rf_model.fit(X_train, y_train)

# Tune the decision threshold on the validation split and record the result.
rf_threshold, rf_f1 = evaluate_model(rf_model, X_val, y_val, model_name='RandomForest')
models_results['RandomForest'] = (rf_model, rf_threshold, rf_f1)


Training Random Forest...
Features: 321 numerical, 0 categorical

RandomForest: threshold=0.6090, F1=0.3932
              precision    recall  f1-score   support

           0      0.950     0.890     0.919     61458
           1      0.316     0.520     0.393      6042

    accuracy                          0.856     67500
   macro avg      0.633     0.705     0.656     67500
weighted avg      0.893     0.856     0.872     67500



In [10]:
print("Training Extra Trees...")
preprocessor_et = create_preprocessor(X_train, encoding='ordinal')

# Each tree is fitted on a bootstrap sample of 80% of the rows.
et_classifier = ExtraTreesClassifier(
    n_estimators=400,
    max_depth=18,
    min_samples_leaf=2,
    max_features='sqrt',
    class_weight='balanced_subsample',
    bootstrap=True,
    max_samples=0.8,
    n_jobs=-1,
    random_state=RANDOM_STATE,
)
et_model = Pipeline([
    ('preprocessor', preprocessor_et),
    ('classifier', et_classifier),
])
et_model.fit(X_train, y_train)

# Tune the decision threshold on the validation split and record the result.
et_threshold, et_f1 = evaluate_model(et_model, X_val, y_val, model_name='ExtraTrees')
models_results['ExtraTrees'] = (et_model, et_threshold, et_f1)


Training Extra Trees...
Features: 321 numerical, 0 categorical

ExtraTrees: threshold=0.5203, F1=0.3609
              precision    recall  f1-score   support

           0      0.952     0.847     0.896     61458
           1      0.265     0.564     0.361      6042

    accuracy                          0.821     67500
   macro avg      0.609     0.705     0.628     67500
weighted avg      0.890     0.821     0.848     67500



In [11]:
print("Training Histogram Gradient Boosting...")
preprocessor_hgb = create_preprocessor(X_train, encoding='ordinal')

hgb_classifier = HistGradientBoostingClassifier(
    max_iter=300,
    learning_rate=0.06,
    max_depth=6,
    min_samples_leaf=20,
    l2_regularization=1.0,
    early_stopping=True,
    validation_fraction=0.1,
    random_state=RANDOM_STATE,
)
hgb_model = Pipeline([
    ('preprocessor', preprocessor_hgb),
    ('classifier', hgb_classifier),
])

# This estimator is given the precomputed per-row weights; the
# `classifier__` prefix routes them to the final pipeline step.
hgb_model.fit(X_train, y_train, classifier__sample_weight=sample_weights)

hgb_threshold, hgb_f1 = evaluate_model(hgb_model, X_val, y_val, model_name='HistGradientBoosting')
models_results['HistGradientBoosting'] = (hgb_model, hgb_threshold, hgb_f1)


Training Histogram Gradient Boosting...
Features: 321 numerical, 0 categorical

HistGradientBoosting: threshold=0.7343, F1=0.4177
              precision    recall  f1-score   support

           0      0.950     0.905     0.927     61458
           1      0.350     0.519     0.418      6042

    accuracy                          0.871     67500
   macro avg      0.650     0.712     0.672     67500
weighted avg      0.897     0.871     0.882     67500



In [12]:
print("Training XGBoost...")
# The native xgboost API is used here, so preprocessing is applied by hand
# rather than through a sklearn Pipeline.
preprocessor_xgb = create_preprocessor(X_train, encoding='ordinal')
X_train_xgb = preprocessor_xgb.fit_transform(X_train)
X_val_xgb = preprocessor_xgb.transform(X_val)

# Negative-to-positive ratio; max(..., 1) guards against a split with no positives.
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()
scale_pos_weight = n_negative / max(n_positive, 1)

dtrain = xgb.DMatrix(X_train_xgb, label=y_train)
dval = xgb.DMatrix(X_val_xgb, label=y_val)

params = {
    'objective': 'binary:logistic',
    'eval_metric': 'aucpr',
    'eta': 0.03,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'scale_pos_weight': scale_pos_weight,
    'seed': RANDOM_STATE,
}

xgb_model = xgb.train(
    params,
    dtrain,
    num_boost_round=2000,
    evals=[(dtrain, 'train'), (dval, 'eval')],
    early_stopping_rounds=100,
    verbose_eval=False,
)

# NOTE(review): whether Booster.predict uses only the best early-stopping
# iteration by default depends on the installed xgboost version - confirm,
# and pass iteration_range wherever this booster predicts if it does not.
y_proba_xgb = xgb_model.predict(dval)
xgb_threshold, xgb_f1 = find_best_threshold(y_val, y_proba_xgb)
y_pred_xgb = (y_proba_xgb >= xgb_threshold).astype(int)
print(f"XGBoost: threshold={xgb_threshold:.4f}, F1={xgb_f1:.4f}")
print(classification_report(y_val, y_pred_xgb, digits=3))

# Unlike the sklearn pipelines, the fitted preprocessor is stored as a 4th item.
models_results['XGBoost'] = (xgb_model, xgb_threshold, xgb_f1, preprocessor_xgb)


Training XGBoost...
Features: 321 numerical, 0 categorical
XGBoost: threshold=0.7276, F1=0.4159
              precision    recall  f1-score   support

           0      0.949     0.910     0.929     61458
           1      0.355     0.502     0.416      6042

    accuracy                          0.874     67500
   macro avg      0.652     0.706     0.673     67500
weighted avg      0.896     0.874     0.883     67500



### Model Selection


In [13]:
# Each registry entry is (model, threshold, f1, ...); rank by the F1 at index 2.
best_model_name = max(models_results, key=lambda key: models_results[key][2])
best_model_info = models_results[best_model_name]

print(f"\nBest model: {best_model_name} (F1={best_model_info[2]:.4f}, threshold={best_model_info[1]:.4f})")
print("\nAll results:")
for name, info in models_results.items():
    # Entries may carry extra trailing items, so read by position.
    threshold, f1 = info[1], info[2]
    print(f"  {name:20} F1={f1:.4f}  threshold={threshold:.4f}")



Best model: HistGradientBoosting (F1=0.4177, threshold=0.7343)

All results:
  RandomForest         F1=0.3932  threshold=0.6090
  ExtraTrees           F1=0.3609  threshold=0.5203
  HistGradientBoosting F1=0.4177  threshold=0.7343
  XGBoost              F1=0.4159  threshold=0.7276


## Predictions


In [14]:
best_model, best_threshold = best_model_info[0], best_model_info[1]

if best_model_name != 'XGBoost':
    # sklearn pipelines preprocess internally, so the shared helper can be used.
    output = export_predictions(best_model, X_test_final, test_df, best_threshold, 'predictions_final.csv')
else:
    # The native booster needs the stored preprocessor applied and a DMatrix input.
    preprocessor = best_model_info[3]
    X_test_transformed = preprocessor.transform(X_test_final)
    dtest = xgb.DMatrix(X_test_transformed)
    y_test_pred = (best_model.predict(dtest) >= best_threshold).astype(int)

    if 'ID' in test_df.columns:
        id_col = 'ID'
    else:
        id_col = test_df.columns[-1]
    output = pd.DataFrame({id_col: test_df[id_col], 'pred': y_test_pred})
    output.to_csv('predictions_final.csv', index=False)

print(f"Saved predictions_final.csv using {best_model_name}")
print(output.head(10))


Predictions saved: predictions_final.csv
Saved predictions_final.csv using HistGradientBoosting
       ID  pred
0  225000     0
1  225001     0
2  225002     0
3  225003     0
4  225004     0
5  225005     0
6  225006     0
7  225007     0
8  225008     0
9  225009     1


## Summary

Minimalistic pipeline for health prediction:
- Data loading and preprocessing
- Model training (RandomForest, ExtraTrees, HistGradientBoosting, XGBoost)
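- Threshold optimization for F1 score (each threshold is tuned and scored on the same validation split, so the reported F1 values are optimistic estimates)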
- Model comparison and prediction export
