# Feature Engineering — Frequency Encoding + K-Fold Target Encoding

Goal:
- Create leak-safe encodings
  - Frequency encoding (train+test combined)
  - K-fold target encoding (out-of-fold for train rows; fit on the full training set for test rows)
- Train LightGBM on engineered features
- Save OOF/test preds + submission


imports + load

In [1]:
from __future__ import annotations
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import lightgbm as lgb


In [2]:
# Per the original note, the working directory is notebooks/, so its parent
# is taken as the repository root.
ROOT = Path.cwd().resolve().parent
DATA_DIR = ROOT.joinpath("data", "raw")
REPORTS_DIR = ROOT.joinpath("reports")
REPORTS_DIR.mkdir(parents=True, exist_ok=True)

# Load the three competition files in one pass.
train, test, sub = (
    pd.read_csv(DATA_DIR / csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# The sample submission defines which column is the id and which is the target.
id_col, target_col = sub.columns[0], sub.columns[1]

# Binary label: 1 where the target column reads "Presence", else 0.
y = train[target_col].eq("Presence").astype(int).to_numpy()

cat_cols = [
    'Sex',
    'FBS over 120',
    'Exercise angina',
    'EKG results',
    'Slope of ST',
    'Thallium',
    'Number of vessels fluro',
    'Chest pain type',
]
num_cols = [
    'Age',
    'BP',
    'Cholesterol',
    'Max HR',
    'ST depression',
]
feature_cols = [*cat_cols, *num_cols]

# Independent copies so later feature engineering never touches train/test.
X_raw = train.loc[:, feature_cols].copy()
X_test_raw = test.loc[:, feature_cols].copy()

print("train:", X_raw.shape, "test:", X_test_raw.shape, "pos_rate:", y.mean())


train: (630000, 13) test: (270000, 13) pos_rate: 0.44833968253968254


Frequency encoding (safe to compute on train+test together, because it never looks at the target)

Idea: for each column value, compute how common it is in the overall population.
This often helps tree models because it gives them a smooth numeric signal.

In [3]:
def freq_encode(train_df: pd.DataFrame, test_df: pd.DataFrame, cols: list[str]):
    """Encode each column by the relative frequency of its values.

    Frequencies are measured on the row-wise concatenation of
    ``train_df[cols]`` and ``test_df[cols]``, so both outputs share one
    frequency table per column. The target is never consulted.

    Args:
        train_df: Training features; must contain every name in ``cols``.
        test_df: Test features; must contain every name in ``cols``.
        cols: Columns to encode.

    Returns:
        ``(train_encoded, test_encoded)``: two frames indexed like their
        inputs, each holding one float ``"<col>_freq"`` column per entry of
        ``cols``. A value with no entry in the frequency table maps to NaN.
    """
    pooled = pd.concat([train_df[cols], test_df[cols]], axis=0, ignore_index=True)
    encoded_train = pd.DataFrame(index=train_df.index)
    encoded_test = pd.DataFrame(index=test_df.index)

    for col in cols:
        out_name = col + "_freq"
        # Fraction of pooled rows taking each distinct value of this column.
        share = pooled[col].value_counts(normalize=True)
        for source, encoded in ((train_df, encoded_train), (test_df, encoded_test)):
            encoded[out_name] = source[col].map(share).astype(float)

    return encoded_train, encoded_test

# Frequency-encode every column in feature_cols (categorical and numeric alike).
tr_freq, te_freq = freq_encode(X_raw, X_test_raw, feature_cols)
print("freq features:", tr_freq.shape)
tr_freq.head()  # last expression of the cell: shown as the cell output


freq features: (630000, 13)


Unnamed: 0,Sex_freq,FBS over 120_freq,Exercise angina_freq,EKG results_freq,Slope of ST_freq,Thallium_freq,Number of vessels fluro_freq,Chest pain type_freq,Age_freq,BP_freq,Cholesterol_freq,Max HR_freq,ST depression_freq
0,0.715209,0.919993,0.274004,0.508586,0.407499,0.391757,0.086391,0.523133,0.066114,0.00767,0.026622,0.026344,0.005474
1,0.715209,0.919993,0.725996,0.489296,0.567792,0.590871,0.707054,0.045386,0.048451,0.03268,0.005988,0.016952,0.49923
2,0.284791,0.919993,0.725996,0.489296,0.567792,0.590871,0.707054,0.118903,0.033577,0.038563,0.00623,0.01934,0.49923
3,0.284791,0.919993,0.725996,0.489296,0.407499,0.590871,0.707054,0.312578,0.040506,0.008858,0.020896,0.027539,0.045521
4,0.715209,0.919993,0.274004,0.489296,0.407499,0.590871,0.036416,0.523133,0.066114,0.158524,0.046934,0.030951,0.002081


K-fold Target Encoding (leak-safe)

Idea: for each value in a column, compute the mean target rate.
But for training rows, we must compute it using only the other folds (OOF encoding), to avoid leakage.

In [4]:
def kfold_target_encode(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    y: np.ndarray,
    cols: list[str],
    n_splits: int = 5,
    seed: int = 42,
    smoothing: float = 20.0,
):
    """Smoothed target encoding: out-of-fold for train, full-train for test.

    For each column in ``cols`` every distinct value is replaced by

        (mean * count + global_mean * smoothing) / (count + smoothing)

    where ``mean`` and ``count`` are the target mean and row count of that
    value, so rare values are shrunk toward the global target mean.

    Args:
        train_df: Training features; must contain every name in ``cols``.
        test_df: Test features; must contain every name in ``cols``.
        y: Binary/numeric target aligned positionally with ``train_df`` rows.
        cols: Columns to encode.
        n_splits: Number of stratified folds used for the train encoding.
        seed: Random state of the shuffled ``StratifiedKFold``.
        smoothing: Weight of the global mean (in pseudo-rows).

    Returns:
        ``(tr_te, te_te)``:
          tr_te: OOF encodings for train; each row is encoded from statistics
                 of the folds it does not belong to.
          te_te: encodings for test, fitted on the full training set.
        Both frames are indexed like their inputs and hold one float
        ``"<col>_te"`` column per entry of ``cols``. Values unseen when the
        statistics were fitted fall back to the global mean.

    Raises:
        ValueError: if ``y`` and ``train_df`` differ in length.

    Note:
        The shrinkage prior / fallback is the mean of the *whole* ``y`` (as in
        the original implementation), not a per-fold mean.
    """
    # Positional ndarray: a pandas Series would otherwise be label-indexed
    # by the fold index arrays below.
    y = np.asarray(y)
    if len(y) != len(train_df):
        raise ValueError(
            f"y has {len(y)} rows but train_df has {len(train_df)} rows"
        )

    global_mean = float(np.mean(y))

    # The split depends only on y, the row count and the seed, so it is the
    # same for every column: compute it once instead of once per column.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    folds = list(skf.split(train_df, y))

    def _smoothed_means(keys: pd.Series, targets: np.ndarray) -> pd.Series:
        """Map each distinct value of ``keys`` to its smoothed target mean."""
        # Group the targets directly by the key column; this avoids copying
        # the whole frame and avoids a temporary "_y" column that could
        # collide with a real feature name.
        stats = (
            pd.Series(targets, index=keys.index)
            .groupby(keys)
            .agg(["mean", "count"])
        )
        return (stats["mean"] * stats["count"] + global_mean * smoothing) / (
            stats["count"] + smoothing
        )

    tr_te = pd.DataFrame(index=train_df.index)
    te_te = pd.DataFrame(index=test_df.index)

    for c in cols:
        train_col = train_df[c]
        oof_col = np.zeros(len(train_df), dtype=float)

        for tr_idx, va_idx in folds:
            # Statistics come from the fitting folds only ...
            enc = _smoothed_means(train_col.iloc[tr_idx], y[tr_idx])
            # ... and are applied to the held-out fold.
            oof_col[va_idx] = (
                train_col.iloc[va_idx].map(enc).fillna(global_mean).values
            )

        tr_te[c + "_te"] = oof_col

        # Test rows never contributed targets, so they can use all of train.
        enc_full = _smoothed_means(train_col, y)
        te_te[c + "_te"] = test_df[c].map(enc_full).fillna(global_mean).astype(float)

    return tr_te, te_te

# Target-encode every column in feature_cols with 5 stratified folds (seed 42)
# and a smoothing weight of 20.
tr_te, te_te = kfold_target_encode(X_raw, X_test_raw, y, feature_cols, n_splits=5, seed=42, smoothing=20.0)
print("te features:", tr_te.shape)
tr_te.head()  # last expression of the cell: shown as the cell output


te features: (630000, 13)


Unnamed: 0,Sex_te,FBS over 120_te,Exercise angina_te,EKG results_te,Slope of ST_te,Thallium_te,Number of vessels fluro_te,Chest pain type_te,Age_te,BP_te,Cholesterol_te,Max HR_te,ST depression_te
0,0.556009,0.443425,0.805842,0.341249,0.692057,0.815621,0.896225,0.697473,0.573631,0.442373,0.456173,0.442252,0.891576
1,0.555978,0.443555,0.313306,0.559942,0.262694,0.197872,0.303324,0.10804,0.374496,0.456378,0.3973,0.191128,0.269296
2,0.17885,0.443342,0.313703,0.55982,0.262022,0.198211,0.303122,0.16244,0.501026,0.446347,0.287561,0.315335,0.269478
3,0.17885,0.443342,0.313703,0.55982,0.692425,0.198211,0.303122,0.190723,0.302855,0.460374,0.422413,0.520873,0.583259
4,0.555978,0.443555,0.80655,0.559942,0.691692,0.197872,0.898491,0.697874,0.574468,0.481239,0.420032,0.780263,0.87509


Assemble final features

We can include:

freq features

target encoding features

(optional) raw numeric features too (sometimes helps)

Let’s start with encodings only (cleanest).

In [5]:
# Model matrix = frequency encodings followed by target encodings, placed
# side by side; any NaN that survived the encoders is replaced with 0.0.
X_fe = pd.concat([tr_freq, tr_te], axis=1).fillna(0.0)
X_test_fe = pd.concat([te_freq, te_te], axis=1).fillna(0.0)

print("X_fe:", X_fe.shape, "X_test_fe:", X_test_fe.shape)


X_fe: (630000, 26) X_test_fe: (270000, 26)


LightGBM CV on engineered features

In [6]:
# LightGBM hyper-parameters for a binary classifier evaluated with AUC.
params = dict(
    objective="binary",
    metric="auc",
    learning_rate=0.03,
    num_leaves=64,
    min_data_in_leaf=200,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    lambda_l2=10.0,
    verbose=-1,
)

# NOTE(review): these folds use the same StratifiedKFold settings (5 splits,
# shuffle, seed 42) that kfold_target_encode was called with, so the two fold
# layouts appear to coincide. A training row's target encoding was fitted on
# every fold except its own, which includes the current validation fold, so
# validation targets can reach the model indirectly through training features.
# The CV AUC may therefore be slightly optimistic; confirm whether a nested
# encoding is wanted.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold predictions for train and fold-averaged predictions for test.
oof = np.zeros(len(X_fe), dtype=float)
test_pred = np.zeros(len(X_test_fe), dtype=float)
scores = []
best_iters = []

for fold, (tr_idx, va_idx) in enumerate(skf.split(X_fe, y), start=1):
    X_tr, X_va = X_fe.iloc[tr_idx], X_fe.iloc[va_idx]
    y_tr, y_va = y[tr_idx], y[va_idx]

    dtr = lgb.Dataset(X_tr, label=y_tr)
    dva = lgb.Dataset(X_va, label=y_va)

    # Up to 10000 rounds; early stopping (patience 200) monitors the
    # validation set, the only entry in valid_sets.
    model = lgb.train(
        params,
        dtr,
        num_boost_round=10000,
        valid_sets=[dva],
        callbacks=[
            lgb.early_stopping(200, verbose=False),
        ],
    )

    # Predict with the best iteration found by early stopping.
    p_va = model.predict(X_va, num_iteration=model.best_iteration)
    oof[va_idx] = p_va

    auc = roc_auc_score(y_va, p_va)
    scores.append(auc)
    best_iters.append(model.best_iteration)

    # Each fold contributes 1/n_splits of the final test prediction.
    test_pred += model.predict(X_test_fe, num_iteration=model.best_iteration) / skf.n_splits

    print(f"Fold {fold} AUC: {auc:.5f} | best_iter: {model.best_iteration}")

# Mean of per-fold AUCs vs. a single AUC over all out-of-fold predictions.
print("Mean CV AUC:", round(float(np.mean(scores)), 5))
print("OOF  AUC:", round(float(roc_auc_score(y, oof)), 5))
print("Avg best_iter:", int(np.mean(best_iters)))


Fold 1 AUC: 0.95583 | best_iter: 317
Fold 2 AUC: 0.95470 | best_iter: 296
Fold 3 AUC: 0.95543 | best_iter: 333
Fold 4 AUC: 0.95509 | best_iter: 408
Fold 5 AUC: 0.95596 | best_iter: 401
Mean CV AUC: 0.9554
OOF  AUC: 0.95539
Avg best_iter: 351


Save preds + submission

In [7]:
# Persist the raw LightGBM predictions (out-of-fold and test) as .npy files.
np.save(REPORTS_DIR.joinpath("oof_lgbm_tefreq.npy"), oof)
np.save(REPORTS_DIR.joinpath("test_lgbm_tefreq.npy"), test_pred)
print("Saved OOF/test npy")

# Submission frame: test ids next to the fold-averaged test predictions.
submission = pd.DataFrame({id_col: test[id_col], target_col: test_pred})

out_path = REPORTS_DIR.joinpath("sub_lgbm_tefreq.csv")
submission.to_csv(out_path, index=False)

print("Saved:", out_path)
submission.head()


Saved OOF/test npy
Saved: C:\Dev\kaggle-ps-s6e2-heart\reports\sub_lgbm_tefreq.csv


Unnamed: 0,id,Heart Disease
0,630000,0.961009
1,630001,0.009377
2,630002,0.98663
3,630003,0.006866
4,630004,0.196984


Measure correlation between model predictions

In [8]:
import numpy as np

# test_catboost.npy is not written by any cell in this notebook; it has to
# exist in REPORTS_DIR already (presumably from an earlier CatBoost run; confirm).
test_cat = np.load(REPORTS_DIR / "test_catboost.npy")
test_lgb_tefreq = np.load(REPORTS_DIR / "test_lgbm_tefreq.npy")

# Off-diagonal entry of the 2x2 correlation matrix = Pearson correlation
# between the two test prediction vectors.
corr = np.corrcoef(test_cat, test_lgb_tefreq)[0,1]
print("Prediction correlation (test):", corr)


Prediction correlation (test): 0.9985160277556729


Next step: add “Apriori-lite” interaction features + train CatBoost on engineered features

Build interaction features: products of the 8 target-encoded (TE) column pairs with the highest absolute Pearson correlation

In [9]:
import numpy as np
import pandas as pd
from itertools import combinations

# tr_te, te_te already exist from the target-encoding step (cell 4).
# Fill NaNs once up front; both the ranking and the products use these frames.
base = tr_te.fillna(0.0)
te_base = te_te.fillna(0.0)

corr_scores = {}
cols = base.columns.tolist()

# Score every unordered pair of TE columns by absolute Pearson correlation
# on the training rows.
for a, b in combinations(cols, 2):
    # A constant column has zero variance, so corrcoef divides by zero and
    # returns NaN (with a RuntimeWarning, silenced here).
    with np.errstate(invalid="ignore", divide="ignore"):
        r = np.corrcoef(base[a].values, base[b].values)[0, 1]
    # NaN compares False against everything, which would make the ranking
    # below ill-defined; score such pairs as uncorrelated instead.
    corr_scores[(a, b)] = 0.0 if np.isnan(r) else abs(r)

# Keep the 8 most strongly correlated pairs.
top_pairs = sorted(corr_scores, key=corr_scores.get, reverse=True)[:8]
print("Top pairs:")
for p in top_pairs:
    print(p, corr_scores[p])

# add product features
tr_inter = pd.DataFrame(index=tr_te.index)
te_inter = pd.DataFrame(index=te_te.index)

for a, b in top_pairs:
    name = f"{a}_x_{b}"
    # Pairs are chosen on train only; the same pairs are applied to test.
    tr_inter[name] = base[a] * base[b]
    te_inter[name] = te_base[a] * te_base[b]

print("Interactions:", tr_inter.shape)
tr_inter.head()


Top pairs:
('Slope of ST_te', 'ST depression_te') 0.46911681318768345
('Thallium_te', 'Chest pain type_te') 0.3635805478137517
('Exercise angina_te', 'Thallium_te') 0.35705192860969265
('Thallium_te', 'Number of vessels fluro_te') 0.35302016701573574
('Thallium_te', 'Max HR_te') 0.33476567326328555
('Thallium_te', 'ST depression_te') 0.3305191760921282
('Number of vessels fluro_te', 'ST depression_te') 0.3116851234723204
('Slope of ST_te', 'Thallium_te') 0.310597349005315
Interactions: (630000, 8)


Unnamed: 0,Slope of ST_te_x_ST depression_te,Thallium_te_x_Chest pain type_te,Exercise angina_te_x_Thallium_te,Thallium_te_x_Number of vessels fluro_te,Thallium_te_x_Max HR_te,Thallium_te_x_ST depression_te,Number of vessels fluro_te_x_ST depression_te,Slope of ST_te_x_Thallium_te
0,0.617021,0.568874,0.657262,0.73098,0.36071,0.727189,0.799053,0.564456
1,0.070743,0.021378,0.061994,0.060019,0.037819,0.053286,0.081684,0.05198
2,0.070609,0.032197,0.062179,0.060082,0.062503,0.053413,0.081685,0.051936
3,0.403863,0.037803,0.062179,0.060082,0.103243,0.115608,0.176798,0.137246
4,0.605292,0.138089,0.159593,0.177786,0.154392,0.173155,0.78626,0.136866


Final engineered matrices

In [10]:
# Final matrices: frequency, target-encoding and interaction features placed
# side by side, in that order; leftover NaNs become 0.0.
X_train = pd.concat([tr_freq, tr_te, tr_inter], axis="columns")
X_train = X_train.fillna(0.0)

X_test = pd.concat([te_freq, te_te, te_inter], axis="columns")
X_test = X_test.fillna(0.0)

print("X_train:", X_train.shape, "X_test:", X_test.shape)


X_train: (630000, 34) X_test: (270000, 34)


CatBoost GPU CV on engineered features (like the Kaggle approach)

In [11]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier, Pool
import numpy as np
import time

# CatBoost settings: shallow trees (depth 2), small learning rate, Bernoulli
# row subsampling, balanced class weights, trained on GPU device 0.
# The recorded run logs that AUC is not implemented for GPU and that the
# metric period defaults to 5.
cat_params = dict(
    iterations=10000,
    learning_rate=0.01,
    depth=2,
    loss_function="Logloss",
    eval_metric="AUC",
    auto_class_weights="Balanced",
    subsample=0.65,
    l2_leaf_reg=12,
    random_strength=1.2,
    bootstrap_type="Bernoulli",
    od_type="Iter",
    od_wait=300,
    task_type="GPU",
    devices="0",
    verbose=False,
    allow_writing_files=False,
)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold predictions for train and fold-averaged predictions for test.
oof = np.zeros(len(X_train), dtype=float)
test_pred = np.zeros(len(X_test), dtype=float)
scores, best_iters = [], []

t0 = time.time()

# The test matrix is identical for every fold, so wrap it in a Pool once
# instead of rebuilding it inside the loop.
test_pool = Pool(X_test)

for fold, (tr_idx, va_idx) in enumerate(skf.split(X_train, y), start=1):
    X_tr, X_va = X_train.iloc[tr_idx], X_train.iloc[va_idx]
    y_tr, y_va = y[tr_idx], y[va_idx]

    tr_pool = Pool(X_tr, y_tr)
    va_pool = Pool(X_va, y_va)

    # A different seed per fold (43..47) varies the model's randomness.
    model = CatBoostClassifier(**cat_params, random_seed=42 + fold)
    # use_best_model keeps the iteration that scored best on the eval set.
    model.fit(tr_pool, eval_set=va_pool, use_best_model=True)

    # Column 1 of predict_proba is the positive-class probability.
    p_va = model.predict_proba(va_pool)[:, 1]
    oof[va_idx] = p_va

    auc = roc_auc_score(y_va, p_va)
    scores.append(auc)

    bi = model.get_best_iteration()
    best_iters.append(bi)

    # Each fold contributes 1/n_splits of the final test prediction.
    test_pred += model.predict_proba(test_pool)[:, 1] / skf.n_splits

    print(f"Fold {fold} AUC: {auc:.5f} | best_iter: {bi}")

t1 = time.time()

# Mean of per-fold AUCs vs. a single AUC over all out-of-fold predictions.
print("Mean CV AUC:", round(float(np.mean(scores)), 5))
print("OOF  AUC:", round(float(roc_auc_score(y, oof)), 5))
print("Avg best_iter:", int(np.mean(best_iters)))
print("Runtime (min):", round((t1 - t0)/60, 2))


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 1 AUC: 0.95607 | best_iter: 9382


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 2 AUC: 0.95485 | best_iter: 6503


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 3 AUC: 0.95559 | best_iter: 8824


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 4 AUC: 0.95535 | best_iter: 7670


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 5 AUC: 0.95622 | best_iter: 8061
Mean CV AUC: 0.95562
OOF  AUC: 0.95561
Avg best_iter: 8088
Runtime (min): 3.22


Save + submission

In [12]:
import pandas as pd
import numpy as np

# Persist the raw CatBoost predictions (out-of-fold and test) as .npy files.
np.save(REPORTS_DIR.joinpath("oof_catboost_tefreq_inter.npy"), oof)
np.save(REPORTS_DIR.joinpath("test_catboost_tefreq_inter.npy"), test_pred)

# Submission frame: test ids next to the fold-averaged test predictions.
out = pd.DataFrame(
    {
        id_col: test[id_col],
        target_col: test_pred,
    }
)
out_path = REPORTS_DIR.joinpath("sub_catboost_tefreq_inter.csv")
out.to_csv(out_path, index=False)

print("Saved:", out_path)
out.head()


Saved: C:\Dev\kaggle-ps-s6e2-heart\reports\sub_catboost_tefreq_inter.csv


Unnamed: 0,id,Heart Disease
0,630000,0.955289
1,630001,0.008726
2,630002,0.992502
3,630003,0.004323
4,630004,0.238378
