In [1]:
%run import_libs.py

### get data

In [2]:
df_train = get_train_data(TRAIN_PATH='./data/train.parquet')
num_features = pd.read_csv("num_feats_after_filtering.csv")["0"].to_list()

df_train_agg = get_df_w_aggrs(df=df_train, feats=num_features)
df_train_target = get_target(TARGET_PATH='./data/train_labels.csv')
df_train = get_train_data_with_target_merged(df_train=df_train_agg, df_train_target=df_train_target)

(458913, 151)
(458913, 151)
(458913, 151)
(458913, 162)
(458913, 616)


In [7]:
df_train.head(3)

Unnamed: 0,customer_ID,P_2_min,P_3_min,P_4_min,D_39_min,D_41_min,D_42_min,D_43_min,D_44_min,D_45_min,...,D_114_last,D_116_last,D_117_last,D_120_last,D_126_last,D_63_last,D_64_last,D_66_last,D_68_last,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.86858,0.581678,0.0,0,0.0,,,0,0.708906,...,1,0,5,0,2,0,0,-1,6,0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0.861109,0.510142,0.0,0,0.0,,0.060646,0,0.239459,...,1,0,0,0,2,3,0,-1,6,0
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,0.79767,0.381123,0.0,0,0.0,,,0,0.222406,...,1,0,0,0,2,3,2,-1,6,0


In [16]:
cat_features = [f"{f}_last" for f in ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']]
cat_features

['B_30_last',
 'B_38_last',
 'D_114_last',
 'D_116_last',
 'D_117_last',
 'D_120_last',
 'D_126_last',
 'D_63_last',
 'D_64_last',
 'D_66_last',
 'D_68_last']

In [17]:
payment_feats = []
delinq_feats = []
spend_feats = []
balance_feats = []
risk_feats = []

for feat in list(df_train):
    if feat in cat_features:
        continue
    
    if feat[0] == 'P':
        #print(feat)
        payment_feats.append(feat)
    elif feat[0] == 'D':
        delinq_feats.append(feat)
    elif feat[0] == 'S':
        spend_feats.append(feat)
    elif feat[0] == 'B':
        balance_feats.append(feat)
    elif feat[0] == 'R':
        risk_feats.append(feat)

In [18]:
len(payment_feats) + len(delinq_feats) + len(spend_feats) + len(balance_feats) + len(risk_feats)

604

In [21]:
num_features = payment_feats + delinq_feats + spend_feats + balance_feats + risk_feats
len(num_features)

604

### Backward feature selection (lgbm)

In [12]:
# ------------------ metric --------------------------------------------------
def get_amex_metric_calculated(y_true, y_pred):
    n_pos = y_true.sum(); n_neg = y_true.size - n_pos
    idx = np.argsort(y_pred)[::-1]; target = y_true[idx]
    weight = 20 - 19 * target; cum_w = (weight / weight.sum()).cumsum()
    d = target[cum_w <= .04].sum() / n_pos
    lor = (target / n_pos).cumsum(); g = ((lor - cum_w) * weight).sum()
    g_max = 10 * n_neg * (1 - 19 / (n_pos + 20 * n_neg))
    return .5 * (g / g_max + d)

def lgb_amex_metric(y_true, y_pred): return ('amex', get_amex_metric_calculated(y_true, y_pred), True)

In [None]:
# ------------------ backward + tuning --------------------------------------
def backward_optuna_lgb(df, y_col, num_feats, cat_feats, group_col,
                        min_feats=20, random_state=42, trials_per_tune=30):

    num, cat = num_feats.copy(), cat_feats.copy()
    hist, step = [], 0
    sgkf = StratifiedGroupKFold(5, shuffle=True, random_state=random_state)

    # --- default hp (will evolve) ---
    best_params = dict(
        learning_rate   = 0.05,
        subsample       = 0.9,
        subsample_freq  = 1,      # make subsample actually active
        colsample_bytree= 0.8,
        objective       = 'binary',
        random_state    = random_state,
        max_depth       = 5,
        num_leaves      = 64,
        n_estimators    = 350,
        verbosity=-1,
    )

    def cv_score(features, params):
        scores = []
        for tr, va in sgkf.split(df[[group_col, y_col]], df[y_col], df[group_col]):
            Xtr, Xva = df.loc[tr, features], df.loc[va, features]
            ytr, yva = df[y_col].iloc[tr].values, df[y_col].iloc[va].values
            mdl = LGBMClassifier(**params); 
            mdl.fit(Xtr, ytr, categorical_feature=[f for f in cat if f in features], verbose=False)
            scores.append(get_amex_metric_calculated(yva, mdl.predict_proba(Xva)[:,1]))
        return float(np.mean(scores))

    def decide_drop(n):
        if n > 250: return max(1, int(n*.10))
        if n > 100: return max(1, int(n*.05))
        if n > 30 : return max(1, int(n*.02))
        return 1

    while True:
        feats = num + cat
        if len(feats) <= min_feats: break
        step += 1

        # ---- optuna tuning occasionally -------------------------------
        if step % 5 == 1:                    # 1,6,11,...
            def objective(trial):
                params = best_params | {
                    'max_depth'  : trial.suggest_int('max_depth', 3, 8),
                    'num_leaves' : trial.suggest_int('num_leaves', 2**4, 2**8),
                    'n_estimators': trial.suggest_int('n_estimators', 200, 500)
                }
                return -cv_score(feats, params)   # Optuna minimises
            study = optuna.create_study(direction='minimize',
                                         sampler=optuna.samplers.TPESampler(seed=random_state),
                                         pruner=optuna.pruners.MedianPruner())
            study.optimize(objective, n_trials=trials_per_tune, show_progress_bar=False)
            best_params |= {k:v for k,v in study.best_params.items()}

        # ---- CV with current params -----------------------------------
        mean_cv = cv_score(feats, best_params)
        ci_lo, ci_hi = stats.t.interval(0.95, 4, loc=mean_cv, scale=0 if np.isnan(mean_cv) else stats.sem([mean_cv]*5))

        # ---- gain importance on full data -----------------------------
        model_full = LGBMClassifier(**best_params)
        model_full.fit(df[feats], df[y_col], categorical_feature=cat, verbose=False)
        gain = model_full.booster_.feature_importance('gain')
        imp_df = pd.DataFrame({'feature':feats,'gain':gain}).query('gain>0').sort_values('gain')

        n_drop = min(decide_drop(len(feats)), len(imp_df))
        drop_list = imp_df.head(n_drop)['feature'].tolist()
        num = [f for f in num if f not in drop_list]
        cat = [f for f in cat if f not in drop_list]

        hist.append({
            'step': step,
            'n_feats_left': len(feats),
            'n_num_left': len(num),
            'n_cat_left': len(cat),
            'cv_mean': mean_cv,
            'cv_ci_lo': ci_lo,
            'cv_ci_hi': ci_hi,
            'dropped': drop_list,
            'params': best_params.copy()
        })

    log_df = pd.DataFrame(hist)
    final_feats = num + cat
    final_model = LGBMClassifier(**best_params).fit(df[final_feats], df[y_col],
                                                       categorical_feature=cat, verbose=False)
    return log_df, {'num': num, 'cat': cat}, final_model


In [23]:
len(num_features), len(cat_features)

(604, 11)

In [27]:
log, remaining, model = backward_optuna_lgb(
        df=df_train,
        y_col='target',
        num_feats=num_features,
        cat_feats=cat_features,
        group_col='customer_ID',
        trials_per_tune=40)   # raise if you want deeper search

print(log[['step','n_feats_left','cv_mean','dropped','params']].head())
print("Left:", len(remaining['num']), "num &", len(remaining['cat']), "cat")

[I 2025-05-04 13:42:21,090] A new study created in memory with name: no-name-fd9ad5d3-e5e5-44ee-8858-7fc79da99e53
[W 2025-05-04 13:43:22,613] Trial 0 failed with parameters: {'max_depth': 5, 'num_leaves': 245, 'n_estimators': 420} because of the following error: TypeError("LGBMClassifier.fit() got an unexpected keyword argument 'verbose'").
Traceback (most recent call last):
  File "c:\Users\oobur\Projects\spbu_master\sem4\vkr\.venv\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\oobur\AppData\Local\Temp\ipykernel_8292\2593188712.py", line 51, in objective
    return -cv_score(feats, params)   # Optuna minimises
            ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\oobur\AppData\Local\Temp\ipykernel_8292\2593188712.py", line 28, in cv_score
    mdl.fit(Xtr, ytr, categorical_feature=[f for f in cat if f in features], verbose=False)
TypeError: LGBMClassifier.fit() got an unexpected k

TypeError: LGBMClassifier.fit() got an unexpected keyword argument 'verbose'