In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import average_precision_score
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
from tqdm import tqdm
import joblib

# =========================================================
# Data loading and preprocessing
# =========================================================
train = pd.read_csv(r"D:\toss\train.csv")

# Fill the demographic columns through the DataFrame itself. The chained form
# `train[col].fillna(value, inplace=True)` works on a temporary Series; pandas
# warns about it and, under copy-on-write, it no longer updates `train`, which
# would let the blanket fill below encode these NaNs as 0 instead of 2 / 1.
train.fillna({"gender": 2, "age_group": 1}, inplace=True)
train.drop(columns=["l_feat_17"], inplace=True)
# Every remaining missing value, in every column, becomes 0.
train.fillna(0, inplace=True)

# One-hot encode the categorical columns. No separate "UNK" fill is done here:
# the blanket fillna(0) above has already removed every NaN, so such a fill
# could never match anything (missing categories are encoded as 0).
cat_cols = ['gender', 'age_group', 'inventory_id', 'day_of_week', 'hour']
train = pd.get_dummies(train, columns=cat_cols)

# Everything except the identifier, the label and the sequence columns is
# used as a model feature.
feature_cols = [c for c in train.columns if c not in ["ID", "clicked", "seq", "seq_list"]]
target_col = "clicked"

X = train[feature_cols]
y = train[target_col].values

# =========================================================
# Custom metrics
# =========================================================
def weighted_logloss(y_true, y_pred):
    """Return the class-balanced binary log loss.

    Each class is weighted by ``0.5 / (fraction of samples in that class)``,
    so positives and negatives each contribute half of the total loss
    regardless of class imbalance.

    Args:
        y_true: Array-like of 0/1 labels.
        y_pred: Array-like of predicted probabilities for the positive class,
            same length as ``y_true``.

    Returns:
        The weighted log loss as a float. If one class is absent from
        ``y_true`` its term contributes 0 instead of producing NaN through a
        division by zero.
    """
    eps = 1e-15
    # Accept plain lists as well as arrays.
    y_true = np.asarray(y_true, dtype=float)
    # Keep probabilities strictly inside (0, 1) so the logs stay finite.
    y_pred = np.clip(np.asarray(y_pred, dtype=float), eps, 1 - eps)
    pos_frac = y_true.mean()
    neg_frac = 1 - pos_frac
    # A class with no samples gets weight 0 rather than 0.5 / 0 = inf, which
    # would turn the whole result into NaN (inf * 0).
    w1 = 0.5 / pos_frac if pos_frac > 0 else 0.0
    w0 = 0.5 / neg_frac if neg_frac > 0 else 0.0
    return -np.mean(w1 * y_true * np.log(y_pred) + w0 * (1 - y_true) * np.log(1 - y_pred))

def evaluate_score(y_true, y_pred):
    """Compute the evaluation metrics for one set of predictions.

    Returns:
        A tuple ``(ap, wll, score)`` where ``ap`` is the value of
        ``average_precision_score``, ``wll`` is the value of
        ``weighted_logloss`` and ``score`` is
        ``0.5 * ap + 0.5 * (1 / (1 + wll))``.
    """
    avg_precision = average_precision_score(y_true, y_pred)
    balanced_loss = weighted_logloss(y_true, y_pred)
    # Blend the two metrics; the loss is mapped into (0, 1] first.
    blended = 0.5 * avg_precision + 0.5 * (1 / (1 + balanced_loss))
    return avg_precision, balanced_loss, blended

# =========================================================
# XGBoost parameters
# =========================================================
params = {
    "objective": "binary:logistic",
    # Metric reported on the validation set and used for early stopping.
    "eval_metric": "auc",
    # "gpu_hist" is deprecated in XGBoost 2.x and emits a warning on every
    # train call; the supported spelling is the "hist" tree method plus an
    # explicit CUDA device.
    "tree_method": "hist",
    "device": "cuda",
    "learning_rate": 0.010518720765916168,
    "max_depth": 10,
    "subsample": 0.883115119152337,
    "colsample_bytree": 0.6134018590653787,
    "lambda": 0.01941490269309615,
    "alpha": 0.6732668631963976,
    "min_child_weight": 6.9674958503622655,
    "seed": 42
}

# =========================================================
# Stratified K-fold training
# =========================================================
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# Out-of-fold predictions: each row is predicted by the fold model that did
# not see it during training.
oof_preds = np.zeros(len(train))
fold_scores = []

for fold, (tr_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f"\nüöÄ Fold {fold+1}/{kf.n_splits}")

    dtrain = xgb.DMatrix(X.iloc[tr_idx], label=y[tr_idx])
    dvalid = xgb.DMatrix(X.iloc[val_idx], label=y[val_idx])

    model = xgb.train(
        params,
        dtrain,
        num_boost_round=1500,
        evals=[(dvalid, "valid")],
        early_stopping_rounds=50,
        verbose_eval=False
    )

    # xgb.train returns the booster from the last round, not the best one, so
    # a plain predict() would also use the rounds trained after the validation
    # metric stopped improving. Restrict prediction to the best iteration.
    val_preds = model.predict(
        dvalid, iteration_range=(0, model.best_iteration + 1)
    )
    oof_preds[val_idx] = val_preds

    ap, wll, score = evaluate_score(y[val_idx], val_preds)
    print(f"‚úÖ Fold {fold+1}: AP={ap:.5f}, WLL={wll:.5f}, Score={score:.5f}")
    fold_scores.append(score)

print("\n=======================================")
print(f"üéØ ÌèâÍ∑† Score: {np.mean(fold_scores):.5f}")
print("=======================================")

# =========================================================
# Full-data training and model export
# =========================================================
dtrain_full = xgb.DMatrix(X, label=y)
# NOTE(review): trains for a fixed 1500 rounds with no validation set or
# early stopping — confirm this round count is intended for the final model.
final_model = xgb.train(params, dtrain_full, num_boost_round=1500)
final_model.save_model("xgb_model.json")

# Persist the feature column order alongside the model.
joblib.dump(feature_cols, "xgb_feature_cols.pkl")
# The message names the files actually written above; it previously reported
# "model.json, feature_cols.pkl", which do not match the saved paths.
print("‚úÖ Î™®Îç∏ Î∞è feature Ï†ÄÏû• ÏôÑÎ£å: xgb_model.json, xgb_feature_cols.pkl")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['gender'].fillna(2, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['age_group'].fillna(1, inplace=True)



üöÄ Fold 1/10



    E.g. tree_method = "hist", device = "cuda"

  self.starting_round = model.num_boosted_rounds()

    E.g. tree_method = "hist", device = "cuda"

  return func(**kwargs)


‚úÖ Fold 1: AP=0.61045, WLL=0.64488, Score=0.60920

üöÄ Fold 2/10



    E.g. tree_method = "hist", device = "cuda"

  self.starting_round = model.num_boosted_rounds()

    E.g. tree_method = "hist", device = "cuda"

  return func(**kwargs)


‚úÖ Fold 2: AP=0.61159, WLL=0.64433, Score=0.60987

üöÄ Fold 3/10



    E.g. tree_method = "hist", device = "cuda"

  self.starting_round = model.num_boosted_rounds()

    E.g. tree_method = "hist", device = "cuda"

  return func(**kwargs)


‚úÖ Fold 3: AP=0.61430, WLL=0.64398, Score=0.61129

üöÄ Fold 4/10



    E.g. tree_method = "hist", device = "cuda"

  self.starting_round = model.num_boosted_rounds()

    E.g. tree_method = "hist", device = "cuda"

  return func(**kwargs)


‚úÖ Fold 4: AP=0.61631, WLL=0.64046, Score=0.61295

üöÄ Fold 5/10



    E.g. tree_method = "hist", device = "cuda"

  self.starting_round = model.num_boosted_rounds()

    E.g. tree_method = "hist", device = "cuda"

  return func(**kwargs)


‚úÖ Fold 5: AP=0.61318, WLL=0.64342, Score=0.61083

üöÄ Fold 6/10



    E.g. tree_method = "hist", device = "cuda"

  self.starting_round = model.num_boosted_rounds()

    E.g. tree_method = "hist", device = "cuda"

  return func(**kwargs)


‚úÖ Fold 6: AP=0.61169, WLL=0.64371, Score=0.61004

üöÄ Fold 7/10



    E.g. tree_method = "hist", device = "cuda"

  self.starting_round = model.num_boosted_rounds()

    E.g. tree_method = "hist", device = "cuda"

  return func(**kwargs)


‚úÖ Fold 7: AP=0.61155, WLL=0.64391, Score=0.60993

üöÄ Fold 8/10



    E.g. tree_method = "hist", device = "cuda"

  self.starting_round = model.num_boosted_rounds()

    E.g. tree_method = "hist", device = "cuda"

  return func(**kwargs)


‚úÖ Fold 8: AP=0.61468, WLL=0.64196, Score=0.61185

üöÄ Fold 9/10



    E.g. tree_method = "hist", device = "cuda"

  self.starting_round = model.num_boosted_rounds()

    E.g. tree_method = "hist", device = "cuda"

  return func(**kwargs)


‚úÖ Fold 9: AP=0.61441, WLL=0.64215, Score=0.61168

üöÄ Fold 10/10



    E.g. tree_method = "hist", device = "cuda"

  self.starting_round = model.num_boosted_rounds()

    E.g. tree_method = "hist", device = "cuda"

  return func(**kwargs)


‚úÖ Fold 10: AP=0.61095, WLL=0.64522, Score=0.60939

üéØ ÌèâÍ∑† Score: 0.61070



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  final_model.save_model("model.json")


‚úÖ Î™®Îç∏ Î∞è feature Ï†ÄÏû• ÏôÑÎ£å: model.json, feature_cols.pkl


: 