Colab environment인 경우 아래 코드를 실행 (Google Drive를 마운트하고 working directory를 `src`로 변경)

In [1]:
# Colab only: mount Google Drive so the project folder becomes reachable.
from google.colab import drive
drive.mount('/content/drive')

# Switch to the project's `src` directory; later cells use paths relative to
# it (e.g. `../requirements.txt`, `../data`).
import os
os.chdir('/content/drive/MyDrive/2026PredictingBDACompletion/src')

Mounted at /content/drive


Local environment의 경우, 노트북 안에서는 `%cd` 매직 명령어나 `os.chdir()`를 사용하여 working directory 조정 (`!cd`는 커널의 working directory를 바꾸지 않음)

최종적으로, 다음 코드 조각의 출력이 이 파일이 속한 디렉토리(`src`)를 가리켜야 함

In [2]:
# `os` is imported here as well so this check also works on a local run where
# the Colab-only mount cell (the only other place that imports it before this
# point) has been skipped.
import os

# Should print the directory that contains this notebook (the `src` folder).
print(os.getcwd())

/content/drive/MyDrive/2026PredictingBDACompletion/src


In [3]:
!pip install -r ../requirements.txt

Collecting catboost==1.2.8 (from -r ../requirements.txt (line 5))
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting torch==2.9.0 (from -r ../requirements.txt (line 6))
  Downloading torch-2.9.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (30 kB)
Collecting optuna==4.6.0 (from -r ../requirements.txt (line 7))
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting tqdm==4.67.1 (from -r ../requirements.txt (line 8))
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.7/57.7 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-nvshmem-cu12==3.3.20 (from torch==2.9.0->-r ../requirements.txt (line 6))
  Downloading nvidia_nvshmem_cu12-3.3.20-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.1 kB)
Collecting triton==3.5.0 (from torch==2.9.0->-r ../requirements.txt (line 6))
  Downloading triton-3.5.0-cp312-cp312-manylinux_2_27

In [4]:
import json
import os
import random
import sys
from datetime import datetime
from pathlib import Path
import numpy as np
import pandas as pd
import optuna
from optuna.samplers import TPESampler
from sklearn.metrics import f1_score, log_loss
from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold
import xgboost as xgb
import torch

# =======================
# CONFIG
# =======================
# Paths are relative to the current working directory; the notebook expects
# that to be the project's `src` folder, so ".." is the project root.
BASE_DIR = Path("..")
DATA_DIR = BASE_DIR / "data"  # main() reads train.csv, test.csv and sample_submission.csv here
OUTPUT_DIR = BASE_DIR / "outputs"
OUTPUT_DIR.mkdir(exist_ok=True)  # side effect when this cell runs: creates the folder if missing
OUTPUT_PATH = OUTPUT_DIR / "submission_xgboost.csv"  # where main() writes the submission

GLOBAL_SEED = 42  # seeds main() and the Optuna TPE sampler

TARGET_COL = "completed"  # label column; cast to int and modelled with binary:logistic
ID_COL = "ID"  # row identifier; dropped from the features, kept in the submission

# Columns whose values define the CV groups (rows sharing a key stay in the same fold).
GROUP_COLS = ["school1"]
GROUP_KEY_NAME = "group__school1"  # passed to build_group_key() as `key_name` by main()

# NOTE(review): not referenced anywhere in the visible code — confirm whether it is still needed.
FORCE_CAT_COLS = ["school1", "class1", "class2", "class3", "class4"]

# Free-text columns; encode_categories() replaces each one with a `<col>_len` length feature.
MANUAL_TEXT_COLS = [
    "whyBDA", "what_to_gain", "incumbents_lecture",
    "certificate_acquisition", "incumbents_lecture_scale_reason", "onedayclass_topic"
]

N_SPLITS = 5  # number of StratifiedGroupKFold folds (tuning and final training)

# 'Tune' runs the Optuna search; 'Load' reuses the hyperparameters and the
# decision threshold hard-coded in main(). Any other value raises ValueError.
MODE = 'Load'
OPTUNA_SEED = 42  # CV split seed and model random_state inside objective()
OPTUNA_TRIALS = 100  # number of Optuna trials in 'Tune' mode
# One full cross-validation run per seed; run_training() averages the results.
FINAL_TRAIN_SEEDS = [42, 142, 242, 342, 442, 542, 642, 742, 842, 942]

# =======================
# REPRODUCIBILITY
# =======================
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global RNG and PyTorch (CPU, plus
    all CUDA devices when CUDA is available), switches cuDNN to deterministic
    mode with autotuning disabled, and sets the ``PYTHONHASHSEED`` and
    ``CUBLAS_WORKSPACE_CONFIG`` environment variables.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Environment variable first, then the Python / NumPy generators.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

    # PyTorch: CPU generator always, CUDA generators only when present.
    torch.manual_seed(seed)
    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    # Deterministic cuDNN kernels, no benchmark-based algorithm selection.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'

# =======================
# UTILS
# =======================
def find_best_threshold(y_true, prob, step=0.001):
    """Grid-search the probability cut-off that maximises the F1 score.

    Candidate thresholds are ``np.arange(0.1, 0.9, step)``. A candidate only
    replaces the current best when its F1 is strictly higher, so the lowest
    threshold wins ties.

    Args:
        y_true: Ground-truth binary labels.
        prob: Predicted positive-class probabilities; must support ``>=``
            against a scalar and ``.astype`` (e.g. a NumPy array).
        step: Spacing between candidate thresholds.

    Returns:
        ``(threshold, f1)`` as plain floats; ``(0.5, -1.0)`` if the grid is empty.
    """
    top_f1 = -1.0
    top_threshold = 0.5
    candidates = np.arange(0.1, 0.9, step)
    for threshold in candidates:
        labels = (prob >= threshold).astype(int)
        score = f1_score(y_true, labels)
        if score > top_f1:
            top_threshold, top_f1 = threshold, score
    return float(top_threshold), float(top_f1)

def make_submission_df(sample_sub, test_df, preds, id_col, target_col):
    """Assemble the two-column submission frame.

    Args:
        sample_sub: Submission template; its ``id_col`` is used when present.
        test_df: Test frame; supplies ``id_col`` when the template lacks it.
        preds: Predictions assigned to ``target_col``.
        id_col: Name of the identifier column.
        target_col: Name of the prediction column.

    Returns:
        A new DataFrame with columns ``[id_col, target_col]``.
    """
    # Prefer the template's IDs; fall back to the test set's.
    id_source = sample_sub if id_col in sample_sub.columns else test_df
    submission = pd.DataFrame({id_col: id_source[id_col].copy()})
    submission[target_col] = preds
    return submission

def build_group_key(df: pd.DataFrame, cols, key_name: str) -> pd.Series:
    """Build one string key per row by joining the values of ``cols``.

    Each column is cast to pandas' ``string`` dtype, missing values become
    ``"MISSING"``, and the parts are joined with ``"__"``.

    Args:
        df: Source frame.
        cols: Non-empty sequence of column names to combine, in order.
        key_name: Name given to the returned Series (previously this argument
            was accepted but ignored).

    Returns:
        A string Series aligned with ``df.index`` and named ``key_name``.

    Raises:
        ValueError: If ``cols`` is empty.
        KeyError: If a column in ``cols`` is missing from ``df``.
    """
    if len(cols) == 0:
        raise ValueError("build_group_key requires at least one column in `cols`")

    parts = [df[c].astype("string").fillna("MISSING") for c in cols]
    key = parts[0]
    for part in parts[1:]:
        key = key + "__" + part
    # A scalar passed to rename() sets Series.name and leaves the index alone.
    return key.rename(key_name)

# =======================
# FE & ENCODING
# =======================
def apply_fe(df):
    """Append hand-crafted indicator/count features to ``df`` and return it.

    The frame is modified in place. Each feature group is added only when its
    source column exists:

    * ``major_field`` -> ``is_it`` (case-insensitive keyword match).
    * ``certificate_acquisition`` -> ``cert_count`` (comma-separated items),
      ``has_adsp``, ``has_sqld``.
    * ``desired_job_except_data`` -> ``want_uiux``, ``want_pm``.
    * ``school1`` -> ``is_school_0`` (1 where the value equals 0).

    Args:
        df: Frame to enrich.

    Returns:
        The same ``df`` object with the new columns appended.
    """
    def _keyword_flag(text, pattern):
        # Case-insensitive regex search encoded as 0/1.
        return text.str.contains(pattern, case=False).astype(int)

    def _count_listed(entry):
        # N commas -> N + 1 listed items; an empty entry counts as zero.
        return 0 if entry == "" else entry.count(',') + 1

    # 1. IT-related major
    if 'major_field' in df.columns:
        it_pattern = 'IT|정보|컴퓨터|소프트|인공지능|AI|데이터|SW|ICT'
        major_text = df['major_field'].astype(str)
        df['is_it'] = major_text.str.contains(it_pattern, case=False, na=False).astype(int)

    # 2. Certificates
    if 'certificate_acquisition' in df.columns:
        cert_text = df['certificate_acquisition'].fillna("")
        df['cert_count'] = cert_text.apply(_count_listed)
        df['has_adsp'] = _keyword_flag(cert_text, 'ADsP')
        df['has_sqld'] = _keyword_flag(cert_text, 'SQLD')

    # 3. Desired-job keywords
    if 'desired_job_except_data' in df.columns:
        job_text = df['desired_job_except_data'].fillna("")
        df['want_uiux'] = _keyword_flag(job_text, 'UI|UX')
        df['want_pm'] = _keyword_flag(job_text, 'PM|기획')

    # 4. School code 0
    if 'school1' in df.columns:
        school_is_zero = df['school1'] == 0
        df['is_school_0'] = school_is_zero.astype(int)

    return df

def clean_lecture(train, test):
    """Collapse ``incumbents_lecture`` to the three most frequent train values.

    The three most common non-null values are taken from ``train`` only. In
    both frames every other value, including missing values, is replaced by
    the most frequent one. Both frames are modified in place.

    Nothing is changed when ``train`` lacks the column or the column has no
    non-null values (the original raised ``IndexError`` in the latter case).
    ``test`` is left untouched when it lacks the column (the original raised
    ``KeyError``).

    Args:
        train: Training frame; defines the allowed values.
        test: Test frame; cleaned with the values learned from ``train``.

    Returns:
        The ``(train, test)`` pair.
    """
    col = 'incumbents_lecture'
    if col not in train.columns:
        return train, test

    top3 = train[col].value_counts().nlargest(3).index.tolist()
    if not top3:
        # No non-null values, so there is no mode to fall back on.
        return train, test

    fallback = top3[0]
    # Keep values already in the top three; everything else becomes the mode.
    train[col] = train[col].where(train[col].isin(top3), fallback)
    if col in test.columns:
        test[col] = test[col].where(test[col].isin(top3), fallback)
    return train, test

def encode_categories(train, test):
    """Turn text and object columns into integer features.

    1. Each column in ``MANUAL_TEXT_COLS`` that exists in ``train`` is replaced
       by a ``<col>_len`` column holding the string length (missing -> 0).
    2. Each remaining ``object``-dtype column of ``train`` is integer-coded
       with a mapping built from the train values only, in order of first
       appearance. Missing values are coded as the string ``"MISSING"``.
       Test values absent from the mapping become ``-1``.

    Args:
        train: Training features.
        test: Test features; must contain the same columns as ``train``.

    Returns:
        The encoded ``(train, test)`` pair.
    """
    # 1. Text columns -> length features
    for name in MANUAL_TEXT_COLS:
        if name not in train.columns:
            continue
        length_col = f'{name}_len'
        train[length_col] = train[name].fillna("").apply(len)
        test[length_col] = test[name].fillna("").apply(len)
        train = train.drop(columns=[name])
        test = test.drop(columns=[name])

    # 2. Object columns -> integer codes learned from train only
    object_cols = [name for name in train.columns if train[name].dtype == 'object']

    for name in object_cols:
        # Treat missing values as their own category.
        train_text = train[name].fillna("MISSING").astype(str)
        test_text = test[name].fillna("MISSING").astype(str)

        # Code = position of first appearance in the train column.
        seen = train_text.unique()
        codes = dict(zip(seen, range(len(seen))))

        train[name] = train_text.map(codes).astype(int)
        # Values never seen in train get the sentinel -1.
        test[name] = test_text.apply(lambda value: codes.get(value, -1)).astype(int)

    return train, test

# =======================
# OPTUNA
# =======================
def objective(trial, X, y, groups):
    """Optuna objective: mean validation log-loss of XGBoost over grouped CV.

    Samples one hyperparameter set from ``trial``, then trains one
    ``XGBClassifier`` per ``StratifiedGroupKFold`` fold (``N_SPLITS`` folds,
    seeded with ``OPTUNA_SEED``), early-stopping on that fold's validation data.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        X: Feature frame, indexed positionally via ``.iloc``.
        y: Binary target Series aligned with ``X``.
        groups: Group labels aligned with ``X``; a group is never split across folds.

    Returns:
        Mean log-loss across folds (lower is better).
    """
    # The suggestion order is kept fixed so that seeded studies stay reproducible.
    search_space = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.0, 5.0),
        'subsample': trial.suggest_float('subsample', 0.5, 0.9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.9),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 10.0, log=True),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.5, 4.0),
    }
    fixed_settings = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'tree_method': 'hist',
        'device': 'cuda',
        'early_stopping_rounds': 50,
        'random_state': OPTUNA_SEED,
        'verbosity': 0,
    }
    params = {**search_space, **fixed_settings}

    seed_everything(OPTUNA_SEED)
    folds = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=OPTUNA_SEED)

    fold_losses = []
    for train_idx, valid_idx in folds.split(X, y, groups=groups):
        X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

        clf = xgb.XGBClassifier(**params)
        clf.fit(
            X.iloc[train_idx], y.iloc[train_idx],
            eval_set=[(X_valid, y_valid)],
            verbose=False
        )

        valid_proba = clf.predict_proba(X_valid)[:, 1]
        fold_losses.append(log_loss(y_valid, valid_proba))

    return np.mean(fold_losses)

# =======================
# FINAL TRAIN
# =======================
def run_training(best_params, X, y, groups, X_test):
    """Train the final XGBoost seed ensemble and return averaged predictions.

    For every seed in ``FINAL_TRAIN_SEEDS`` a fresh ``StratifiedGroupKFold``
    split is drawn and one model is trained per fold with early stopping on
    that fold's validation data. Each seed yields one out-of-fold prediction
    vector and one fold-averaged test prediction vector; both are then
    averaged across seeds. The best F1 and threshold on each seed's
    out-of-fold predictions are printed.

    Args:
        best_params: Tuned hyperparameters. Not modified; the fixed training
            settings and the per-seed ``random_state`` go onto an internal
            copy (the original mutated the caller's dict).
        X: Training features, indexed positionally via ``.iloc``.
        y: Binary target Series aligned with ``X``.
        groups: Group labels aligned with ``X``.
        X_test: Test features with the same columns as ``X``.

    Returns:
        ``(ens_oof, ens_test)``: seed-averaged out-of-fold probabilities for
        the training rows and seed-averaged probabilities for the test rows.
    """
    print("[Train] XGBoost with Seed Ensemble")

    # Copy first so the caller's dictionary keeps only the tuned values.
    params = dict(best_params)
    params.update({
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'tree_method': 'hist',
        'device': 'cuda',
        'verbosity': 0,
        'early_stopping_rounds': 100
    })

    oof_preds = []
    test_preds = []

    for seed in FINAL_TRAIN_SEEDS:
        seed_everything(seed)
        params['random_state'] = seed

        oof = np.zeros(len(X))
        test_p = []

        # The seed drives both the fold assignment and the model's randomness.
        cv = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=seed)

        for tr, va in cv.split(X, y, groups=groups):
            model = xgb.XGBClassifier(**params)
            model.fit(
                X.iloc[tr], y.iloc[tr],
                eval_set=[(X.iloc[va], y.iloc[va])],
                verbose=False
            )

            oof[va] = model.predict_proba(X.iloc[va])[:, 1]
            test_p.append(model.predict_proba(X_test)[:, 1])

        oof_preds.append(oof)
        test_preds.append(np.mean(test_p, axis=0))

        best_t, best_f1 = find_best_threshold(y, oof, step=0.001)
        print(f"[Seed {seed}] Best F1: {best_f1:.4f} @ {best_t:.3f}")

    ens_oof = np.mean(oof_preds, axis=0)
    ens_test = np.mean(test_preds, axis=0)

    return ens_oof, ens_test

# =======================
# MAIN
# =======================
def _read_csv_or_raise(path, label):
    """Read ``path`` as a UTF-8 (BOM-tolerant) CSV, failing with a clear message.

    Args:
        path: ``pathlib.Path`` of the CSV file.
        label: Human-readable name used in the error message.

    Returns:
        The loaded DataFrame.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        RuntimeError: If pandas fails to parse the file; the original error is
            chained as ``__cause__``.
    """
    if not path.exists():
        raise FileNotFoundError(f"[Error] Dataset not found at {path}")
    try:
        return pd.read_csv(path, encoding="utf-8-sig")
    except Exception as exc:
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
        raise RuntimeError(f"[Error] {label} loading failed from {path}") from exc


def main():
    """Run the pipeline: load data, build features, tune or load params, train, save.

    Steps:
        1. Load ``train.csv``, ``test.csv`` and ``sample_submission.csv`` from ``DATA_DIR``.
        2. Build CV group keys, apply feature engineering and lecture clean-up.
        3. Drop ID/target/unused columns, add ``nan_count`` and ``log_time_input``,
           and integer-encode text and object columns.
        4. ``MODE == 'Tune'``: search hyperparameters with Optuna and pick the
           threshold from the ensemble's out-of-fold predictions.
           ``MODE == 'Load'``: use the hard-coded hyperparameters and threshold.
        5. Train the seed ensemble and write the binary submission to ``OUTPUT_PATH``.

    Raises:
        FileNotFoundError: If an input CSV is missing.
        RuntimeError: If an input CSV cannot be parsed.
        ValueError: If ``MODE`` is neither ``'Tune'`` nor ``'Load'``.
    """
    seed_everything(GLOBAL_SEED)

    print(f"[Dataset] Trying to load all files from {DATA_DIR}")

    train = _read_csv_or_raise(DATA_DIR / "train.csv", "Train dataset")
    test = _read_csv_or_raise(DATA_DIR / "test.csv", "Test dataset")
    sub = _read_csv_or_raise(DATA_DIR / "sample_submission.csv", "Submission template")

    print(f"[Dataset] All data loaded successfully from {DATA_DIR}")

    # Group keys are taken from the raw train frame, before any feature engineering.
    groups = build_group_key(train, GROUP_COLS, GROUP_KEY_NAME)

    print("\n[Pre-Processing] Applying FE...")
    train = apply_fe(train)
    test = apply_fe(test)
    train, test = clean_lecture(train, test)

    # Drop
    drops = [ID_COL, 'major_field', 'generation', 'incumbents_level', TARGET_COL]
    y = train[TARGET_COL].astype(int)
    X = train.drop(columns=drops, errors='ignore')

    X_test = test.drop(columns=drops, errors='ignore')
    # Align test to the train columns (same order); columns absent from test become NaN.
    X_test = X_test.reindex(columns=X.columns, fill_value=np.nan)

    # Nan Count & Log Time
    X['nan_count'] = X.isnull().sum(axis=1)
    X_test['nan_count'] = X_test.isnull().sum(axis=1)
    if 'time_input' in X.columns:
        X['log_time_input'] = np.log1p(pd.to_numeric(X['time_input'], errors='coerce').fillna(0))
        X_test['log_time_input'] = np.log1p(pd.to_numeric(X_test['time_input'], errors='coerce').fillna(0))

    # Encoding
    print("[Encoding] Fitting LE on Train ONLY...")
    X, X_test = encode_categories(X, X_test)

    print(f"[Pre-Processing] {X.shape[1]} features prepared.")

    print(f"\n[Info] Current Mode is {MODE}")

    if MODE == 'Tune':
        # Optuna
        print("\n[Optuna] Tuning XGBoost (GPU)...")
        optuna.logging.set_verbosity(optuna.logging.WARNING)
        study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=GLOBAL_SEED))
        study.optimize(lambda trial: objective(trial, X, y, groups), n_trials=OPTUNA_TRIALS)

        print(f"[Optuna] Best Logloss: {study.best_value:.4f}")
        print(f"[Optuna] Best Params: {study.best_params}")
        best_params = study.best_params
    elif MODE == 'Load':
        # Train with previous best hyperparameters
        best_params = {'n_estimators': 1455, 'learning_rate': 0.01581233636756958, 'max_depth': 4, 'min_child_weight': 8, 'gamma': 1.8554367882106972, 'subsample': 0.5178143107493339, 'colsample_bytree': 0.7705593029127545, 'reg_alpha': 0.15001414550285752, 'reg_lambda': 0.11106665752617169, 'scale_pos_weight': 1.5086109724471912}
        print(f"\n[Load] Load Previous Best Params: {best_params}")
    else:
        raise ValueError(f'[Error] Invalid MODE (Must be "Tune" or "Load"): {MODE}')

    ens_oof, ens_test = run_training(best_params, X, y, groups, X_test)

    # Thresholding
    if MODE == 'Tune':
        best_t, best_f1 = find_best_threshold(y, ens_oof, step=0.001)
        print(f"\n[Post-Processing] OOF F1: {best_f1:.4f} (Thresh: {best_t:.3f})")
    else:
        # Only 'Load' can reach this branch; any other MODE was rejected above.
        # NOTE(review): 0.351 presumably comes from the run that produced the
        # hard-coded hyperparameters — confirm before changing either.
        best_t = 0.351
        print(f"\n[Post-Processing] Threshold will be override to {best_t}")

    # Submission
    final_pred = (ens_test >= best_t).astype(int)

    out = make_submission_df(sub, test, final_pred, ID_COL, TARGET_COL)
    out.to_csv(OUTPUT_PATH, index=False)

    print(f"\n[Result] Saved: {OUTPUT_PATH}")

# Run the pipeline when this code is executed directly; in a notebook cell
# `__name__` is "__main__", so main() runs as soon as the cell is executed.
if __name__ == "__main__":
    main()

[Dataset] Trying to load all files from ../data
[Dataset] All data loaded successfully from ../data

[Pre-Processing] Applying FE...
[Encoding] Fitting LE on Train ONLY...
[Pre-Processing] 50 features prepared.

[Info] Current Mode is Load

[Load] Load Previous Best Params: {'n_estimators': 1455, 'learning_rate': 0.01581233636756958, 'max_depth': 4, 'min_child_weight': 8, 'gamma': 1.8554367882106972, 'subsample': 0.5178143107493339, 'colsample_bytree': 0.7705593029127545, 'reg_alpha': 0.15001414550285752, 'reg_lambda': 0.11106665752617169, 'scale_pos_weight': 1.5086109724471912}
[Train] XGBoost with Seed Ensemble
[Seed 42] Best F1: 0.4645 @ 0.369
[Seed 142] Best F1: 0.4838 @ 0.307
[Seed 242] Best F1: 0.4766 @ 0.269
[Seed 342] Best F1: 0.4753 @ 0.347
[Seed 442] Best F1: 0.4776 @ 0.281
[Seed 542] Best F1: 0.4625 @ 0.279
[Seed 642] Best F1: 0.4715 @ 0.258
[Seed 742] Best F1: 0.4610 @ 0.218
[Seed 842] Best F1: 0.4809 @ 0.315
[Seed 942] Best F1: 0.4806 @ 0.334

[Post-Processing] Threshold w