In [2]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [3]:
# Mount Google Drive at /content/drive; every path used by the cells below
# (feature pickles, saved models, prediction outputs) lives under that mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# -*- coding: utf-8 -*-
# cat_infer_minimal.py — CatBoost 추론 (훈련과 동일 컬럼/순서만 맞춤, 변환 없음)

import os, pickle, numpy as np, pandas as pd
from datetime import datetime
from catboost import CatBoostClassifier

# ===== Paths =====
FEATURE_SAVE_DIR = "/content/drive/Othercomputers/내 Mac/Python/데이콘/현재 진행중/토스/outputs_test/features_postTE"
TRAIN_FEATURES_PKL = os.path.join(FEATURE_SAVE_DIR, "train_features_2.pkl")   # training features
TEST_FEATURES_PKL  = os.path.join(FEATURE_SAVE_DIR, "test_features_2.pkl")    # test features

# CAT_DIR is scanned for *.cbm model files; OUT_DIR receives the prediction files.
CAT_DIR = "/content/drive/MyDrive/open/models/catboost_fixed_seedEnsemble2"
OUT_DIR = "/content/drive/MyDrive/open/preds"
os.makedirs(OUT_DIR, exist_ok=True)

# Label column; it is excluded from the model inputs below.
target_col = "clicked"

# ===== Load =====
# NOTE: pickle.load can execute arbitrary code embedded in the file - only load
# pickles that you produced yourself.
with open(TRAIN_FEATURES_PKL, "rb") as f:
    train_features = pickle.load(f)
with open(TEST_FEATURES_PKL, "rb") as f:
    test_features = pickle.load(f)

print(f"[Loaded] train: {train_features.shape} | test: {test_features.shape}")

# ===== Use only the training feature columns, in training-frame order =====
# The train frame is used purely as the schema: every train column except the
# target, in the order it appears there. Columns that exist only in the test
# frame (e.g. an ID column) are therefore left out.
train_cols = [c for c in train_features.columns if c != target_col]
# Keep only the columns that also exist in the test frame (guards against a rare
# schema mismatch). NOTE(review): a train column missing from test is dropped
# silently here - confirm the saved models tolerate that.
feature_cols = [c for c in train_cols if c in test_features.columns]

print(f"[Schema] use {len(feature_cols)} columns (same order as training).")
print("         head ->", feature_cols[:8])

# ===== Inference input (frame passed as-is: no dtype casting, no Pool) =====
X_cat_test = test_features[feature_cols]

# ===== Load models =====
# Every *.cbm file directly under CAT_DIR is one ensemble member; sorted() keeps
# the load order stable between runs.
cbm_names = [name for name in sorted(os.listdir(CAT_DIR)) if name.endswith(".cbm")]
cat_models = []
for cbm_name in cbm_names:
    model = CatBoostClassifier()
    model.load_model(os.path.join(CAT_DIR, cbm_name))
    cat_models.append(model)
if len(cat_models) == 0:
    raise RuntimeError(f"No CatBoost .cbm models under {CAT_DIR}")

print(f"[CatBoost] models: {len(cat_models)} | X_test: {X_cat_test.shape}")

# ===== Inference (plain average over all models) =====
preds = []
n_models = len(cat_models)
for idx, model in enumerate(cat_models, start=1):
    # The DataFrame is fed unchanged; column 1 of predict_proba is kept per model.
    preds.append(model.predict_proba(X_cat_test)[:, 1])
    # Progress line every 5th model and after the last one.
    if idx == n_models or idx % 5 == 0:
        print(f"  inference: {idx}/{n_models}")
pred_mean = np.mean(preds, axis=0)

# ===== Save (attach the ID column when the test frame has one) =====
pred_df = pd.DataFrame({"pred_cat": pred_mean})
if "ID" in test_features.columns:
    # Insert the raw values, not the Series: DataFrame.insert aligns a Series on
    # index labels, so a test frame whose index is not the default 0..n-1 range
    # would produce NaN or misplaced IDs. Predictions are in test-row order, so a
    # positional copy is what is wanted.
    pred_df.insert(0, "ID", test_features["ID"].to_numpy())

# One timestamp is shared by both files so the CSV/NPY pair can be matched up later.
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
csv_path = os.path.join(OUT_DIR, f"pred_cat_{ts}.csv")
npy_path = os.path.join(OUT_DIR, f"pred_cat_{ts}.npy")
pred_df.to_csv(csv_path, index=False)
np.save(npy_path, pred_mean)  # bare probability array, no ID column

print(f"[Saved] {csv_path}")

  train_features = pickle.load(f)
  test_features = pickle.load(f)


[Loaded] train: (612537, 180) | test: (1527298, 180)
[Schema] use 179 columns (same order as training).
         head -> ['gender', 'age_group', 'inventory_id', 'day_of_week', 'hour', 'l_feat_1', 'l_feat_2', 'l_feat_3']
[CatBoost] models: 25 | X_test: (1527298, 179)
  inference: 5/25
  inference: 10/25
  inference: 15/25
  inference: 20/25
  inference: 25/25
[Saved] /content/drive/MyDrive/open/preds/pred_cat_20250915_090901.csv


In [7]:
# -*- coding: utf-8 -*-
# =========================================================
# XGBoost 추론 전용 스크립트 (경고 제거·안전 버전)
#  - 학습 때 저장한 post-TE PKL 로드
#  - train/test 동일 규칙으로 클린(drop/inf 처리)
#  - feature_cols = train/test 교집합
#  - 저장된 .json 모델들 로드 → 평균 → 제출
# =========================================================

import os, pickle
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype
from datetime import datetime
from xgboost import XGBClassifier

# ========= Path settings (edit as needed) =========
FEATURE_SAVE_DIR = "/content/drive/Othercomputers/내 Mac/Python/데이콘/현재 진행중/토스/outputs_test/features_postTE"
TRAIN_FEATURES_PKL = os.path.join(FEATURE_SAVE_DIR, "train_features_2.pkl")
TEST_FEATURES_PKL  = os.path.join(FEATURE_SAVE_DIR, "test_features_2.pkl")

MODEL_DIR   = "/content/drive/MyDrive/open/models/xgb_fixed_seedEnsemble3"  # folder the training code saved into
SAMPLE_SUB  = "/content/drive/MyDrive/open/sample_submission.csv"
SAVE_DIR    = "/content/drive/MyDrive/open/preds"
os.makedirs(SAVE_DIR, exist_ok=True)

# Label column; excluded from the feature list further down.
target_col = "clicked"

# ========= Load =========
# Fail early, naming both paths, when either pickle is missing.
if not (os.path.exists(TRAIN_FEATURES_PKL) and os.path.exists(TEST_FEATURES_PKL)):
    raise FileNotFoundError(f"PKL이 없습니다.\n- {TRAIN_FEATURES_PKL}\n- {TEST_FEATURES_PKL}")

# NOTE: pickle.load can execute arbitrary code embedded in the file - only load
# pickles that you produced yourself.
with open(TRAIN_FEATURES_PKL, "rb") as f:
    train_features = pickle.load(f)
with open(TEST_FEATURES_PKL, "rb") as f:
    test_features = pickle.load(f)

print(f"[Loaded] train: {train_features.shape} | test: {test_features.shape}")

# ========= Cleaning rules (stated by the author to mirror training) =========
# Columns removed from both train and test.
drop_cols = [
    "hour_bucket", "hour_bucket_simple", "dow_hour",
    "day_of_week__inventory_id", "gender__age_group",
    "is_weekend__hour_bucket", "hour_bucket__inventory_id"
]
# Columns removed from the test frame only.
drop_test_only = ["ID"]
# Columns whose +/-inf values are replaced by NaN and then filled with 0.
log_hist_cols = [
    "history_a_2_log1p","history_a_4_log1p","history_a_5_log1p",
    "history_a_6_log1p","history_a_7_log1p"
]

def clean_features(df: pd.DataFrame, is_test=False) -> pd.DataFrame:
    """Return a cleaned copy of ``df``; the input frame is never modified.

    Steps:
      1. Remove every column named in the module-level ``drop_cols``; when
         ``is_test`` is true, also remove those in ``drop_test_only``.
      2. For each column in ``log_hist_cols`` that is still present, turn
         +/-inf into NaN and then fill every NaN with 0.

    Args:
        df: Feature frame to clean.
        is_test: Whether to apply the test-only column drops as well.

    Returns:
        A new DataFrame holding the surviving columns in their original order.
    """
    excluded = set(drop_cols)
    if is_test:
        excluded.update(drop_test_only)

    out = df.copy()
    out = out[[col for col in out.columns if col not in excluded]]

    for col in log_hist_cols:
        if col not in out.columns:
            continue
        without_inf = out[col].replace([np.inf, -np.inf], np.nan)
        out[col] = without_inf.fillna(0)
    return out

# Apply the shared cleaning to both frames; only the test frame additionally
# goes through the test-only drops.
train_clean = clean_features(train_features)
test_clean  = clean_features(test_features, is_test=True)

print(f"[Clean] train: {train_clean.shape} | test: {test_clean.shape}")

# ========= feature_cols = columns present in BOTH train and test =========
ban = {target_col, 'ID'}  # never used as model inputs
cols_train = set(train_clean.columns).difference(ban)
cols_test  = set(test_clean.columns).difference(ban)
# sorted() turns the set intersection into a deterministic, alphabetical list.
feature_cols = sorted(cols_train.intersection(cols_test))
print(f"[Features] using {len(feature_cols)} columns. head -> {feature_cols[:10]}")

# ========= XGB 입력 전처리 (훈련과 동일 로직, 경고 제거) =========
def _prep_xgb_features(df: pd.DataFrame, cols: list,
                       category_ref: "pd.DataFrame | None" = None) -> pd.DataFrame:
    """Build the numeric matrix handed to XGBoost from ``df[cols]``.

    * object / categorical columns are integer-encoded as ``int32`` category
      codes (missing or unknown values become -1);
    * every other column is coerced with ``pd.to_numeric`` (unparseable values
      become NaN), +/-inf is replaced by NaN, and values are clipped to
      [-1e12, 1e12].

    Args:
        df: Source frame; it is not modified.
        cols: Columns to select, in output order.
        category_ref: Optional frame (typically the training features) that
            defines the category -> code mapping. Without it the codes are
            derived from ``df`` alone, so the same string can map to different
            integers in different frames whenever their sets of distinct values
            differ. Pass the frame the model was fitted on to keep the encoding
            stable. Columns missing from ``category_ref`` fall back to the
            frame-local encoding.

    Returns:
        A new DataFrame with the same index as ``df`` and columns ``cols``.
    """
    X = df[cols].copy()
    for c in cols:
        dt = X[c].dtype
        if str(dt) == "object" or isinstance(dt, CategoricalDtype):
            if category_ref is not None and c in category_ref.columns:
                # Encode against the reference vocabulary; values not seen in
                # the reference get code -1, the same as missing values.
                ref_categories = category_ref[c].astype("category").cat.categories
                codes = pd.Categorical(X[c], categories=ref_categories).codes
                X[c] = pd.Series(codes, index=X.index).astype("int32")
            else:
                X[c] = X[c].astype("category").cat.codes.astype("int32")  # NaN -> -1
        else:
            numeric = pd.to_numeric(X[c], errors="coerce")
            # Plain reassignment (no chained assignment) keeps pandas warnings away.
            numeric = numeric.replace([np.inf, -np.inf], np.nan)
            X[c] = numeric.clip(-1e12, 1e12)
    return X

def _sentry(name: str, X: pd.DataFrame):
    """Print a one-line sanity report for ``X``.

    Reports whether any cell is +/-inf and how many columns consist solely of
    NaN. ``name`` is only used as the label in the printed line.
    """
    values = X.to_numpy()
    inf_found = np.isinf(values).any()
    n_all_nan_cols = int(X.isna().all().sum())
    print(f"[SENTRY] {name}: has_inf={inf_found} | all-NaN cols={n_all_nan_cols}")

# Build the model input from the cleaned test frame and report basic sanity stats.
X_test = _prep_xgb_features(test_clean, feature_cols)
_sentry("X_test", X_test)
print(f"[Prep] X_test: {X_test.shape}")

# ========= Load models =========
if not os.path.isdir(MODEL_DIR):
    raise FileNotFoundError(f"모델 폴더 없음: {MODEL_DIR}")

# Every *.json file directly under MODEL_DIR is one ensemble member; sorted()
# keeps the load order stable between runs.
json_names = [name for name in sorted(os.listdir(MODEL_DIR)) if name.endswith(".json")]
xgb_models = []
for json_name in json_names:
    model = XGBClassifier()
    model.load_model(os.path.join(MODEL_DIR, json_name))
    xgb_models.append(model)

if len(xgb_models) == 0:
    raise RuntimeError(f".json 모델을 찾지 못함: {MODEL_DIR}")
print(f"[Models] loaded {len(xgb_models)} XGB models")

# ========= Inference (average over all models) =========
preds = []
n_models = len(xgb_models)
for idx, model in enumerate(xgb_models, start=1):
    preds.append(model.predict_proba(X_test)[:, 1])
    # Progress line every 5th model and after the last one.
    if idx == n_models or idx % 5 == 0:
        print(f"  infer {idx}/{n_models}")

# Average, then keep the result strictly inside (0, 1).
pred_mean = np.clip(np.mean(preds, axis=0), 1e-7, 1 - 1e-7)

# ========= Save submission =========
submit = pd.read_csv(SAMPLE_SUB)

# pred_mean is in test_features row order. Writing it into the sample submission
# by position is only right when both files list the same IDs in the same order,
# so verify that (and realign by ID when the order differs) instead of assuming it.
if "ID" in submit.columns and "ID" in test_features.columns:
    sub_ids = submit["ID"].to_numpy()
    test_ids = test_features["ID"].to_numpy()
    if np.array_equal(sub_ids, test_ids):
        submit["clicked"] = pred_mean
    else:
        pred_by_id = pd.Series(pred_mean, index=test_ids)
        if pred_by_id.index.has_duplicates:
            raise RuntimeError("Cannot align predictions by ID: test_features['ID'] has duplicates.")
        n_missing = int((~submit["ID"].isin(pred_by_id.index)).sum())
        if n_missing:
            raise RuntimeError(f"{n_missing} sample-submission IDs have no prediction.")
        submit["clicked"] = pred_by_id.reindex(sub_ids).to_numpy()
else:
    # No ID on one side: positional assignment is the only option, so at least
    # insist on matching lengths.
    if len(submit) != len(pred_mean):
        raise RuntimeError(
            f"Length mismatch without ID: submission={len(submit)}, preds={len(pred_mean)}"
        )
    submit["clicked"] = pred_mean

ts = datetime.now().strftime("%Y%m%d_%H%M%S")
save_path = os.path.join(SAVE_DIR, f"pred_xgb_infer_{ts}.csv")
submit.to_csv(save_path, index=False)

print("\n===================================")
print(f"[Saved] submission -> {save_path}")
print("===================================")

  train_features = pickle.load(f)
  test_features = pickle.load(f)


[Loaded] train: (612537, 180) | test: (1527298, 180)
[Clean] train: (612537, 173) | test: (1527298, 172)
[Features] using 172 columns. head -> ['age_group', 'day_of_week', 'feat_a_1', 'feat_a_10', 'feat_a_11', 'feat_a_12', 'feat_a_13', 'feat_a_14', 'feat_a_15', 'feat_a_16']
[SENTRY] X_test: has_inf=False | all-NaN cols=0
[Prep] X_test: (1527298, 172)
[Models] loaded 25 XGB models
  infer 5/25
  infer 10/25
  infer 15/25
  infer 20/25
  infer 25/25

[Saved] submission -> /content/drive/MyDrive/open/submit_xgb_infer_20250915_092544.csv


In [13]:
# -*- coding: utf-8 -*-
# =========================================================
# cat+xgb 앙상블 스크립트
# - preds 폴더에서 최신 예측 파일 자동 탐색 (cat/xgb)
# - (가능하면) ID 기준 정렬 후 가중 평균
# - 최종 제출 저장
# =========================================================

import os, re, glob
import numpy as np
import pandas as pd
from datetime import datetime

# ===== Paths / settings =====
PREDS_DIR  = "/content/drive/MyDrive/open/preds"   # where cat_infer.py and xgb_infer.py saved their outputs
SAMPLE_SUB = "/content/drive/MyDrive/open/sample_submission.csv"
SAVE_DIR   = "/content/drive/MyDrive/open"
os.makedirs(SAVE_DIR, exist_ok=True)

# Blend weight (cat : xgb = alpha : (1 - alpha))
ALPHA = 0.5

# ===== 최신 파일 헬퍼 =====
def _latest(patterns):
    """Return the most recently modified file under PREDS_DIR matching any pattern.

    Args:
        patterns: Iterable of glob patterns, each resolved relative to PREDS_DIR.

    Returns:
        The path with the largest modification time, or None when nothing matches.
    """
    matches = [
        path
        for pattern in patterns
        for path in glob.glob(os.path.join(PREDS_DIR, pattern))
    ]
    if len(matches) == 0:
        return None
    # Stable sort + last element keeps the original tie-breaking on equal mtimes.
    return sorted(matches, key=os.path.getmtime)[-1]

# ===== 로더들 =====
def _load_pred_frame(path, out_col, candidates, label):
    """Shared loader behind load_cat / load_xgb.

    Args:
        path: Prediction file; a name ending in ``.npy`` is read with
            ``np.load``, anything else is read as CSV.
        out_col: Name given to the prediction column in the result.
        candidates: CSV column names to look for, in priority order.
        label: Model label used at the start of the error message.

    Returns:
        DataFrame with ``out_col`` and, for a CSV that has one, a leading
        ``ID`` column. ``.npy`` inputs never carry an ID.

    Raises:
        ValueError: The CSV contains none of the candidate columns.
    """
    if path.endswith(".npy"):
        return pd.DataFrame({out_col: np.load(path)})

    df = pd.read_csv(path)
    source_col = next((c for c in candidates if c in df.columns), None)
    if source_col is None:
        raise ValueError(f"{label} CSV에서 예측 컬럼을 못 찾음: {path}")

    out = pd.DataFrame({out_col: df[source_col].values})
    if "ID" in df.columns:
        out.insert(0, "ID", df["ID"].values)
    return out


def load_cat(path):
    """Load CatBoost predictions from .npy or .csv as a ``pred_cat`` column.

    CSV column priority: pred_cat > clicked > pred.
    """
    return _load_pred_frame(path, "pred_cat", ("pred_cat", "clicked", "pred"), "Cat")


def load_xgb(path):
    """Load XGBoost predictions from .npy or .csv as a ``pred_xgb`` column.

    CSV column priority: pred_xgb > clicked > pred. ``clicked`` covers the case
    where the XGB inference step saved a submission-format file directly.
    """
    return _load_pred_frame(path, "pred_xgb", ("pred_xgb", "clicked", "pred"), "XGB")

# ===== Locate the newest prediction files =====
# _latest picks by file modification time across all listed patterns, so an
# .npy (no ID column) and a .csv from the same run compete with each other.
cat_path = _latest(["pred_cat_*.npy", "pred_cat_*.csv"])
xgb_path = _latest(["pred_xgb_*.npy", "pred_xgb_*.csv", "submit_xgb_infer_*.csv"])

# Stop with a message naming the expected pattern when either side is missing.
if cat_path is None:
    raise FileNotFoundError(f"Cat 예측 파일이 없습니다. ({PREDS_DIR}/pred_cat_*.npy|csv)")
if xgb_path is None:
    raise FileNotFoundError(f"XGB 예측 파일이 없습니다. ({PREDS_DIR}/pred_xgb_*.npy|csv 또는 submit_xgb_infer_*.csv)")

print(f"[Ensemble] use cat: {os.path.basename(cat_path)}")
print(f"[Ensemble] use xgb: {os.path.basename(xgb_path)}")

# Each loader returns a frame with its prediction column (pred_cat / pred_xgb)
# plus an ID column when the source file was a CSV containing one.
cat_df = load_cat(cat_path)
xgb_df = load_xgb(xgb_path)

# ===== Align / merge (on ID when both sides carry one) =====
if "ID" in cat_df.columns and "ID" in xgb_df.columns:
    cat_df = cat_df.sort_values("ID").reset_index(drop=True)
    xgb_df = xgb_df.sort_values("ID").reset_index(drop=True)
    # validate= makes duplicate IDs fail here instead of silently multiplying rows.
    ens = pd.merge(cat_df, xgb_df, on="ID", how="inner", validate="one_to_one")
    if len(ens) == 0:
        raise RuntimeError("ID 기준 병합 결과가 비었습니다. ID가 일치하는지 확인하세요.")
else:
    # Without an ID on both sides, rows can only be paired by position.
    if len(cat_df) != len(xgb_df):
        raise RuntimeError(f"ID 없음 + 길이 불일치: cat={len(cat_df)}, xgb={len(xgb_df)}")
    ens = pd.concat([cat_df.reset_index(drop=True), xgb_df.reset_index(drop=True)], axis=1)

# ===== Weighted average =====
if "pred_cat" not in ens.columns or "pred_xgb" not in ens.columns:
    # Recover from merge suffixes ('_x' / '_y') when exactly one candidate exists per model.
    cand_cat = [c for c in ens.columns if "pred_cat" in c]
    cand_xgb = [c for c in ens.columns if "pred_xgb" in c]
    if len(cand_cat) == 1 and len(cand_xgb) == 1:
        ens = ens.rename(columns={cand_cat[0]: "pred_cat", cand_xgb[0]: "pred_xgb"})
    else:
        raise RuntimeError(f"예측 컬럼을 찾지 못함: {ens.columns.tolist()}")

final = ALPHA * ens["pred_cat"].values + (1 - ALPHA) * ens["pred_xgb"].values
final = np.clip(final, 1e-7, 1 - 1e-7)  # keep probabilities strictly inside (0, 1)

# ===== Save submission =====
sub = pd.read_csv(SAMPLE_SUB)
if "ID" in ens.columns and "ID" in sub.columns:
    # `final` follows the row order of `ens` (sorted by ID after a merge), which
    # need not be the sample submission's order, and an inner merge may have
    # dropped rows. Look every submission ID up explicitly instead of assigning
    # by position.
    final_by_id = pd.Series(final, index=ens["ID"].to_numpy())
    if final_by_id.index.has_duplicates:
        raise RuntimeError("Cannot align the ensemble by ID: prediction IDs contain duplicates.")
    n_missing = int((~sub["ID"].isin(final_by_id.index)).sum())
    if n_missing:
        raise RuntimeError(f"{n_missing} sample-submission IDs have no ensemble prediction.")
    sub["clicked"] = final_by_id.reindex(sub["ID"].to_numpy()).to_numpy()
else:
    # No ID available on one side: positional assignment, guarded by a length check.
    if len(final) != len(sub):
        raise RuntimeError(
            f"Length mismatch without ID: submission={len(sub)}, ensemble={len(final)}"
        )
    sub["clicked"] = final

ts = datetime.now().strftime("%Y%m%d_%H%M%S")
# round(), not int(): int(0.57 * 100) truncates to 56 because of float representation.
cat_pct = round(ALPHA * 100)
xgb_pct = round((1 - ALPHA) * 100)
save_path = os.path.join(SAVE_DIR, f"submit_ensemble_cat{cat_pct}_xgb{xgb_pct}_{ts}.csv")
sub.to_csv(save_path, index=False)

print("\n====================================")
print(f"[Saved] Ensemble submission -> {save_path}")
print("====================================")

[Ensemble] use cat: pred_cat_20250915_090901.csv
[Ensemble] use xgb: pred_xgb_infer_20250915_092544.csv

[Saved] Ensemble submission -> /content/drive/MyDrive/open/submit_ensemble_cat50_xgb50_20250915_092821.csv
