In [1]:
# Environment setup
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import warnings
warnings.filterwarnings('ignore')

print("✅ Environment configured")

✅ Environment configured


In [2]:
import time
import joblib
import numpy as np
import pandas as pd

# GPU libraries
import cudf
import cupy as cp

import nvtabular as nvt
from merlin.io import Dataset  # merlin-core

import xgboost as xgb

print("✅ All libraries imported successfully")

✅ All libraries imported successfully


In [3]:
# Configuration(DATA PATH)

# MODEL_PATH = "./xgb_model.pkl"
WORKFLOW_PATH = "/workspace/output/workflow"
INFER_DATA_PATH = "test.parquet"  # 추론할 parquet 데이터
USE_GPU = True

print(f"📋 Current PATH: {os.getcwd()}")

📋 Current PATH: /workspace


In [4]:
df = pd.read_parquet("test.parquet")

In [5]:
df.head()

Unnamed: 0,ID,gender,age_group,inventory_id,day_of_week,hour,seq,l_feat_1,l_feat_2,l_feat_3,...,history_b_21,history_b_22,history_b_23,history_b_24,history_b_25,history_b_26,history_b_27,history_b_28,history_b_29,history_b_30
0,TEST_0000000,2.0,6.0,46,7,13,"321,269,57,516,479,516,57,479,35,57,516,403,45...",2.0,2.0,2.0,...,0.008702,0.071199,0.071199,0.011866,0.004747,0.044302,0.05063,0.015822,0.041137,0.104432
1,TEST_0000001,2.0,8.0,29,7,21,"57,35,479,57,463,212,193,151,463,193,74,77,207...",2.0,2.0,2.0,...,0.024553,0.200889,0.200889,0.033482,0.013393,0.124998,0.142854,0.044642,0.116069,0.073659
2,TEST_0000002,1.0,6.0,37,7,19,"57,516,97,74,527,77,318,315,317,311,269,479,57...",2.0,2.0,3.0,...,0.021739,0.177867,0.177867,0.029645,0.011858,0.110673,0.126483,0.039526,0.102768,0.065218
3,TEST_0000003,2.0,7.0,41,7,9,"144,321,57,479,57,479,35,57,516,165,74,527,318...",2.0,2.0,2.0,...,0.006614,0.108234,0.054117,0.009019,0.003608,0.033673,0.038483,0.012026,0.031268,0.039686
4,TEST_0000004,1.0,8.0,2,7,18,"269,57,516,342,516,403,173,457,343,403,457,173...",2.0,2.0,3.0,...,0.006532,0.053442,0.053442,0.008907,0.003563,0.033253,0.038003,0.011876,0.030878,0.039191


In [6]:
del df

In [7]:
# Workflow load
workflow = nvt.Workflow.load(WORKFLOW_PATH)

# 워크플로우에서 'clicked' 제거
workflow.remove_inputs(["clicked"])

<nvtabular.workflow.workflow.Workflow at 0x7f6fbd41b850>

In [8]:
# Inference data load

dataset = Dataset(INFER_DATA_PATH, engine='parquet', part_size="8MB", cpu=True)
print("✅ Dataset created")

✅ Dataset created


In [9]:
# 전처리 (학습 시 사용한 workflow 적용)
print("🚀 Applying workflow transform...")
transformed = workflow.transform(dataset).to_ddf()  # 전체 데이터 사용
# transformed = workflow.transform(dataset).to_ddf().sample(frac=0.01, random_state=42)  # frac으로 샘플링 비율 조절

print("✅ Transform created")

🚀 Applying workflow transform...
✅ Transform created


In [10]:
# 피처 선택
feature_cols = [col for col in transformed.columns if col != "clicked"]
X_infer = transformed[feature_cols]  # cpu load로 DataFrame

In [11]:
# 모델 로드 (h:4, l:3 fold)
# model = joblib.load("xgb_model_fold3.pkl")

# 앙상블 평균
models = [joblib.load(f"xgb_model_fold{i}.pkl") for i in range(1, 6)]
print(models)

[<xgboost.core.Booster object at 0x7f6c846994e0>, <xgboost.core.Booster object at 0x7f6c848e8f40>, <xgboost.core.Booster object at 0x7f6c848e8ee0>, <xgboost.core.Booster object at 0x7f6c848e9000>, <xgboost.core.Booster object at 0x7f6c848e8580>]


In [12]:
# XGBoost DMatrix 변환 & 예측
dtest = xgb.DMatrix(X_infer)

start = time.time()
# y_pred = model.predict(dtest)
y_pred = np.mean([model.predict(xgb.DMatrix(X_infer)) for model in models], axis=0)  # 앙상블 평균

print("✅ Inference complete!")
print(f"⏱️ Time: {time.time() - start:.1f}s")  # 142.1s in RTX A6000

✅ Inference complete!
⏱️ Time: 144.6s


In [13]:
y_pred

array([0.30370086, 0.31616282, 0.4163166 , ..., 0.14769891, 0.22845201,
       0.29119357], dtype=float32)

In [14]:
n_samples = len(y_pred)
n_samples

1527298

In [15]:
# ID 생성: TEST_0000001, TEST_0000002, ...
ids = [f"TEST_{i:07d}" for i in range(n_samples)]

In [16]:
# 제출용 데이터프레임 생성
submission = pd.DataFrame({
    "ID": ids,
    "clicked": y_pred
})

In [17]:
submission

Unnamed: 0,ID,clicked
0,TEST_0000000,0.303701
1,TEST_0000001,0.316163
2,TEST_0000002,0.416317
3,TEST_0000003,0.477071
4,TEST_0000004,0.307208
...,...,...
1527293,TEST_1527293,0.363867
1527294,TEST_1527294,0.451266
1527295,TEST_1527295,0.147699
1527296,TEST_1527296,0.228452


In [18]:
# Save

# submission.to_csv("submission.csv", index=False)
print("✅ submission.csv saved")

✅ submission.csv saved
