In [18]:
pip install -U pyarrow==15.0.2 fastparquet pandas imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [23]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os, sys, re, json, subprocess, urllib.parse
import pandas as pd
import numpy as np

# ========= 경로 =========
INPUT_PARQUET = "/home/jovyan/processed_user_behavior.sorted.parquet"
OUTPUT_DIR    = "/home/jovyan/next_state_prediction"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ========= 원격 config 확보(없으면 기본) =========
CONFIG_URL  = "https://raw.githubusercontent.com/kakaocloud-edu/tutorial/main/DataAnalyzeCourse/src/day1/Lab01/traffic_generator/config.py"
CONFIG_DIR  = "/home/jovyan/next_state_prediction"
CONFIG_FILE = os.path.join(CONFIG_DIR, "config.py")

os.makedirs(CONFIG_DIR, exist_ok=True)  # 폴더 없으면 생성

def _download_config():
    try:
        subprocess.run(["wget", "-q", "-O", CONFIG_FILE, CONFIG_URL], check=True)
        return True
    except Exception:
        try:
            subprocess.run(["curl", "-sSL", "-o", CONFIG_FILE, CONFIG_URL], check=True)
            return True
        except Exception:
            return False

if not os.path.exists(CONFIG_FILE):
    _ = _download_config()

sys.path.append(os.getcwd())
try:
    import config as CFG
    _HAS_CFG = True
except Exception:
    CFG = None
    _HAS_CFG = False

# ========= Parquet 엔진 =========
parquet_engine = None
try:
    import pyarrow; parquet_engine = "pyarrow"
except Exception:
    try:
        import fastparquet; parquet_engine = "fastparquet"
    except Exception:
        parquet_engine = None

# ========= 설정값 =========
AGE_YOUNG  = getattr(CFG, "AGE_THRESHOLD_YOUNG", 25) if _HAS_CFG else 25
AGE_MIDDLE = getattr(CFG, "AGE_THRESHOLD_MIDDLE", 50) if _HAS_CFG else 50
CATEGORY_PREFERENCE = getattr(CFG, "CATEGORY_PREFERENCE", {
    "F": {"young": ["Fashion", "Books"], "middle": ["Fashion", "Home", "Books"], "old": ["Home", "Books"]},
    "M": {"young": ["Electronics", "Gaming"], "middle": ["Electronics", "Home", "Gaming"], "old": ["Books"]},
})
BIAS_THRESHOLDS = getattr(CFG, "BIAS_THRESHOLDS", {
    "cart_has_items_min": 1, "heavy_cart_min": 3, "deep_page_min": 6,
    "long_session_sec": 60, "idle_slow_sec": 5,
})
CATEGORY_KEYWORDS = getattr(CFG, "CATEGORY_KEYWORDS", {
    "Electronics": ["mouse", "earbuds", "speaker", "laptop", "phone"],
    "Fashion": ["sneakers", "dress", "bag"],
    "Home": ["mug", "fryer", "coffee"],
    "Gaming": ["keyboard", "console", "headset"],
    "Books": ["book", "novel"],
})
CANONICAL_KEYWORD_MAP = getattr(CFG, "CANONICAL_KEYWORD_MAP", {
    "blu tooth": "bluetooth",
    "blu%20tooth": "bluetooth",
    "cofee": "coffee",
    "iphon": "iphone",
    "labtop": "laptop",
    "rayban": "sunglasses",
})

CATEGORY_KEYWORDS_LC = {k.strip().lower(): set(w.strip().lower() for w in v) for k, v in CATEGORY_KEYWORDS.items()}
GLOBAL_ALLOWED = set().union(*CATEGORY_KEYWORDS_LC.values()) if CATEGORY_KEYWORDS_LC else set()

# ========= 유틸 =========
def norm_keyword(x: str) -> str:
    if not isinstance(x, str): return ""
    x = urllib.parse.unquote(x).strip().lower()
    x = " ".join(x.split())
    return CANONICAL_KEYWORD_MAP.get(x, x)

def norm_category(x: str) -> str:
    if not isinstance(x, str): return ""
    return x.strip()

def norm_gender(x: str) -> str:
    if pd.isna(x) or not isinstance(x, str) or x.strip()=="":
        return "Unknown"
    xl = x.strip().lower()
    if xl in {"m","male","man"}: return "M"
    if xl in {"f","female","woman"}: return "F"
    return "Unknown"

def age_to_bucket(age: float) -> str:
    if pd.isna(age): return "unknown"
    try: a = float(age)
    except Exception: return "unknown"
    if a <= AGE_YOUNG: return "young"
    if a <= AGE_MIDDLE: return "middle"
    return "old"

def filter_keyword_by_category(kw: str, cat: str) -> str:
    if not kw: return ""
    cat_key = (cat or "").strip().lower()
    allowed = CATEGORY_KEYWORDS_LC.get(cat_key, GLOBAL_ALLOWED)
    return kw if kw in allowed else ""

def is_preferred_cat(g: str, age_b: str, cat: str) -> int:
    g = g if g in ("M","F") else None
    if not g or age_b not in ("young","middle","old"): return 0
    return int(cat in CATEGORY_PREFERENCE.get(g, {}).get(age_b, []))

# ========= 로드/정렬 =========
read_engine = "pyarrow" if parquet_engine=="pyarrow" else None
df = pd.read_parquet(INPUT_PARQUET, engine=read_engine)

required = ["session_id","timestamp","current_state","next_state",
            "search_count","cart_item_count","page_depth","product_id","search_keyword"]
missing = [c for c in required if c not in df.columns]
if missing:
    raise ValueError(f"Missing required columns: {missing}")

df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
df = df.sort_values(["session_id","timestamp"], kind="mergesort").reset_index(drop=True)

for c in ["search_count","cart_item_count","page_depth"]:
    df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0).astype("int32")

cat_col = "category_keyword" if "category_keyword" in df.columns else ("category_name" if "category_name" in df.columns else None)

# ========= 성별/연령 파생 =========
gender_std = (df["gender"].map(norm_gender).fillna("Unknown") if "gender" in df.columns
              else pd.Series(["Unknown"]*len(df), index=df.index))
age_bucket = (df["age"].map(age_to_bucket) if "age" in df.columns
              else pd.Series(["unknown"]*len(df), index=df.index))

# ========= 키워드/카테고리 정규화 + 허용 필터 =========
df["search_keyword_norm"] = df["search_keyword"].fillna("").map(norm_keyword)
if cat_col:
    cat_norm = df[cat_col].fillna("").map(norm_category)
    df["category_norm"] = cat_norm.replace("", np.nan)
else:
    df["category_norm"] = np.nan

df["search_keyword_norm"] = [
    filter_keyword_by_category(kw, (cat if isinstance(cat,str) else ""))
    for kw, cat in zip(df["search_keyword_norm"], df["category_norm"].fillna(""))
]
_kw_series = df["search_keyword_norm"].replace("", np.nan)

# ========= 플래그 =========
bt = BIAS_THRESHOLDS
flag_is_pref_category = pd.Series(
    [is_preferred_cat(g,a,(c if isinstance(c,str) else "")) for g,a,c in
     zip(gender_std.tolist(), age_bucket.tolist(), df["category_norm"].fillna("").tolist())],
    name="flag_is_pref_category", dtype="int8"
)
flag_has_cart_items = (df["cart_item_count"]>=bt.get("cart_has_items_min",1)).astype("int8").rename("flag_has_cart_items")
flag_heavy_cart     = (df["cart_item_count"]>=bt.get("heavy_cart_min",3)).astype("int8").rename("flag_heavy_cart")
flag_deep_page      = (df["page_depth"]     >=bt.get("deep_page_min",6)).astype("int8").rename("flag_deep_page")

last_elapsed = pd.to_numeric(df.get("last_action_elapsed",0), errors="coerce").fillna(0.0)
session_dur  = pd.to_numeric(df.get("session_duration",0),  errors="coerce").fillna(0.0)
flag_long_session = (session_dur>=bt.get("long_session_sec",60)).astype("int8").rename("flag_long_session")
flag_idle_slow    = (last_elapsed>=bt.get("idle_slow_sec",5)).astype("int8").rename("flag_idle_slow")

goal_tokens = ["/cart/add","/cart/view","/checkout"]
pattern = "|".join(map(re.escape, goal_tokens))
has_goal_event_now = (
    df["current_state"].astype(str).str.contains(pattern) |
    (df["cart_item_count"] >= bt.get("cart_has_items_min",1))
).astype("int8")
shifted = has_goal_event_now.groupby(df["session_id"]).shift(1).fillna(0).astype("int8")
flag_with_goal = shifted.groupby(df["session_id"]).cummax().astype("int8").rename("flag_with_goal")
flag_no_goal   = (1 - flag_with_goal).astype("int8").rename("flag_no_goal")

# ========= 추가: prev1/prev2 + 누적치 =========
prev1_state = df.groupby("session_id")["current_state"].shift(1).fillna("")
prev2_state = df.groupby("session_id")["current_state"].shift(2).fillna("")
prev1_state_ohe = pd.get_dummies(prev1_state, prefix="prev1_state", dtype="int8")
prev2_state_ohe = pd.get_dummies(prev2_state, prefix="prev2_state", dtype="int8")

# 순서 결합 피처
prevpair = (prev1_state.astype(str) + "→" + prev2_state.astype(str))
prevpair_ohe = pd.get_dummies(prevpair, prefix="prevpair_state", dtype="int8")

# 누적(shift 후 cumsum)
prev_search = df.groupby("session_id")["search_count"].shift(1).fillna(0).astype("int32")
cum_search  = prev_search.groupby(df["session_id"]).cumsum().astype("int32").rename("cum_search")
prev_page   = df.groupby("session_id")["page_depth"].shift(1).fillna(0).astype("int32")
cum_page    = prev_page.groupby(df["session_id"]).cumsum().astype("int32").rename("cum_page")
has_cart_now     = (df["cart_item_count"]>=1).astype("int8")
prev_has_cart    = has_cart_now.groupby(df["session_id"]).shift(1).fillna(0).astype("int8")
ever_cart_flag   = prev_has_cart.groupby(df["session_id"]).cummax().astype("int8").rename("ever_cart_flag")

# ========= OHE =========
state_ohe  = pd.get_dummies(df["current_state"].fillna(""), prefix="state", dtype="int8")
search_ohe = pd.get_dummies(_kw_series, prefix="keyword", dtype="int8")
cat_ohe    = pd.get_dummies(df["category_norm"], prefix="category", dtype="int8") if ("category_norm" in df) else pd.DataFrame(index=df.index)
gender_ohe = pd.get_dummies(gender_std, prefix="gender", dtype="int8")
age_ohe    = pd.get_dummies(age_bucket, prefix="age", dtype="int8")
has_product = df["product_id"].notna().astype("int8").rename("has_product_id")
counters    = df[["search_count","cart_item_count","page_depth"]]

# ========= 결합 =========
X_parts = [
    state_ohe, has_product, counters, search_ohe, cat_ohe, gender_ohe, age_ohe,
    flag_is_pref_category, flag_has_cart_items, flag_heavy_cart, flag_deep_page,
    flag_long_session, flag_idle_slow, flag_with_goal, flag_no_goal,
    prev1_state_ohe, prev2_state_ohe, prevpair_ohe,
    cum_search, cum_page, ever_cart_flag,
]
X = pd.concat(X_parts, axis=1)

# 중복 제거/형 변환
if not X.columns.is_unique:
    X = X.loc[:, ~X.columns.duplicated(keep="first")]
bool_cols = X.select_dtypes(include=["bool"]).columns
if len(bool_cols)>0:
    X[bool_cols] = X[bool_cols].astype("int8")

y = pd.DataFrame({"next_state": df["next_state"].astype(str).fillna("")})

# ========= 전이가능 맵 저장 =========
trans_map = (df[["current_state","next_state"]]
             .dropna()
             .astype(str)
             .groupby("current_state")["next_state"]
             .agg(lambda s: sorted(set(s.tolist())))
             .to_dict())
with open(os.path.join(OUTPUT_DIR, "transition_map.json"), "w", encoding="utf-8") as f:
    json.dump(trans_map, f, ensure_ascii=False, indent=2)

# ========= 저장 =========
features_csv = os.path.join(OUTPUT_DIR, "next_state_features.csv")
labels_csv   = os.path.join(OUTPUT_DIR, "next_state.csv")
X.to_csv(features_csv, index=False)
y.to_csv(labels_csv,   index=False)

features_parquet = os.path.join(OUTPUT_DIR, "next_state_features.parquet")
labels_parquet   = os.path.join(OUTPUT_DIR, "next_state.parquet")

def _try_save_parquet(df_obj, path, engine):
    df_obj.to_parquet(path, index=False, engine=engine)

features_parquet_saved = False; labels_parquet_saved = False
err_feat = err_label = None
if parquet_engine:
    try:
        _try_save_parquet(X, features_parquet, parquet_engine); features_parquet_saved = True
    except Exception as e:
        err_feat = f"[{parquet_engine}] {type(e).__name__}: {e}"
    try:
        _try_save_parquet(y, labels_parquet, parquet_engine); labels_parquet_saved = True
    except Exception as e:
        err_label = f"[{parquet_engine}] {type(e).__name__}: {e}"

if not features_parquet_saved:
    alt = "fastparquet" if parquet_engine=="pyarrow" else "pyarrow"
    try:
        __import__(alt)
        _try_save_parquet(X, features_parquet, alt); features_parquet_saved = True
        print(f"[INFO] features를 {alt}로 폴백 저장 성공")
    except Exception as e:
        pass

print("[Done] Preprocessing finished.")
print("Saved to:", OUTPUT_DIR)


[Done] Preprocessing finished.
Saved to: /home/jovyan/next_state_prediction


In [24]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os, json
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, top_k_accuracy_score
from lightgbm import LGBMClassifier
# 콜백(구버전 호환용): 없으면 AttributeError가 날 수 있어 try/except 처리
try:
    from lightgbm import early_stopping, log_evaluation
    HAS_LGBM_CALLBACKS = True
except Exception:
    HAS_LGBM_CALLBACKS = False
import joblib

# ===== 경로/설정 =====
BASE_DIR = "/home/jovyan/datasets/next_state_prediction"
FEAT = os.path.join(BASE_DIR, "next_state_features.parquet")
LAB  = os.path.join(BASE_DIR, "next_state.parquet")
TRANSITION = os.path.join(BASE_DIR, "transition_map.json")

USE_OVERSAMPLING = True          # imblearn 없이 자체 오버샘플링
CLASS_WEIGHT_MODE = None         # oversampling 켜면 None 권장
RARE_MIN_COUNT = 10              # 희소 OHE 컬럼 드롭 임계치
RANDOM_STATE = 42
TEST_SIZE = 0.2
MASK_TOPK = 3
MASK_THRESH = 0.15

rng = np.random.default_rng(RANDOM_STATE)

# ===== 유틸: 희소 OHE 열 드롭 =====
def drop_rare_ohe(X, min_count=10):
    bin_prefixes = ("prevpair_state_", "prev1_state_", "prev2_state_", "keyword_", "category_")
    bin_cols = [c for c in X.columns if c.startswith(bin_prefixes)]
    keep, drop = [], []
    for c in bin_cols:
        cnt = int(X[c].sum())
        (keep if cnt >= min_count else drop).append(c)
    if drop:
        print(f"[INFO] Drop rare OHE columns: n={len(drop)} (min_count={min_count})")
        X = X.drop(columns=drop)
    return X, drop

# ===== 유틸: 내장 오버샘플링 =====
def simple_random_oversample(X: pd.DataFrame, y: np.ndarray, random_state=42, max_target=None):
    """
    각 클래스 샘플 수를 max_count(또는 max_target)까지 무작위 복제해 균형화.
    """
    rs = np.random.default_rng(random_state)
    y = np.asarray(y)
    classes, counts = np.unique(y, return_counts=True)
    max_count = int(counts.max()) if max_target is None else int(max_target)

    parts_X, parts_y = [], []
    for cls, cnt in zip(classes, counts):
        idx = np.where(y == cls)[0]
        if cnt < max_count:
            add_n = max_count - cnt
            add_idx = rs.choice(idx, size=add_n, replace=True)
            take_idx = np.concatenate([idx, add_idx])
        else:
            take_idx = idx
        parts_X.append(X.iloc[take_idx])
        parts_y.append(y[take_idx])

    X_os = pd.concat(parts_X, axis=0).reset_index(drop=True)
    y_os = np.concatenate(parts_y, axis=0)
    # 셔플
    perm = rs.permutation(len(y_os))
    return X_os.iloc[perm].reset_index(drop=True), y_os[perm]

# ===== 데이터 로드 =====
X = pd.read_parquet(FEAT)
y_raw = pd.read_parquet(LAB)["next_state"].astype(str)

# 희소 OHE 드롭
X, dropped_cols = drop_rare_ohe(X, RARE_MIN_COUNT)

# 라벨 인코딩
le = LabelEncoder()
y = le.fit_transform(y_raw)
classes = le.classes_
num_class = len(classes)

# 오버샘플링 or class_weight
if USE_OVERSAMPLING:
    X_res, y_res = simple_random_oversample(X, y, random_state=RANDOM_STATE, max_target=None)
    class_weight = None
else:
    X_res, y_res = X, y
    if CLASS_WEIGHT_MODE == "balanced":
        class_weight = "balanced"
    elif CLASS_WEIGHT_MODE == "custom":
        vals, cnts = np.unique(y_res, return_counts=True)
        inv = {int(v): float(1.0/c) for v, c in zip(vals, cnts)}
        s = sum(inv.values()); inv = {k: v*s/len(inv) for k, v in inv.items()}
        class_weight = inv
    else:
        class_weight = None

# split
X_tr, X_te, y_tr, y_te = train_test_split(
    X_res, y_res, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y_res
)

# ===== 모델 정의 =====
clf = LGBMClassifier(
    objective="multiclass",
    learning_rate=0.05,
    n_estimators=4000,        # 콜백으로 조기종료
    num_leaves=31,
    max_depth=10,
    min_data_in_leaf=50,
    min_gain_to_split=1e-3,
    subsample=0.8,            # bagging_fraction
    colsample_bytree=0.8,     # feature_fraction
    reg_alpha=0.1,
    reg_lambda=1.0,
    class_weight=class_weight,
    n_jobs=-1,
)

# 일부 버전은 fit(verbose=..) 미지원 → 파라미터로 끄고 콜백 로그 사용
try:
    clf.set_params(verbose=-1)
except Exception:
    pass

# ===== 학습 (콜백 기반 ES/로그) =====
fit_kwargs = dict(
    X=X_tr, y=y_tr,
    eval_set=[(X_te, y_te)],
    eval_metric="multi_logloss",
)
if HAS_LGBM_CALLBACKS:
    fit_kwargs["callbacks"] = [
        early_stopping(stopping_rounds=200, verbose=True),
        log_evaluation(period=100),
    ]

clf.fit(**fit_kwargs)

# best_iteration 안전 획득
best_iter = getattr(clf, "best_iteration_", None)
print("best_iter:", best_iter)

# ===== 평가 (num_iteration 호환) =====
def predict_with_iter(model, X, *, raw_score=False):
    kw = {}
    if best_iter is not None:
        kw["num_iteration"] = best_iter
    return model.predict(X, raw_score=raw_score, **kw)

def predict_proba_with_iter(model, X):
    kw = {}
    if best_iter is not None:
        kw["num_iteration"] = best_iter
    return model.predict_proba(X, **kw)

proba = predict_proba_with_iter(clf, X_te)
y_pred = predict_with_iter(clf, X_te)

top1 = top_k_accuracy_score(y_te, proba, k=1, labels=list(range(num_class)))
top3 = top_k_accuracy_score(y_te, proba, k=min(3, num_class), labels=list(range(num_class)))
macro_f1 = f1_score(y_te, y_pred, average="macro")
print({"Top-1": round(top1,4), "Top-3": round(top3,4), "MacroF1": round(macro_f1,4)})

# ===== 전이 마스킹 평가 =====
with open(TRANSITION, "r", encoding="utf-8") as f:
    trans_map = json.load(f)

# state_*은 “드롭 이후” 사용 컬럼에서 추출
state_cols = [c for c in X_tr.columns if c.startswith("state_")]

def states_from_ohe(Xpart):
    idx = Xpart[state_cols].values.argmax(axis=1)
    return [state_cols[i].replace("state_","") for i in idx]

curr_states_te = states_from_ohe(X_te)

def masked_topk_metrics(model, X_eval, y_eval, curr_states, trans_map, classes, K=3, thr=0.15):
    logits = predict_with_iter(model, X_eval, raw_score=True)
    logits = np.asarray(logits, dtype=np.float64)

    # 불가능 전이 -inf 마스킹
    for i, cur in enumerate(curr_states):
        allowed = set(trans_map.get(cur, []))
        if allowed:
            disallowed = [j for j, lab in enumerate(classes) if lab not in allowed]
            logits[i, disallowed] = -np.inf

    # softmax
    logits -= np.max(logits, axis=1, keepdims=True)
    proba = np.exp(logits); proba /= proba.sum(axis=1, keepdims=True)

    topk_idx = np.argsort(-proba, axis=1)[:, :min(K, proba.shape[1])]
    y_pred_top1 = topk_idx[:, 0]
    acc1 = (y_pred_top1 == y_eval).mean()
    in_topk = (topk_idx == y_eval.reshape(-1,1)).any(axis=1).mean()

    picked_ok = 0; picked_total = 0
    for i in range(proba.shape[0]):
        picks = [(j, proba[i, j]) for j in topk_idx[i] if proba[i, j] >= thr]
        picked_total += len(picks)
        picked_ok += any(j == y_eval[i] for j,_ in picks)

    return {
        "Top-1(masked)": round(float(acc1),4),
        f"Top-{K}(masked)": round(float(in_topk),4),
        "picked@thr_contains_true": round(float(picked_ok/len(y_eval)),4),
        "avg_picks_per_row": round(float(picked_total/len(y_eval)),3),
    }

masked_scores = masked_topk_metrics(
    clf, X_te, y_te, curr_states_te, trans_map, classes=classes, K=MASK_TOPK, thr=MASK_THRESH
)
print(masked_scores)

# ===== 아티팩트 저장 =====
ART_DIR = os.path.join(BASE_DIR, "next_state_prediction_model")
os.makedirs(ART_DIR, exist_ok=True)

# 모델/라벨인코더 저장 (확장자 .joblib)
joblib.dump(clf, os.path.join(ART_DIR, "lgbm_next_state.joblib"))
joblib.dump(le,  os.path.join(ART_DIR, "label_encoder.joblib"))

# 학습 스키마(열 순서) 저장 -> 추론 정렬용
with open(os.path.join(ART_DIR, "feature_columns.json"), "w", encoding="utf-8") as f:
    json.dump(list(X_tr.columns), f, ensure_ascii=False, indent=2)

with open(os.path.join(ART_DIR, "dropped_rare_columns.json"), "w", encoding="utf-8") as f:
    json.dump(dropped_cols, f, ensure_ascii=False, indent=2)

print("Saved model artifacts to:", ART_DIR)


[INFO] Drop rare OHE columns: n=52 (min_count=10)
Training until validation scores don't improve for 200 rounds
[100]	valid_0's multi_logloss: 0.94595
[200]	valid_0's multi_logloss: 0.785777
[300]	valid_0's multi_logloss: 0.698777
[400]	valid_0's multi_logloss: 0.649222
[500]	valid_0's multi_logloss: 0.617959
[600]	valid_0's multi_logloss: 0.59887
[700]	valid_0's multi_logloss: 0.585899
[800]	valid_0's multi_logloss: 0.576227
[900]	valid_0's multi_logloss: 0.569838
[1000]	valid_0's multi_logloss: 0.565749
[1100]	valid_0's multi_logloss: 0.562047
[1200]	valid_0's multi_logloss: 0.558827
[1300]	valid_0's multi_logloss: 0.556594
[1400]	valid_0's multi_logloss: 0.554719
[1500]	valid_0's multi_logloss: 0.553609
[1600]	valid_0's multi_logloss: 0.553223
[1700]	valid_0's multi_logloss: 0.553055
[1800]	valid_0's multi_logloss: 0.552367
[1900]	valid_0's multi_logloss: 0.551988
[2000]	valid_0's multi_logloss: 0.551769
[2100]	valid_0's multi_logloss: 0.551657
[2200]	valid_0's multi_logloss: 0.5515