# Mitsui LightGBM Inference (No Recency Weight)

In [None]:
!pip install -q lightgbm numpy pandas polars pyarrow scikit-learn gplearn TA-Lib joblib

In [None]:
from pathlib import Path
import sys
import zipfile
import shutil
from datetime import date

# Kaggle input mount expected to hold the training bundle. The path may be a
# directory or an archive; _resolve_bundle_root() below handles both cases.
BUNDLE_ROOT = Path("/kaggle/input/mitsui-lightgbm-training-lag1lag4")
if not BUNDLE_ROOT.exists():
    # Fail fast: every later step reads code or artifacts from this bundle.
    raise FileNotFoundError("Dataset mitsui-lightgbm-training-lag1lag4 not found")

def _resolve_bundle_root(base: Path) -> Path:
    """Return the directory that directly contains the bundle's ``src`` folder.

    Lookup order:

    1. If ``base`` is a file it is treated as a zip archive and unpacked into
       ``/kaggle/working/mitsui_lightgbm_bundle``; the search continues there.
    2. If ``base/src`` exists, ``base`` is returned.
    3. If ``base`` holds ``*.zip`` files, the first one (sorted by path) is
       unpacked and the unpack directory, then its immediate sub-directories,
       are checked for ``src``.
    4. Otherwise the immediate sub-directories of ``base`` are checked.

    Raises:
        FileNotFoundError: when no ``src`` directory can be located.
    """
    unpack_dir = Path("/kaggle/working/mitsui_lightgbm_bundle")

    def _unpack(archive: Path) -> Path:
        # Extract ``archive`` into the shared working directory.
        unpack_dir.mkdir(parents=True, exist_ok=True)
        with zipfile.ZipFile(archive, "r") as bundle:
            bundle.extractall(unpack_dir)
        return unpack_dir

    def _first_child_with_src(parent: Path):
        # First immediate sub-directory of ``parent`` holding ``src``, or None.
        children = [entry for entry in parent.iterdir() if entry.is_dir()]
        return next((child for child in children if (child / "src").exists()), None)

    if base.is_file():
        base = _unpack(base)

    if (base / "src").exists():
        return base

    archives = sorted(base.glob("*.zip"))
    if archives:
        unpacked = _unpack(archives[0])
        if (unpacked / "src").exists():
            return unpacked
        nested = _first_child_with_src(unpacked)
        if nested is None:
            raise FileNotFoundError(f"Extracted bundle missing src directory: {unpack_dir}")
        return nested

    nested = _first_child_with_src(base)
    if nested is not None:
        return nested

    raise FileNotFoundError(f"Could not locate src directory under {base}")

def _inject_sys_path(root: Path) -> Path:
    """Put the bundle root and its ``src`` directory at the front of ``sys.path``.

    When ``root`` is itself named ``src`` (and has no nested ``src``), it is
    taken as the source directory and its parent as the bundle root.

    Returns:
        The ``src`` directory when it exists, otherwise the bundle root.
    """
    src_root = root / "src"
    if root.name == "src" and not src_root.exists():
        src_root, root = root, root.parent

    # ``src`` is inserted first and the root second, both at position 0, so
    # newly added entries end up ordered as [root, src, ...].
    for entry in (src_root, root):
        if not entry.exists():
            continue
        entry_str = str(entry)
        if entry_str not in sys.path:
            sys.path.insert(0, entry_str)

    return src_root if src_root.exists() else root

# Normalise BUNDLE_ROOT to the directory that contains ``src`` and make the
# bundled package importable.
BUNDLE_ROOT = _resolve_bundle_root(BUNDLE_ROOT)
SRC_ROOT = _inject_sys_path(BUNDLE_ROOT)

if not SRC_ROOT.exists():
    raise FileNotFoundError(f"src directory missing under {BUNDLE_ROOT}")

# This import must come after the sys.path injection above.
from src import config
# Re-point the bundled config at the competition dataset and at a writable
# output directory.
# NOTE(review): these overrides only reach code that reads ``config.<NAME>`` at
# call time; modules that copied the values at import time would not see them
# — confirm against the bundle.
CONFIG_ROOT = Path("/kaggle/input/mitsui-commodity-prediction-challenge")
config.DATA_DIR = CONFIG_ROOT
config.TRAIN_PATH = CONFIG_ROOT / "train.csv"
config.TEST_PATH = CONFIG_ROOT / "test.csv"
config.TRAIN_LABELS_PATH = CONFIG_ROOT / "train_labels.csv"
config.TARGET_PAIRS_PATH = CONFIG_ROOT / "target_pairs.csv"
config.LAGGED_TEST_LABELS_DIR = CONFIG_ROOT / "lagged_test_labels"
config.OUTPUT_DIR = Path("/kaggle/working/artifacts")
config.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Copy the bundled feature artifacts into the output directory, where
# _load_precomputed_panel() looks for them.
FEATURE_SOURCE = BUNDLE_ROOT / "artifacts" / "features_v2"
FEATURE_DEST = config.OUTPUT_DIR / "features_v2"
if FEATURE_SOURCE.exists():
    FEATURE_DEST.mkdir(parents=True, exist_ok=True)
    for item in FEATURE_SOURCE.iterdir():
        dest = FEATURE_DEST / item.name
        if item.is_dir():
            # dirs_exist_ok lets the cell be re-run without failing on existing copies.
            shutil.copytree(item, dest, dirs_exist_ok=True)
        elif item.is_file():
            shutil.copy2(item, dest)
else:
    raise FileNotFoundError(f"Feature artifacts missing: {FEATURE_SOURCE}")


In [None]:
import json
import os

from typing import Dict

import lightgbm as lgb
import numpy as np
import pandas as pd
import polars as pl

from src.data.cleaning import fill_dataframe_with_trend
from src.data.loading import load_price_data
from src.features.online import compute_latest_features, load_online_context


class _SilentLogger:
    """Logger stand-in whose ``info`` and ``warning`` methods discard every message."""

    def info(self, msg: str) -> None:
        """Ignore an info-level message."""
        return None

    def warning(self, msg: str) -> None:
        """Ignore a warning-level message."""
        return None

# Route LightGBM's log output to a logger that drops every message.
lgb.register_logger(_SilentLogger())

# Prefer model artifacts from the unpacked working copy of the bundle; fall
# back to the artifacts that sit next to the resolved bundle root.
ARTIFACT_ROOT = Path("/kaggle/working/mitsui_lightgbm_bundle") / "artifacts" / "lightgbm_full"
if not ARTIFACT_ROOT.exists():
    ARTIFACT_ROOT = BUNDLE_ROOT / "artifacts" / "lightgbm_full"

# NOTE(review): NUM_TARGETS and COMPETITION_GO_LIVE_DATE are not referenced in
# the visible part of this notebook — confirm whether they are still needed.
NUM_TARGETS = 424
# Lower bound (inclusive) on date_id for rows kept by _load_precomputed_panel().
OFFLINE_EVAL_START_DATE_ID = 1827
COMPETITION_GO_LIVE_DATE = date(2025, 10, 7)
# Filled in after the precomputed panel is loaded; stay None when no panel exists.
PRECOMPUTED_MIN_DATE_ID = None
PRECOMPUTED_MAX_DATE_ID = None


def _load_precomputed_panel() -> pd.DataFrame | None:
    panel_candidates = [
        config.OUTPUT_DIR / "features_v2" / "all_train.parquet",
        config.OUTPUT_DIR / "features_v2" / "all_train.pkl",
        config.OUTPUT_DIR / "features_v2" / "all.pkl",
        config.OUTPUT_DIR / "features_v2" / "all_test.pkl",
    ]
    for candidate in panel_candidates:
        if candidate.exists():
            if candidate.suffix == ".parquet":
                panel = pd.read_parquet(candidate)
            else:
                panel = pd.read_pickle(candidate)
            if not panel.empty:
                panel.index = panel.index.astype(int)
                panel = panel[panel.index >= OFFLINE_EVAL_START_DATE_ID]
                return panel.sort_index()
    return None


# Load the precomputed feature panel once for the whole session; None means
# every prediction has to compute its features online.
PRECOMPUTED_PANEL = _load_precomputed_panel()
if PRECOMPUTED_PANEL is not None:
    # Re-applies the integer index and ordering and records the covered range.
    PRECOMPUTED_PANEL.index = PRECOMPUTED_PANEL.index.astype(int)
    PRECOMPUTED_PANEL = PRECOMPUTED_PANEL.sort_index()
    PRECOMPUTED_MIN_DATE_ID = int(PRECOMPUTED_PANEL.index.min())
    PRECOMPUTED_MAX_DATE_ID = int(PRECOMPUTED_PANEL.index.max())
# predict() switches this off the first time a requested date_id is not in the panel.
USE_PRECOMPUTED_PANEL = PRECOMPUTED_PANEL is not None

# Online feature state, created on first use by _ensure_online_state().
context = None
price_history_df = None


def _ensure_online_state() -> None:
    """Initialise the module-level online feature state if it is not ready yet.

    Loads the online context and the ``"train"`` price data, keeps the last
    row per ``date_id``, orders the rows by ``date_id`` and then applies the
    trend fill (``window=5``, skipping ``date_id`` and ``is_scored``). Does
    nothing when both ``context`` and ``price_history_df`` are already set.
    """
    global context, price_history_df
    if context is not None and price_history_df is not None:
        return

    context = load_online_context()
    deduped = load_price_data("train").drop_duplicates(subset='date_id', keep='last')
    price_history_df = deduped.sort_values('date_id')
    price_history_df = fill_dataframe_with_trend(
        price_history_df,
        window=5,
        skip_columns=['date_id', 'is_scored'],
    )


In [None]:
def _latest_run_dir(lag: int) -> Path:
    runs = sorted((ARTIFACT_ROOT / f"lag_{lag}").iterdir(), key=lambda p: p.stat().st_mtime)
    if not runs:
        raise FileNotFoundError(f"No trained runs found for lag {lag}")
    return runs[-1]


def _load_models() -> Dict[int, Dict[str, lgb.Booster]]:
    """Load every per-target LightGBM booster for lags 1 through 4.

    For each lag, the ``models`` folder of the latest run is scanned for
    ``target_*.txt`` files (in sorted order); each file becomes one booster
    keyed by its file stem.

    Returns:
        Mapping ``lag -> {model file stem -> booster}``.
    """
    boosters: Dict[int, Dict[str, lgb.Booster]] = {}
    for lag in range(1, 5):
        model_dir = _latest_run_dir(lag) / "models"
        boosters[lag] = {
            path.stem: lgb.Booster(model_file=str(path))
            for path in sorted(model_dir.glob("target_*.txt"))
        }
    return boosters


def _load_preprocessors() -> Dict[int, Dict[str, Dict[str, float] | list[str]]]:
    """Load the feature names and fill medians used for lags 1 through 4.

    One ``target_*.json`` payload per lag is read from the latest run's
    ``preprocessing`` folder and its ``feature_names`` and ``median`` entries
    are reused for every target of that lag. The payload is now chosen
    deterministically (smallest path) instead of whichever file the directory
    listing yields first, and a missing payload raises a descriptive error
    rather than a bare ``StopIteration``.
    NOTE(review): this assumes all targets of a lag share the same feature
    names and medians — confirm against the training code.

    Returns:
        Mapping ``lag -> {"feature_names": [...], "median": {...}}``.

    Raises:
        FileNotFoundError: when a lag has no ``target_*.json`` payload.
    """
    preprocessors: Dict[int, Dict[str, Dict[str, float] | list[str]]] = {}
    for lag in range(1, 5):
        prep_dir = _latest_run_dir(lag) / "preprocessing"
        sample_file = min(prep_dir.glob("target_*.json"), default=None)
        if sample_file is None:
            raise FileNotFoundError(
                f"No preprocessing payload found for lag {lag} under {prep_dir}"
            )
        payload = json.loads(sample_file.read_text(encoding="utf-8"))
        preprocessors[lag] = {
            "feature_names": payload["feature_names"],
            "median": payload["median"],
        }
    return preprocessors


# Load all models and preprocessing payloads once, when this cell runs.
BOOSTERS = _load_models()
PREPROCESSORS = _load_preprocessors()
# Target names per lag, taken from the loaded model file stems.
TARGETS_BY_LAG = {lag: sorted(models.keys()) for lag, models in BOOSTERS.items()}
# Union of all target names in plain string sort order; predict() uses this as
# its output column order.
ALL_TARGETS = sorted({t for names in TARGETS_BY_LAG.values() for t in names})


In [None]:
def _prepare_features(lag: int, feature_row: pd.DataFrame) -> pd.DataFrame:
    """Align ``feature_row`` to the feature layout stored for ``lag``.

    Columns are reindexed to the stored ``feature_names`` (extra columns are
    dropped, absent ones are added as NaN) and remaining NaNs are filled from
    the stored per-column ``median`` mapping.
    """
    spec = PREPROCESSORS[lag]
    reordered = feature_row.reindex(columns=spec["feature_names"])
    return reordered.fillna(spec["median"])


def _predict_lag(lag: int, features: pd.DataFrame) -> Dict[str, float]:
    """Run every booster registered for ``lag`` on ``features``.

    The frame is converted to a float32 array once and shared by all boosters;
    only the first prediction of each booster is kept.

    Returns:
        Mapping of target name to its predicted value as a Python float.
    """
    models = BOOSTERS[lag]
    matrix = features.to_numpy(dtype=np.float32, copy=False)
    return {
        name: float(model.predict(matrix, num_iteration=model.best_iteration)[0])
        for name, model in models.items()
    }


In [None]:
def predict(test: pl.DataFrame,
            label_lags_1_batch: pl.DataFrame,
            label_lags_2_batch: pl.DataFrame,
            label_lags_3_batch: pl.DataFrame,
            label_lags_4_batch: pl.DataFrame) -> pd.DataFrame:
    """Produce one row of predictions for the latest ``date_id`` in ``test``.

    Features come from the precomputed panel while it contains the requested
    date. The first time a date is missing, the panel is dropped for the rest
    of the session and features are computed online from the accumulated price
    history.

    Args:
        test: Batch of price rows; must contain a ``date_id`` column.
        label_lags_1_batch, label_lags_2_batch, label_lags_3_batch,
        label_lags_4_batch: Not used by this implementation; presumably
            required by the inference server's callback signature.

    Returns:
        A single-row frame with one column per entry of ``ALL_TARGETS``.
        Targets without a model keep the default value ``0.0``.
    """
    global price_history_df, PRECOMPUTED_PANEL, USE_PRECOMPUTED_PANEL, context

    test_pdf = test.to_pandas()
    latest_date = int(test_pdf['date_id'].max())

    feature_row = None
    if USE_PRECOMPUTED_PANEL and PRECOMPUTED_PANEL is not None:
        if latest_date in PRECOMPUTED_PANEL.index:
            feature_row = PRECOMPUTED_PANEL.loc[[latest_date]]
        else:
            # The panel no longer covers the requested dates: switch to online
            # features permanently and release the panel.
            USE_PRECOMPUTED_PANEL = False
            PRECOMPUTED_PANEL = None

    if feature_row is None:
        _ensure_online_state()
        # NOTE(review): batches that were served from the precomputed panel are
        # never appended to the history — confirm the training data covers them.
        price_history_df = pd.concat([price_history_df, test_pdf], ignore_index=True)
        # De-duplicate and order the history BEFORE the trend fill, matching
        # _ensure_online_state(); filling first would let superseded or
        # out-of-order rows influence the filled values.
        price_history_df = price_history_df.drop_duplicates(subset='date_id', keep='last').sort_values('date_id')
        price_history_df = fill_dataframe_with_trend(price_history_df, window=5, skip_columns=['date_id', 'is_scored'])
        feature_row = compute_latest_features(price_history_df, context)

    outputs: Dict[str, float] = dict.fromkeys(ALL_TARGETS, 0.0)
    for lag in range(1, 5):
        feats = _prepare_features(lag, feature_row)
        outputs.update(_predict_lag(lag, feats))

    row = [outputs[target] for target in ALL_TARGETS]
    return pd.DataFrame([row], columns=ALL_TARGETS)


In [None]:
import kaggle_evaluation.mitsui_inference_server

def _run_server() -> None:
    """Start the inference server around ``predict``.

    Serves requests when the ``KAGGLE_IS_COMPETITION_RERUN`` environment
    variable is set to a non-empty value; otherwise runs the local gateway
    against the competition data directory.
    """
    server = kaggle_evaluation.mitsui_inference_server.MitsuiInferenceServer(predict)
    if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
        server.serve()
        return
    server.run_local_gateway(('/kaggle/input/mitsui-commodity-prediction-challenge/',))

# Start serving (competition rerun) or run the local gateway (interactive run).
_run_server()

# If no submission file exists after the server call, write a single all-zero
# row with one column per known target.
# NOTE(review): presumably a safety net so the notebook always produces an
# output file — confirm this is the intended fallback.
submission_path = Path('submission.parquet')
if not submission_path.exists():
    placeholder = pd.DataFrame({name: [0.0] for name in ALL_TARGETS})
    placeholder.to_parquet(submission_path, index=False)
    print('Wrote placeholder submission to', submission_path)
