# # China Real Estate Demand Prediction — Pipeline Skeleton
#
# **Last generated:** 2025-10-06 07:47 UTC
#
# This notebook/script is a lean, safe baseline skeleton tailored to the Kaggle
# competition *"Real Estate Demand Prediction"*.
#
# ### What you get
# - Robust **IO setup** targeting Kaggle input paths
# - **Custom competition metric** (two-stage MAPE-based)
# - **Leakage-safe** rolling time-grouped cross-validation
# - Minimal **feature factory** with lags/rollings (extend here)
# - **Naïve** baseline (strong Stage-1 shield)
# - **LightGBM Tweedie** model scaffold + optional XGB/CatBoost hooks
# - **Blending + clipping** for metric safety
# - **Submission writer** preserving `test.csv` row order
#
# > Notes:
# > - Extend features in the marked sections. Keep temporal embargo ≥ max lag.
# > - If the official epsilon/edge-case handling differs, update the metric function accordingly.
#
# ---

In [3]:
import os, gc, sys, math, warnings, itertools, json
from dataclasses import dataclass
from typing import List, Tuple, Dict, Optional

import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

try:
    import lightgbm as lgb
except Exception as e:
    lgb = None

try:
    import xgboost as xgb
except Exception as e:
    xgb = None

try:
    from catboost import CatBoostRegressor, Pool as CatPool
except Exception as e:
    CatBoostRegressor, CatPool = None, None

In [4]:
class CFG:
    """
    Central, class-level configuration for the pipeline (never instantiated here;
    all values are read as `CFG.<name>`).
    """

    # --- reproducibility / validation ---
    seed = 42
    n_folds = 5
    embargo_months = 3             # keep >= the largest lag window used in features
    use_target_lags = True         # True -> recursive inference fills the target lags
    use_tweedie = True             # flag for the main LightGBM objective

    # --- LightGBM hyper-parameters ---
    lgb_params = {
        "objective": "tweedie",
        "tweedie_variance_power": 1.3,  # tune within 1.3~1.7
        "metric": "mae",
        "num_leaves": 63,
        "max_depth": 8,
        "learning_rate": 0.05,
        "feature_fraction": 0.6,
        "bagging_fraction": 0.8,
        "bagging_freq": 1,
        "min_data_in_leaf": 128,
        "lambda_l1": 0.0,
        "lambda_l2": 5.0,
        "n_estimators": 3000,
        "verbose": -1,
        "random_state": seed,  # same value as CFG.seed above
    }

    # --- locations ---
    data_dir = "/kaggle/input/china-real-estate-demand-prediction"
    out_dir  = "/kaggle/working"

    # --- column names ---
    target_col = "amount_new_house_transactions"  # name of the train target column
    id_col     = "id"
    month_col  = "month"
    sector_col = "sector"
    sector_int_col = "sector_int"

    # --- input files, as paths relative to data_dir ---
    files = {
        "new_house": "train/new_house_transactions.csv",
        "new_house_nb": "train/new_house_transactions_nearby_sectors.csv",
        "pre_owned": "train/pre_owned_house_transactions.csv",
        "pre_owned_nb": "train/pre_owned_house_transactions_nearby_sectors.csv",
        "land": "train/land_transactions.csv",
        "land_nb": "train/land_transactions_nearby_sectors.csv",
        "poi": "train/sector_POI.csv",
        "search": "train/city_search_index.csv",
        "city": "train/city_indexes.csv",
        "test": "test.csv",
        "sample": "sample_submission.csv",
    }

In [None]:
def set_seed(seed:int=42):
    """
    Seed Python's `random` module and NumPy's global RNG with the same value.
    """
    import random as _stdlib_random

    # Same order as before: stdlib generator first, then NumPy.
    for seeder in (_stdlib_random.seed, np.random.seed):
        seeder(seed)

def month_to_timestamp(m: str) -> pd.Timestamp:
    """
    Parse a month string such as '2019 Jan', '2019-Jan', '2019-01' or 'Jan 2019'
    into a Timestamp snapped to the end of that month.

    The explicit formats are tried in order; if none matches, pandas' own
    format inference is used as a last resort, and any error it raises
    propagates to the caller.
    """
    # MonthEnd(0) rolls a date forward to its month end and leaves a date that
    # is already a month end untouched.
    month_end = pd.offsets.MonthEnd(0)

    # '%Y-%b' covers the hyphenated abbreviated form ('2019-Jan') explicitly,
    # instead of relying on the inference fallback below.
    for fmt in ("%Y %b", "%Y-%b", "%Y-%m", "%b %Y"):
        try:
            return pd.to_datetime(m, format=fmt) + month_end
        except (ValueError, TypeError):
            # Format mismatch (or an un-addable parse result): try the next one.
            continue

    # fallback: let pandas infer the format
    return pd.to_datetime(m) + month_end

def parse_sector(val) -> int:
    """
    Convert 'sector 3' or 3 -> int 3.

    Always returns a plain Python int. Missing values, empty strings and
    strings without a usable number all map to the sentinel -1.
    """
    if pd.isna(val):
        return -1
    if isinstance(val, (int, np.integer)):
        return int(val)

    # Treat the word 'sector' and the separators '_' / '-' as whitespace so the
    # numeric part stands alone as a token. Note that '-' is a separator here,
    # so a string such as '-1' yields 1.
    text = str(val).strip().lower()
    for token in ("sector", "_", "-"):
        text = text.replace(token, " ")
    parts = text.split()
    if not parts:
        return -1

    # Prefer the last pure-integer token. isdecimal() accepts exactly the
    # characters int() can convert.
    integer_tokens = [p for p in parts if p.isdecimal()]
    if integer_tokens:
        return int(integer_tokens[-1])

    # Otherwise accept a float-looking last token such as '3.0'.
    try:
        return int(float(parts[-1]))
    except (ValueError, OverflowError):
        # Non-numeric text, or 'nan' / 'inf', which cannot become an int.
        return -1

def ensure_cols(df: pd.DataFrame, cols: List[str]):
    """
    Add every column named in `cols` that `df` lacks, filled with NaN.

    The frame is modified in place and also returned; columns that already
    exist are left untouched.
    """
    for name in cols:
        if name in df.columns:
            continue
        df[name] = np.nan
    return df

def reduce_mem(df: pd.DataFrame) -> pd.DataFrame:
    """
    Downcast float and integer columns via `pd.to_numeric(..., downcast=...)`.

    Columns of any other dtype are skipped. The frame is modified in place
    and also returned.
    """
    is_float = pd.api.types.is_float_dtype
    is_integer = pd.api.types.is_integer_dtype

    for name in df.columns:
        if is_float(df[name]):
            target = "float"
        elif is_integer(df[name]):
            target = "integer"
        else:
            continue
        df[name] = pd.to_numeric(df[name], downcast=target)
    return df


In [None]:
def load_csv(path: str, dtype=None) -> pd.DataFrame:
    """
    Read the CSV at `path` (relative to `CFG.data_dir`) into a DataFrame.

    If the file does not exist, a warning is printed and an empty DataFrame
    is returned instead of raising.
    """
    csv_path = os.path.join(CFG.data_dir, path)
    if os.path.exists(csv_path):
        return pd.read_csv(csv_path, dtype=dtype)

    print(f"[WARN] Missing file: {csv_path}")
    return pd.DataFrame()

def load_all():
    """
    Load every file listed in `CFG.files` with `load_csv`.

    Returns a dict keyed like `CFG.files`, each value being the DataFrame
    that `load_csv` returned for that entry.
    """
    return {key: load_csv(rel_path) for key, rel_path in CFG.files.items()}

In [None]:
def normalize_month_sector(df: pd.DataFrame) -> pd.DataFrame:
    """
    Standardize month -> Timestamp month-end; sector -> int16 `sector_int`.

    Operates on a copy of `df` and returns it. Handled layouts:
      * a month column: parsed with `month_to_timestamp`;
      * no month column but an id column shaped like 'YYYY Mon_sector n':
        month and sector are split out of the id;
      * a sector column: parsed with `parse_sector`;
      * no sector column, but month values that carry a '_sector n' suffix
        (e.g. '2019 Jan_sector 3'): the suffix is peeled off and used as the
        sector for those rows;
      * otherwise the sector is set to -1.

    A frame with none of these columns (including an empty frame) only gains
    the integer sector column filled with -1; no month column is created.
    """
    df = df.copy()

    # Positional mask and parsed sector values recovered from concatenated
    # month strings; both stay None when nothing was embedded.
    embedded_rows = None
    embedded_sector = None

    if CFG.month_col in df.columns:
        raw_month = df[CFG.month_col].astype(str).str.strip()
        if CFG.sector_col not in df.columns:
            # This must run BEFORE the month is parsed: a value such as
            # '2019 Jan_sector 3' is not a parseable month on its own.
            found = raw_month.str.contains("_sector", regex=False, na=False)
            if found.any():
                pieces = raw_month[found].str.split("_sector", n=1, expand=True)
                # Assign plain arrays so the writes are positional and do not
                # depend on index alignment.
                raw_month.loc[found] = pieces[0].str.strip().to_numpy()
                embedded_rows = found.to_numpy()
                embedded_sector = pieces[1].apply(parse_sector).to_numpy(dtype="int16")
        df[CFG.month_col] = raw_month.apply(month_to_timestamp)
    elif CFG.id_col in df.columns:
        # Try to split from 'id' pattern 'YYYY Mon_sector n'
        tmp = df[CFG.id_col].astype(str).str.split("_sector", n=1, expand=True)
        df[CFG.month_col] = tmp[0].str.strip().apply(month_to_timestamp)
        df[CFG.sector_col] = "sector " + tmp[1].str.strip()

    if CFG.sector_col in df.columns:
        df[CFG.sector_int_col] = df[CFG.sector_col].apply(parse_sector).astype("int16")
    else:
        # Default every row to the -1 sentinel, then overwrite the rows whose
        # sector was embedded in the month string. Built as int16 so the
        # column dtype matches the branch above.
        sector_values = np.full(len(df), -1, dtype="int16")
        if embedded_rows is not None:
            sector_values[embedded_rows] = embedded_sector
        df[CFG.sector_int_col] = sector_values
    return df