
# NZ → China Export Forecasting with PMI (Broader Sector Definition)

This notebook:
- Uses the SAME dataset
- Broadens HS sector mapping (Meat/Dairy/Fruit/Forestry)
- Adds diagnostics to confirm non-zero sector values
- Runs PMI ablation safely

Run top-to-bottom.


In [1]:
from pathlib import Path
import re
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

# -------------------------------
# Paths & config
# -------------------------------
IN_DIR = Path("data")
OUT_DIR = Path("outputs")
OUT_DIR.mkdir(exist_ok=True)

PMI_FILE = "MPI.csv"

# -------------------------------
# Broader sector mapping (FIX)
# -------------------------------
SECTOR_MAP = {
    "02": "Meat",
    "16": "Meat",
    "04": "Dairy",
    "19": "Dairy",
    "08": "Fruit",
    "20": "Fruit",
    "44": "Forestry",
    "47": "Forestry",
    "48": "Forestry",
}
SECTORS = ["Dairy", "Meat", "Fruit", "Forestry"]

def to_chapter(x):
    if pd.isna(x):
        return np.nan
    s = "".join(ch for ch in str(x) if ch.isdigit())
    return s[:2].zfill(2) if s else np.nan

def parse_month(x):
    s = str(x).strip()
    if re.fullmatch(r"\d{6}", s):
        return pd.to_datetime(s + "01", format="%Y%m%d", errors="coerce")
    return pd.to_datetime(x, errors="coerce")

# -------------------------------
# Load export data
# -------------------------------
files = [p for p in IN_DIR.glob("*.csv") if p.name.lower() != PMI_FILE.lower()]
if not files:
    raise FileNotFoundError(f"No export CSV files found in {IN_DIR.resolve()}")

raw = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)

print("[INFO] Loaded export rows:", len(raw))
print("[INFO] Columns:", list(raw.columns))

# Parse month (FIXED: no to_timestamp('MS'))
raw["Month"] = raw["month"].apply(parse_month)
raw = raw.dropna(subset=["Month"])
raw["Month"] = pd.to_datetime(raw["Month"]).dt.to_period("M").dt.to_timestamp(how="start")

# China filter (robust)
raw["country"] = raw["country"].astype(str).str.strip()
china_mask = raw["country"].str.contains("china", case=False, na=False, regex=False)
df = raw[china_mask].copy()

# -------------------------------
# Diagnostics (IMPORTANT)
# -------------------------------
df["HS_Chapter"] = df["hs"].apply(to_chapter)

# Use total_export_FOB if present, otherwise fallback to Export_FOB
export_col = "total_export_FOB" if "total_export_FOB" in df.columns else "Export_FOB"
df["Total_Exports_NZD"] = pd.to_numeric(df[export_col], errors="coerce").fillna(0)

print("\n[DIAG] China rows after filter:", len(df))
print("[DIAG] Top HS chapters for China by value:")
print(
    df.groupby("HS_Chapter")["Total_Exports_NZD"]
      .sum()
      .sort_values(ascending=False)
      .head(20)
      .to_string()
)

print("\n[DIAG] Sector chapter totals (only mapped chapters):")
mapped_chapters = sorted(SECTOR_MAP.keys())
print(
    df[df["HS_Chapter"].isin(mapped_chapters)]
      .groupby("HS_Chapter")["Total_Exports_NZD"]
      .sum()
      .sort_values(ascending=False)
      .to_string()
)

# -------------------------------
# Aggregate to sectors
# -------------------------------
df["Sector"] = df["HS_Chapter"].map(SECTOR_MAP)
df = df[df["Sector"].isin(SECTORS)].copy()

long_df = df.groupby(["Month", "Sector"], as_index=False)["Total_Exports_NZD"].sum()
wide_df = long_df.pivot(index="Month", columns="Sector", values="Total_Exports_NZD").fillna(0).sort_index()

print("\n[CHECK] Sector non-zero months:")
for s in SECTORS:
    if s in wide_df.columns:
        print(f" - {s}: {(wide_df[s] > 0).sum()}/{len(wide_df)}  | total={wide_df[s].sum():,.0f}")
    else:
        print(f" - {s}: column missing (no mapped data)")

wide_df.to_csv(OUT_DIR / "omt_hs10_china_aggregated_wide.csv")
print(f"\n[OK] wrote WIDE: {(OUT_DIR / 'omt_hs10_china_aggregated_wide.csv').resolve()}")

# -------------------------------
# Load PMI (optional for now)
# -------------------------------
pmi_path = IN_DIR / PMI_FILE
if pmi_path.exists():
    pmi = pd.read_csv(pmi_path)
    pmi["date"] = pd.to_datetime(pmi.iloc[:, 0], errors="coerce")
    pmi["pmi"] = pd.to_numeric(pmi.iloc[:, 1], errors="coerce")
    pmi = pmi.dropna().set_index("date")["pmi"].resample("MS").mean().interpolate()

    print("\n[OK] PMI loaded:", len(pmi), "points")
    print("[OK] PMI range:", pmi.index.min().date(), "→", pmi.index.max().date())
else:
    print(f"\n[WARN] PMI file not found at {pmi_path.resolve()}. Skipping PMI load.")

print("\n[READY] Data prepared successfully with broader sector mapping.")


[INFO] Loaded export rows: 1299078
[INFO] Columns: ['month', 'hs', 'hs_desc', 'uom', 'country', 'Export_FOB', 'Export_Qty', 'Re_export_FOB', 'Re_export_Qty', 'total_export_FOB', 'total_export_qty', 'status']

[DIAG] China rows after filter: 35448
[DIAG] Top HS chapters for China by value:
HS_Chapter
40    32074608097
20    15629648718
44    15333062950
19     6265096496
30     3679793132
81     3476885107
35     2750839332
29     2137184533
50     1539815499
47     1370848635
21     1231445216
10     1093135822
80      985658432
51      869874786
23      708224178
41      497405764
90      357478620
27      346518464
22      300723813
84      238756423

[DIAG] Sector chapter totals (only mapped chapters):
HS_Chapter
20    15629648718
44    15333062950
19     6265096496
47     1370848635
48      155064329
16       11115124

[CHECK] Sector non-zero months:
 - Dairy: 60/60  | total=6,265,096,496
 - Meat: 43/60  | total=11,115,124
 - Fruit: 60/60  | total=15,629,648,718
 - Forestry: 60/60 