In [9]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Stratified random sampling of hold-out test years:
- Sample 1 year per "decade block" (e.g., 1960-1969, 1970-1979, ...)
- Treat 2020-2024 as ONE final block (not 2020s decade)
- Python 3.9 compatible

Outputs:
- Printed mapping: block -> sampled year
- Printed TEST_YEARS (sorted list)
- Optional: write to .txt and .json
"""

import json
import random
from collections import OrderedDict


# =========================
# User config
# =========================
# Inclusive range of candidate years.
YEAR_START = 1960
YEAR_END = 2024

# Seed handed to the sampler so the draw is reproducible.
RANDOM_SEED = 42

# The final block is 2020-2024 instead of a full 2020-2029 decade.
LAST_BLOCK_START, LAST_BLOCK_END = 2020, 2024

# Years that must never be drawn (e.g., known problematic years).
EXCLUDE_YEARS = set()  # e.g., {1960, 1961}

# Set to False to only print the result and skip writing the report files.
WRITE_FILES = True

# Both report files share one stem that records the year range and the seed.
_OUT_STEM = f"stratified_test_years_{YEAR_START}_{YEAR_END}_seed{RANDOM_SEED}"
OUT_TXT = _OUT_STEM + ".txt"
OUT_JSON = _OUT_STEM + ".json"


# =========================
# Core logic
# =========================
def build_year_blocks(year_start, year_end,
                      last_block_start=2020, last_block_end=2024,
                      exclude_years=None):
    """Group the years ``year_start..year_end`` (inclusive) into labelled blocks.

    Years inside ``last_block_start..last_block_end`` share one special block
    labelled ``"<last_block_start>-<last_block_end>"``. Every other year goes
    into its calendar decade, labelled ``"<decade>-<decade + 9>"`` (the label
    always names the full decade, even if only part of it is in range).

    Args:
        year_start: First candidate year (converted with ``int``).
        year_end: Last candidate year, inclusive (converted with ``int``).
        last_block_start: First year of the special final block.
        last_block_end: Last year of the special final block, inclusive.
        exclude_years: Optional iterable of years to leave out entirely.

    Returns:
        An ``OrderedDict`` mapping block label -> list of years in ascending
        order, with blocks ordered by the start year parsed from the label.
        Blocks whose years are all excluded do not appear.
    """
    skipped = set(exclude_years) if exclude_years else set()
    special_label = f"{last_block_start}-{last_block_end}"
    grouped = {}

    for year in range(int(year_start), int(year_end) + 1):
        if year in skipped:
            continue

        if last_block_start <= year <= last_block_end:
            label = special_label
        else:
            decade = year - year % 10
            label = f"{decade}-{decade + 9}"

        if label not in grouped:
            grouped[label] = []
        grouped[label].append(year)

    # Insertion order is not guaranteed to be sorted when the special block
    # straddles a decade, so order explicitly by the label's leading year.
    ordered = OrderedDict()
    for label in sorted(grouped, key=lambda text: int(text.partition("-")[0])):
        ordered[label] = grouped[label]
    return ordered


def sample_one_year_per_block(blocks, seed=42):
    """Draw one year from every non-empty block using a seeded generator.

    Args:
        blocks: Mapping of block label -> sequence of candidate years. Blocks
            are visited in the mapping's iteration order, which determines
            the order in which random draws are consumed.
        seed: Seed for a private ``random.Random`` instance (converted with
            ``int``), so the module-level random state is left untouched.

    Returns:
        A tuple ``(sampled, test_years)`` where ``sampled`` is an
        ``OrderedDict`` of block label -> chosen year (empty blocks are
        skipped) and ``test_years`` is the ascending list of chosen years.
    """
    generator = random.Random(int(seed))

    # One draw per non-empty block, consumed in iteration order.
    picks = OrderedDict(
        (label, generator.choice(candidates))
        for label, candidates in blocks.items()
        if candidates
    )

    held_out = list(picks.values())
    held_out.sort()
    return picks, held_out


def main():
    """Sample one hold-out year per block, print the result, optionally save it.

    Reads the module-level configuration, prints the block -> year mapping and
    the sorted ``TEST_YEARS`` list, and, when ``WRITE_FILES`` is true, writes
    the same information to ``OUT_TXT`` (plain text) and ``OUT_JSON``.
    """
    year_blocks = build_year_blocks(
        YEAR_START,
        YEAR_END,
        last_block_start=LAST_BLOCK_START,
        last_block_end=LAST_BLOCK_END,
        exclude_years=EXCLUDE_YEARS,
    )
    block_to_year, held_out = sample_one_year_per_block(year_blocks, seed=RANDOM_SEED)

    print("=== Stratified hold-out test years (1 per block) ===")
    for label, year in block_to_year.items():
        print(f"{label}: {year}")

    print("\nTEST_YEARS =", held_out)

    if not WRITE_FILES:
        return

    # Assemble the text report first, then write it in one go.
    report = [
        "Stratified hold-out test years (1 per block)\n",
        f"YEAR_START={YEAR_START}, YEAR_END={YEAR_END}, SEED={RANDOM_SEED}\n",
        f"LAST_BLOCK={LAST_BLOCK_START}-{LAST_BLOCK_END}\n",
    ]
    if EXCLUDE_YEARS:
        report.append(f"EXCLUDE_YEARS={sorted(EXCLUDE_YEARS)}\n")
    report.append("\nblock -> year\n")
    report.extend(f"{label}\t{year}\n" for label, year in block_to_year.items())
    report.append("\nTEST_YEARS\n")
    report.append(",".join(str(year) for year in held_out) + "\n")

    with open(OUT_TXT, "w", encoding="utf-8") as txt_file:
        txt_file.writelines(report)

    # The JSON file records the configuration alongside the sampled years.
    payload = {
        "year_start": YEAR_START,
        "year_end": YEAR_END,
        "seed": RANDOM_SEED,
        "last_block": [LAST_BLOCK_START, LAST_BLOCK_END],
        "exclude_years": sorted(EXCLUDE_YEARS),
        "block_to_year": dict(block_to_year),
        "test_years": held_out,
    }
    with open(OUT_JSON, "w", encoding="utf-8") as json_file:
        json.dump(payload, json_file, ensure_ascii=False, indent=2)

    print(f"\nWrote: {OUT_TXT}")
    print(f"Wrote: {OUT_JSON}")

# Run the sampling only when this file is executed as a script; importing it
# as a module must not print anything or write files.
if __name__ == "__main__":
    main()


=== Stratified hold-out test years (1 per block) ===
1960-1969: 1961
1970-1979: 1970
1980-1989: 1984
1990-1999: 1993
2000-2009: 2003
2010-2019: 2012
2020-2024: 2020

TEST_YEARS = [1961, 1970, 1984, 1993, 2003, 2012, 2020]

Wrote: stratified_test_years_1960_2024_seed42.txt
Wrote: stratified_test_years_1960_2024_seed42.json
