# s2pipe – local launcher (Step 2: preprocess)

This notebook is intended for running Step 2 preprocessing locally on the output of Step 1 (download), i.e. the Step‑1 `index.json` manifest.

What the notebook does:
- loads the Step‑1 manifest (`index.json`),
- runs the Step‑2 orchestrator (`run_preprocess`) for a limited number of pairs,
- verifies outputs: `x.tif`, `y.tif`, `angles.tif` (coarse grid, optional), `meta.json`, Step‑2 `index.json`, and the run manifest (`run=<id>.jsonl`).

Notes:
- Normalization is not implemented yet, so the configuration uses `normalize.mode="none"`.
- Angles are **not** appended into `x.tif`. When enabled, they are written to a separate `angles.tif` on the native coarse angle grid (typically ~23×23).


## 1) Environment setup and project discovery

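The notebook assumes you run it from inside the `s2pipe` repository (the root or any subdirectory); the project root is located by searching upwards for `pyproject.toml`. If the `s2pipe` package cannot be imported, install it in editable mode with `pip install -e .`, or rely on the `src/` path fallback in the cell below.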


In [None]:
from __future__ import annotations

import sys
from pathlib import Path

# Locate the project root: the nearest directory, starting at the current
# working directory and moving upwards, that contains pyproject.toml. When no
# such directory exists the search ends at the filesystem root.
_search_path = [Path.cwd().resolve()]
_search_path.extend(_search_path[0].parents)

PROJECT_ROOT = _search_path[-1]
for _candidate in _search_path:
    if (_candidate / "pyproject.toml").exists():
        PROJECT_ROOT = _candidate
        break

print("PROJECT_ROOT:", PROJECT_ROOT)

# If the s2pipe imports fail, uncomment the following line:
# !pip install -e .

# Alternative without installation: put src/ at the front of sys.path
# (only when the directory exists and is not already listed).
src_dir = PROJECT_ROOT / "src"
_src_entry = str(src_dir)
if src_dir.exists() and _src_entry not in sys.path:
    sys.path.insert(0, _src_entry)
print("sys.path[0]:", sys.path[0])

## 2) Configure paths to Step‑1 index.json and the output directory

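By default, this notebook looks for the Step‑1 index at `out/meta/step1/index.json` under the project root.
To use the test fixture instead, point `INDEX_PATH` at `tests/fixtures/single_tile/meta/step1/index.json`; if you use your own Step‑1 dataset, set `INDEX_PATH` manually.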


In [None]:
from pathlib import Path

# Default location of the Step-1 index: <PROJECT_ROOT>/out/meta/step1/index.json
default_index = PROJECT_ROOT / "out" / "meta" / "step1" / "index.json"

# Or set it manually, e.g.:
# default_index = Path("/abs/path/to/your/step1/root/meta/step1/index.json")

INDEX_PATH = default_index
print("INDEX_PATH:", INDEX_PATH)
# Raise explicitly instead of using assert: asserts are stripped under
# `python -O`, and the following cells all read this file.
if not INDEX_PATH.exists():
    raise FileNotFoundError(f"index.json does not exist: {INDEX_PATH}")

# DATA_ROOT = dataset root, i.e. the directory three levels above the index
# file (<DATA_ROOT>/meta/step1/index.json).
DATA_ROOT = INDEX_PATH.parents[2]
print("DATA_ROOT:", DATA_ROOT)

# Step-2 writes to DATA_ROOT/processed and DATA_ROOT/meta/step2
OUT_DIR = DATA_ROOT

# Run id for this run
RUN_ID = "local"

## 3) Inspect the Step‑1 index and select the first pair


In [None]:
from s2pipe.preprocess.inputs import load_download_index

# Parse the Step-1 manifest; the rest of the notebook needs at least one pair.
index = load_download_index(INDEX_PATH)
n_pairs = len(index.pairs)
print(f"pairs: {n_pairs}")
assert n_pairs > 0

# Keep the first pair around: it identifies the sample checked after the run.
pair0 = index.pairs[0]
print(f"tile: {pair0.tile_id!s}")
print(f"sensing: {pair0.sensing_start_utc!s}")

## 4) Step‑2 configuration

To keep the run lightweight, the default configuration is:
- `target_grid_ref = "scl_20m"`
- `l1c_bands = ("B01",)`
- normalization disabled
- angles enabled by default (toggle below)

The target grid is chosen from the available reference grids (SCL or an L1C band):
`target_grid_ref` may be `"scl_20m"` or a band name (`"B01"`, `"B02"`, etc.).
For real training you will typically keep `target_grid_ref="scl_20m"` and add more bands.
When angles are enabled, they are exported to **`angles.tif`** on the **coarse angle grid** (typically ~23×23), with channel layout `4 + 4*B` for `view_mode="per_band"`. Note that the cell below uses `view_mode="single"`.


In [None]:
from s2pipe.preprocess.cfg import (
    PreprocessConfig,
    AngleAssetConfig,
    NormalizeConfig,
    LabelConfig,
)

# Angle export settings. `enabled` is the on/off toggle; when on, angles.tif is
# written on a coarse grid and x.tif does NOT grow.
angles_cfg = AngleAssetConfig(
    # switch + output file
    enabled=True,
    output_name="angles.tif",
    # which angle families to export and how to encode them
    include_sun=True,
    include_view=True,
    encode="sin_cos",
    # view-angle handling
    view_mode="single",
    view_bands=(),
    detector_aggregate="nanmean",
)

# Sub-configurations are built first so the main config reads as a flat list.
labels_cfg = LabelConfig()
normalize_cfg = NormalizeConfig(mode="none")

cfg = PreprocessConfig(
    # inputs / outputs
    index_json=INDEX_PATH,
    out_dir=OUT_DIR,
    run_id=RUN_ID,
    # lightweight run: a single pair, a single band
    max_pairs=1,
    l1c_bands=("B01",),
    target_grid_ref="scl_20m",
    # assets
    angles=angles_cfg,
    labels=labels_cfg,
    normalize=normalize_cfg,
)

# Bare expression so the notebook displays the assembled configuration.
cfg

## 5) Run Step‑2 (`run_preprocess`)


In [None]:
from s2pipe.preprocess.run import run_preprocess

# Run the Step-2 orchestrator with the configuration held in `cfg`.
result = run_preprocess(cfg)
# Bare expression: the notebook renders the returned object as the cell output.
result

## 6) Verify outputs and perform a quick inspection


In [None]:
import json
import rasterio

# --- Step-2 index ------------------------------------------------------------
assert result.step2_index_path is not None, "run_preprocess returned no Step-2 index path"
step2_index = json.loads(Path(result.step2_index_path).read_text(encoding="utf-8"))
samples = step2_index.get("samples", [])
print("Step-2 samples:", len(samples))


def _is_pair0(entry):
    """Return True when an index entry's key matches the first Step-1 pair."""
    key = entry.get("key", {})
    return (
        key.get("tile_id") == pair0.tile_id
        and key.get("sensing_start_utc") == pair0.sensing_start_utc
    )


# Find the record for the first pair
sample = next((s for s in samples if _is_pair0(s)), None)

assert sample is not None, "Sample not found in meta/step2/index.json"
print("status:", sample.get("status"))
print("paths:", sample.get("paths"))

# --- Output files --------------------------------------------------------------
paths = sample["paths"]
x_path = OUT_DIR / paths["x"]
y_path = OUT_DIR / paths["y"]
m_path = OUT_DIR / paths["meta"]
# "angles" is optional. A missing key, a null value and an empty string all
# mean "no angles asset": OUT_DIR / None would raise TypeError, and
# OUT_DIR / "" is OUT_DIR itself, which would pass the existence check below
# without any angles file being present.
angles_rel = paths.get("angles")
angles_path = OUT_DIR / angles_rel if angles_rel else None

print("x:", x_path)
print("y:", y_path)
print("meta:", m_path)
print("angles:", angles_path)

# Check each output separately so a failure names the file that is missing.
for label, out_path in (
    ("x", x_path),
    ("y", y_path),
    ("meta", m_path),
    ("angles", angles_path),
):
    if out_path is not None:
        assert out_path.exists(), f"missing Step-2 output '{label}': {out_path}"

# --- Per-sample metadata -------------------------------------------------------
meta = json.loads(m_path.read_text(encoding="utf-8"))
channels = meta.get("channels", {})
print("meta.schema:", meta.get("schema"))
print("meta.key:", meta.get("key"))
print("meta.channels.x:", channels.get("x"))
print("meta.channels.y:", channels.get("y"))

if angles_path is not None:
    # angles are stored separately (coarse grid)
    meta_angles = meta.get("paths", {}).get("angles")
    assert meta_angles == "angles.tif", (
        f"meta.json paths.angles is {meta_angles!r}, expected 'angles.tif'"
    )
    a_info = meta.get("angles", {})
    print("angles.channels:", len(a_info.get("channels", [])))
    print("angles.grid:", a_info.get("grid", {}))

    with rasterio.open(angles_path) as ds:
        a = ds.read()  # (C,H,W)
        print("angles.tif shape:", a.shape, "dtype:", a.dtype)
        assert ds.count == a.shape[0]
        # typically ~23x23; anything above 64 is not a coarse angle grid
        assert ds.width <= 64 and ds.height <= 64, (
            f"angles.tif is {ds.width}x{ds.height}, expected a coarse grid (<= 64x64)"
        )

### 6.1) Load and visualize one X channel and Y


In [None]:
import matplotlib.pyplot as plt

def _show_raster(title, data):
    """Draw one 2-D array in its own figure with a colorbar and display it."""
    plt.figure()
    plt.title(title)
    plt.imshow(data, interpolation="nearest")
    plt.colorbar()
    plt.show()


# All X bands are read; for Y only band 1 is needed.
with rasterio.open(x_path) as src:
    x = src.read()  # (C,H,W)
print("X shape:", x.shape, "dtype:", x.dtype)

with rasterio.open(y_path) as src:
    y = src.read(1)  # (H,W)
print("Y shape:", y.shape, "dtype:", y.dtype)

_show_raster("X[0] (e.g., B01)", x[0])
_show_raster("Y (labels)", y)

# The angles asset is optional, so it is plotted only when the file is there.
has_angles = angles_path is not None and angles_path.exists()
if has_angles:
    with rasterio.open(angles_path) as src:
        a = src.read()  # (C,Hc,Wc)
    print("Angles shape:", a.shape, "dtype:", a.dtype)

    # First angles channel as a sanity check (typically sun_zen_sin)
    _show_raster("Angles[0] (coarse grid)", a[0])

## 7) Recommendations for a real run

A typical configuration for SCL-driven training (20 m) may look like this:

```python
cfg = PreprocessConfig(
    index_json=INDEX_PATH,
    out_dir=OUT_DIR,
    run_id="train20m",
    max_pairs=10,
    target_grid_ref="scl_20m",
    l1c_bands=("B02","B03","B04","B08","B11","B12"),
    angles=AngleAssetConfig(
        enabled=True,
        include_sun=True,
        include_view=True,
        encode="sin_cos",
        view_mode="per_band",
        view_bands=("B02","B03","B04","B08","B11","B12"),
        detector_aggregate="nanmean",
        output_name="angles.tif",
    ),
    labels=LabelConfig(),
    normalize=NormalizeConfig(mode="none"),
)
result = run_preprocess(cfg)
```

When angles are enabled, Step‑2 exports them as a separate **`angles.tif`** on the **coarse angle grid** (typically ~23×23), with `C = 4 + 4*B` channels for `view_mode="per_band"`.

Possible later additions: parallelization (`num_workers`) for performance tuning, and normalization (not implemented yet, which is why the configuration uses `normalize.mode="none"`).
