### Make Configs

First, we generate a config file with paths to all the raw data. Each entry of the resulting yaml file specifies the source satellite image and the shapefiles over which to create masks.

In [None]:
from pathlib import Path

# Root directory that holds every raw input referenced in this notebook.
data_dir = Path("/datadrive/glaciers/")

# All shapefiles live side by side in one sub-directory of the data root.
vector_dir = data_dir / "vector_data"
glaciers_file = vector_dir / "Glacier_2005.shp"
clean_g_file = vector_dir / "clean.shp"
debris_g_file = vector_dir / "debris.shp"
border_file = vector_dir / "hkh.shp"

# Directory whose entries are listed as the source images.
input_folder = data_dir / "unique_tiles"

In [None]:
# Number of input tiles to keep; the slice below takes the first SUBSET entries.
SUBSET = 3 # set to False to run on full data

# Path.iterdir() yields entries in arbitrary order, so sort the listing to make
# both the selected subset and the mask_<i> numbering reproducible across runs.
input_paths = sorted(Path(input_folder).iterdir())
if SUBSET is not False:
    input_paths = input_paths[:SUBSET]

# One config entry per input: the image, the shapefiles to build masks from,
# and the border shapefile. A fresh "mask_paths" list is built for each entry.
paths = {
    f"mask_{i}": {
        "img_path": str(f),
        "mask_paths": [str(s) for s in (glaciers_file, clean_g_file, debris_g_file)],
        "border_path": str(border_file),
    }
    for i, f in enumerate(input_paths)
}

In [None]:
import yaml

# Destination of the generated masking configuration.
conf_file = data_dir / "conf/masking_paths.yaml"

# open(..., 'w') does not create missing directories, so make sure the
# config directory exists before writing.
conf_file.parent.mkdir(parents=True, exist_ok=True)

# sort_keys=False keeps the entries in insertion order (mask_0, mask_1, ...).
with open(conf_file, 'w') as f:
    yaml.dump(paths, f, default_flow_style=False, sort_keys=False)

### Masking

Given the configuration file `masking_paths.yaml`, we can create numpy masks that are aligned with the underlying numpy images.

In [None]:
from glacier_mapping.data.mask import generate_masks
import shutil

# Load the masking configuration; the context manager guarantees the file
# handle is closed once parsing is done.
with open(conf_file) as f:
    masking_paths = yaml.safe_load(f)

# Split the per-entry fields into parallel lists, one element per config entry.
img_paths = [p["img_path"] for p in masking_paths.values()]
mask_paths = [p["mask_paths"] for p in masking_paths.values()]
border_paths = [p["border_path"] for p in masking_paths.values()]
out_dir = Path(data_dir / "processed_exper/masks")

# Delete the output directory left by a previous run, if there is one.
if out_dir.exists():
    shutil.rmtree(out_dir)

generate_masks(img_paths, mask_paths, border_paths=border_paths, out_dir=out_dir)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load one generated mask and its border array from the masks directory.
mask_dir = data_dir / "processed_exper" / "masks"
mask = np.load(mask_dir / "mask_01.npy")
border = np.load(mask_dir / "border_01.npy")

# One row of four panels: the first three mask channels, then the border.
# NOTE(review): channels presumably follow the order of "mask_paths"
# (glaciers, clean, debris) -- confirm against generate_masks.
_, ax = plt.subplots(1, 4, figsize=(15, 15))
for channel in range(3):
    ax[channel].imshow(mask[:, :, channel])
ax[3].imshow(border)

# Preview the metadata table stored alongside the masks.
mask_df = pd.read_csv(mask_dir / "mask_metadata.csv")
mask_df.head()

### Patches

Once we have binary masks associated with each image, we can slice them into 512 x 512 patches to be used for training.

In [None]:
from glacier_mapping.data.slice import write_pair_slices

processed_dir = data_dir / "processed_exper"
# Table with one row per image; the "img", "mask" and "border" columns are read below.
paths = pd.read_csv(processed_dir / "masks" / "mask_metadata.csv")
output_dir = processed_dir / "patches"
output_dir.mkdir(parents=True, exist_ok=True)

# Slice every image/mask pair and collect the metadata returned for each one.
n_tiffs = len(paths)
metadata = []
for position, (label, record) in enumerate(paths.iterrows(), start=1):
    print(f"## Slicing tiff {position}/{n_tiffs} ...")
    metadata.append(
        write_pair_slices(
            record["img"],
            record["mask"],
            output_dir,
            border_path=record["border"],
            out_base=f"patch_{label}",
        )
    )

# Stack the per-image metadata and write it out as a single GeoJSON file.
metadata = pd.concat(metadata, axis=0)
out_path = output_dir / "patches.geojson"
metadata.to_file(out_path, index=False, driver="GeoJSON")

In [None]:
from glacier_mapping.data.slice import plot_slices
# Plot patches from the directory written by the slicing step.
# NOTE(review): n_cols presumably sets the grid width and div a display
# scaling factor -- confirm against plot_slices.
plot_slices(Path(processed_dir, "patches"), n_cols=4, div=300)

### Split and filter

Now that we have sliced each tiff into small patches, we can determine which to use for training, validation, and testing. We first filter away those patches that have relatively little glacier, then we randomly shuffle them into train, dev, and test directories.

In [None]:
from addict import Dict
import geopandas as gpd
import glacier_mapping.data.process_slices_funs as pf
import yaml

processed_dir = data_dir / "processed_exper"
output_dir = processed_dir / "patches"

# Load the post-processing options; the context manager closes the file
# handle, and addict's Dict gives attribute-style access to the settings.
with open(data_dir / "conf/postprocess.yaml", "r") as f:
    pconf = Dict(yaml.safe_load(f))
slice_meta = gpd.read_file(output_dir / "patches.geojson")

# filter all the slices to the ones that matter
print("filtering")
keep_ids = pf.filter_directory(
    slice_meta,
    filter_perc=pconf.filter_percentage,
    filter_channel=pconf.filter_channel,
)

# validation: get ids for the ones that will be training vs. testing.
print("reshuffling")
# The first key under `split_method` names the splitting function in `pf`;
# fail with a clear message instead of an opaque lookup error when it is absent.
if not pconf.split_method:
    raise ValueError("postprocess.yaml must define a non-empty `split_method` entry")
split_method = next(iter(pconf.split_method))
split_ratio = pconf.split_method[split_method].split_ratio
split_fun = getattr(pf, split_method)
split_ids = split_fun(keep_ids, split_ratio, slice_meta=slice_meta)
target_locs = pf.reshuffle(split_ids, processed_dir)


For future reference, it's useful to save which patches went into which split. These will be contained in the `target_locs.pickle` file below.

In [None]:
# save target_locs
import pickle
# Serialize the split assignment next to the processed data, using the
# highest pickle protocol this interpreter supports.
target_locs_file = processed_dir / "target_locs.pickle"
with target_locs_file.open("wb") as handle:
    pickle.dump(target_locs, handle, protocol=pickle.HIGHEST_PROTOCOL)
