In [75]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Install InstaGeo

In [1]:
# Clone the InstaGeo-E2E-Geospatial-ML repository from GitHub
repository_url = "https://github.com/instadeepai/InstaGeo-E2E-Geospatial-ML"
!git clone {repository_url}

Cloning into 'InstaGeo-E2E-Geospatial-ML'...
remote: Enumerating objects: 374, done.[K
remote: Counting objects: 100% (212/212), done.[K
remote: Compressing objects: 100% (142/142), done.[K
remote: Total 374 (delta 127), reused 116 (delta 67), pack-reused 162 (from 1)[K
Receiving objects: 100% (374/374), 1.43 MiB | 3.29 MiB/s, done.
Resolving deltas: 100% (194/194), done.


In [2]:
%%bash
cd InstaGeo-E2E-Geospatial-ML
git pull

Already up to date.


In [79]:
%%capture
%%bash
# Navigate to the cloned InstaGeo-E2E-Geospatial-ML directory
cd /kaggle/working/InstaGeo-E2E-Geospatial-ML
# Stash any local changes to avoid conflicts when switching branches
git stash
#Switch to the 'geo-ai-hack' branch, which likely contains specific code for the Geo AI Hackathon
git checkout geo-ai-hack
# Install the InstaGeo package 
pip install -e .[all]

In [40]:
# Import necessary libraries
import os
import re
import shutil
import yaml
import pandas as pd
import numpy as np
from pathlib import Path
from pyproj import CRS, Transformer
import rasterio
os.environ["HYDRA_FULL_ERROR"] ="1"

In [41]:
from rasterio.transform import from_origin
from tqdm import tqdm
import torch 

def compute_ndvi(red_band, nir_band):
    """Compute NDVI using PyTorch tensors."""
    ndvi = (nir_band - red_band) / (nir_band + red_band + 1e-6)  # Avoid division by zero
    return torch.clamp(ndvi, min=-1.0, max=1.0)



def compute_mean_std_from_csv(csv_path):
    """
    Compute mean and standard deviation for the 7 spectral bands across all temporal dimensions (total 21 bands),
    using .tif file paths provided in the CSV file under the 'Input' column.

    Args:
        csv_path (str): Path to CSV file containing the list of .tif files.

    Returns:
        tuple: Mean and standard deviation for the 7 bands across 3 temporal slices.
    """
    sum_pixels = None
    sum_squared_pixels = None
    total_pixels = 0

    # Read CSV file
    df = pd.read_csv(csv_path)

    # Ensure the CSV has an "Input" column
    if "Input" not in df.columns:
        raise ValueError("CSV file must contain a column named 'Input' with .tif file paths.")

    file_paths = df["Input"].tolist()  # Extract file paths

    for file_path in tqdm(file_paths):
        if not os.path.exists(file_path):
            print(f"Skipping {file_path}: File not found.")
            continue

        with rasterio.open(file_path) as src:
            img = src.read()  # Shape: (21, H, W)
            img_tensor = torch.tensor(img, dtype=torch.float32)

            if img_tensor.shape[0] != 21:
                print(f"Skipping {file_path}: Expected 21 bands, found {img_tensor.shape[0]}")
                continue

            # Extract only the 7 bands from each temporal sequence
            selected_bands = torch.cat([img_tensor[:7], img_tensor[7:14], img_tensor[14:21]])

            if sum_pixels is None:
                sum_pixels = torch.zeros(7, dtype=torch.float32)
                sum_squared_pixels = torch.zeros(7, dtype=torch.float32)

            sum_pixels += selected_bands[:7].mean(dim=(1, 2))  # Compute mean across spatial dimensions
            sum_squared_pixels += (selected_bands[:7] ** 2).mean(dim=(1, 2))  # Compute squared mean
            total_pixels += 1

    if total_pixels == 0:
        raise ValueError("No valid files found in the provided CSV. Check file paths.")

    mean = sum_pixels / total_pixels
    std = torch.sqrt(sum_squared_pixels / total_pixels - mean ** 2)

    print("Computed Mean:", mean.tolist())
    print("Computed Std:", std.tolist())
    return mean.tolist(), std.tolist()

def add_ndvi_in_chips(input_dir, output_dir):
    """
    Process Sentinel-Landsat harmonized chips, compute NDVI, and save 21-band .tif files.
    
    Args:
        input_dir (str): Path to input directory containing .tif chips.
        output_dir (str): Path to save processed .tif files.
    """
    os.makedirs(output_dir, exist_ok=True)
    
    for file in tqdm(os.listdir(input_dir)):
        if not file.endswith(".tif"):
            continue
        
        input_path = os.path.join(input_dir, file)
        output_path = os.path.join(output_dir, file)
        
        with rasterio.open(input_path) as src:
            img = src.read()  # Shape: (18, H, W)
            meta = src.meta.copy()
            
            # Ensure we have exactly 18 bands
            if img.shape[0] != 18:
                print(f"Skipping {file}: Expected 18 bands, found {img.shape[0]}")
                continue
            
            H, W = img.shape[1], img.shape[2]
            img_tensor = torch.tensor(img, dtype=torch.float32)
            
            # Compute NDVI for each temporal slice (Bands 3=Red, Bands 4=NIR)
            ndvi_1 = compute_ndvi(img_tensor[3], img_tensor[4])
            ndvi_2 = compute_ndvi(img_tensor[9], img_tensor[10])
            ndvi_3 = compute_ndvi(img_tensor[15], img_tensor[16])
            
            # Ensure NDVI values are stored as float32
            ndvi_1 = ndvi_1.to(dtype=torch.float32)
            ndvi_2 = ndvi_2.to(dtype=torch.float32)
            ndvi_3 = ndvi_3.to(dtype=torch.float32)
            
            # Create new tensor with 21 bands
            img_new = torch.empty((21, H, W), dtype=torch.float32)
            img_new[:7] = torch.cat([img_tensor[:6], ndvi_1.unsqueeze(0)])  # First temporal
            img_new[7:14] = torch.cat([img_tensor[6:12], ndvi_2.unsqueeze(0)])  # Second temporal
            img_new[14:21] = torch.cat([img_tensor[12:18], ndvi_3.unsqueeze(0)])  # Third temporal
            
            # Convert back to numpy for saving
            img_new_np = img_new.numpy()
            meta.update({"count": 21, "dtype": 'float32'})
            
            with rasterio.open(output_path, "w", **meta) as dst:
                dst.write(img_new_np.astype(np.float32))



def filter_nodata_labels(input_csv, output_csv, remove_nan_observation=False, nodata_value=-9999):
    """
    Filters out rows where the corresponding label raster contains only -1 values.
    Optionally replaces all -1 values with a specified NoData value.

    Args:
        input_csv (str): Path to the input CSV file.
        output_csv (str): Path to save the filtered CSV file.
        remove_nan_observation (bool): If True, replaces -1 values in the label raster with NoData.
        nodata_value (int, optional): Value to use as NoData when replacing -1. Default is -9999.

    Returns:
        None (Saves the filtered CSV and prints stats).
    """
    df = pd.read_csv(input_csv)

    # Ensure the CSV has "Input" and "Label" columns
    if "Input" not in df.columns or "Label" not in df.columns:
        raise ValueError("CSV file must contain 'Input' and 'Label' columns with file paths.")

    original_count = len(df)
    nan_observations = 0 
    valid_rows = []

    for _, row in tqdm(df.iterrows(), total=original_count):
        label_path = row["Label"]
        
        if not os.path.exists(label_path):
            print(f"Skipping {label_path}: File not found.")
            continue
        
        with rasterio.open(label_path) as src:
            label_data = src.read(1)  # Assuming label is a single-band raster
            meta = src.meta.copy()

            unique_values = set(label_data.flatten())

            # Retain only if label contains at least one 0 or 1
            if 0 in unique_values or 1 in unique_values:
                valid_rows.append(row)

                # If remove_nan_observation is enabled, replace -1 with NoData
                if remove_nan_observation:
                    label_data[label_data == -1] = nodata_value
                    meta.update({"nodata": nodata_value})

                    # Save the modified label raster
                    with rasterio.open(label_path, "w", **meta) as dst:
                        dst.write(label_data, 1)
                    nan_observations+=1
                    
    filtered_count = len(valid_rows)
    removed_count = original_count - filtered_count

    # Save new CSV with valid rows
    new_df = pd.DataFrame(valid_rows)
    new_df.to_csv(output_csv, index=False)

    # Print statistics
    print(f"Total rows in original CSV: {original_count}")
    print(f"Rows removed (only contained -1 values): {removed_count}")
    print(f"Rows remaining after filtering: {filtered_count}")
    if remove_nan_observation:
        print(f"Nan observation removed: {nan_observations}")

    return filtered_count, removed_count

                

In [29]:
add_ndvi_in_chips("hls_train/hls_train/chips","hls_ndvi_train/chips")

100%|██████████████████████████████████████████████████████████| 10429/10429 [02:52<00:00, 60.55it/s]


In [30]:
add_ndvi_in_chips("hls_test/hls_test/chips","hls_ndvi_test/chips")

100%|████████████████████████████████████████████████████████████| 2405/2405 [01:02<00:00, 38.76it/s]


In [42]:
def generate_label_mapping(root_dir, input_subdir, output_csv):
    """
    Generate a CSV mapping input chips to corresponding segmentation maps.

    Args:
        root_dir (str or Path): Root directory containing the subdirectories for chips and segmentation maps.
        input_subdir (str): Subdirectory path for chips within the root directory.
        output_csv (str or Path): Output path for the generated CSV file.
    """
    root_dir = Path(root_dir)
    chips_orig = os.listdir(root_dir / input_subdir / "chips")
    if os.path.exists(root_dir / input_subdir / "seg_maps"):
        add_label = True
    else:
        add_label = False

    chips = [chip.replace("chip", f"{input_subdir}/chips/chip") for chip in chips_orig]

    if add_label:
        seg_maps = [chip.replace("chip", f"{input_subdir}/seg_maps/seg_map") for chip in chips_orig]
        df = pd.DataFrame({"Input": chips, "Label": seg_maps})
    else:
        df = pd.DataFrame({"Input": chips})
    df.to_csv(output_csv, index=False)
    
    print(f"Number of rows is: {df.shape[0]}")
    print(f"CSV generated and saved to: {output_csv}")

In [43]:
# set data folder path
import os 

current_path= os.getcwd() 

print(current_path)
# input_dir= os.path.join(current_path,"/kaggle/input/geo-ai-hack")
input_dir = current_path 

/home/krschap/foss/geoaihack2024/geo-ai-hack


In [16]:
# Generate label mappings for the training and testing datasets
generate_label_mapping(input_dir, 'hls_ndvi_train', "train_ndvi_ds.csv")
generate_label_mapping(input_dir, 'hls_ndvi_test', "test_ndvi_ds.csv")

Number of rows is: 10429
CSV generated and saved to: train_ndvi_ds.csv
Number of rows is: 2404
CSV generated and saved to: test_ndvi_ds.csv


In [38]:
filter_nodata_labels("train_ndvi_ds.csv","train_ndvi_ds_filter.csv")

 56%|████████████████████████████████▌                         | 5862/10429 [00:17<00:12, 358.56it/s]

Skipping hls_ndvi_train/seg_maps/seg_map_20200501_S30_T37NCC_2020094T073611_5_3.tif.aux.xml: File not found.


100%|█████████████████████████████████████████████████████████| 10429/10429 [00:30<00:00, 343.55it/s]

Total rows in original CSV: 10429
Rows removed (only contained -1 values): 1
Rows remaining after filtering: 10428





(10428, 1)

### Validation Set

In [44]:
def split_validation_data(mapping_csv, validation_split=0.3):
    """
    Split data into training and validation sets based on a CSV file mapping `chips` and `seg_maps`.

    Args:
        mapping_csv (str or Path): Path to the CSV file containing the mapping between `chips` and `seg_maps`.
        data_dir (str or Path): Path to the merged directory containing all files.
        validation_dir (str or Path): Path to the new directory for validation files.
        validation_split (float): Fraction of the data to use as the validation set.
    """
    df = pd.read_csv(mapping_csv)
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    num_val = int(len(df) * validation_split)
    train_df = df[num_val:]
    val_df = df[:num_val]
    train_df.to_csv("train_ndvi_split.csv",index=False)    
    print(f"CSV train split  saved to: train_ndvi_split.csv")
    val_df.to_csv("validation_ndvi_split.csv",index=False)    
    print(f"CSV validation split  saved to: validation_split.csv")
    
    return 
    

In [20]:
# Split the training dataset into training and validation sets
split_validation_data(
    mapping_csv="train_ndvi_ds.csv",
    validation_split=0.3
)

CSV train split  saved to: train_split.csv
CSV validation split  saved to: validation_split.csv


## InstaGeo - Model

After creating our training and validation splits, we can move on to fine-tuning a model that includes a Prithvi backbone paired with a classification head. For regression tasks, the classification head can easily be replaced with a suitable regression head. Additionally, if a completely different model architecture is needed, it can be designed and implemented within this framework.

In [45]:
def load_yml(filepath):
    """Load data from a YAML file.

    Args:
        filepath (str | Path): The path to the YAML file.

    Returns:
        Dict: The loaded data, or None if the file does not exist.
    """
    filepath=Path(filepath)
    with filepath.open() as f:
        return yaml.safe_load(f)
        
def save_yml(data,filepath):
    """Save data to a YAML file.

    Args:
        data (Dict): The data to save.
        filepath (str | Path): The file path to save the YAML to.
    """
    filepath = Path(filepath)
    with filepath.open("w") as f:
        yaml.dump(data, f)
    print(f"Data saved to {filepath}.")


### Launch Training


First, compute the mean and standard deviation for the dataset using the InstaGeo command. Then update the corresponding configuration file [locust.yaml](https://github.com/instadeepai/InstaGeo-E2E-Geospatial-ML/blob/main/instageo/model/configs/locust.yaml). In this case, it has already been done for you. However, if you change the dataset split or modify the training data, you should run the command again to compute the new mean and standard deviation.

In [None]:
compute_mean_std

In [22]:
%%bash
python -m instageo.model.run --config-name=locust-ndvi \
    root_dir="." \
    train.batch_size=8 \
    train.num_epochs=5 \
    mode=stats \
    train_filepath="train_ndvi_ds.csv" \

Seed set to 1042


[2025-02-04 23:34:12,035][__main__][INFO] - Script: /home/krschap/foss/geoaihack2024/geo-ai-hack/InstaGeo-E2E-Geospatial-ML/instageo/model/run.py
[2025-02-04 23:34:12,037][__main__][INFO] - Imported hydra config:
checkpoint_path: null
dataloader:
  bands:
  - 0
  - 1
  - 2
  - 3
  - 4
  - 5
  - 6
  - 7
  - 8
  - 9
  - 10
  - 11
  - 12
  - 13
- 14
  - 15
  - 16
  - 17
  - 18
  - 19
  - 20
  constant_multiplier: 1.0
  img_size: 256
  mean:
- 623.2724609375
  - 1247.657958984375
  - 1772.24169921875
  - 2371.256103515625
625 2862.867431640
  - 2357.759765625
  no_data_value: -9999
  reduce_to_zero: false
  replace_label:
  - -9999
 -1
  std:
  - 2182.050048828125
  - 2248.420654296875
  - 2302.53515625
  - 2372.204345703125
398.52685546875
  - 2292.96435546875
  temporal_dim: 3
mean:
- 670.5441284179688
5 1267.797485351562
- 1772.599365234375
- 2415.69091796875
- 2879.2431640625
- 2337.822509765625
mode: stats
model:
freeze_backbone: false
  num_classes: 2
output_dir: null
root_dir: .
std

Error executing job with overrides: ['root_dir=.', 'train.batch_size=8', 'train.num_epochs=5', 'mode=stats', 'train_filepath=train_ndvi_ds.csv']
Traceback (most recent call last):
y>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
rschap/foss/geoaihack2024/geo-ai-hack/InstaGeo-E2E-Geospatial-ML/instageo/model/run.py", line 749, in <module>
    main()
 line 94, in decorated_maingeoaihack2024/env/lib/python3.12/site-packages/hydra/main.py",
    _run_hydra(
.12/site-packages/hydra/_internal/utils.py", line 394, in _run_hydra
    _run_app(
chap/foss/geoaihack2024/env/lib/python3.12/site-packages/hydra/_internal/utils.py", line 457, in _run_app
    run_and_report(
hydra/_internal/utils.py", line 223, in run_and_reportn3.12/site-packages/
    raise ex
hack2024/env/lib/python3.12/site-packages/hydra/_internal/utils.py", line 220, in run_and_report
 return func()
           ^^^^^^
ckages/hydra/_internal/utils.py", line 458, in <lambda>3.12/site-pa
    lambda: 

CalledProcessError: Command 'b'python -m instageo.model.run --config-name=locust-ndvi \\\n    root_dir="." \\\n    train.batch_size=8 \\\n    train.num_epochs=5 \\\n    mode=stats \\\n    train_filepath="train_ndvi_ds.csv" \\\n'' returned non-zero exit status 1.

In [31]:
compute_mean_std_from_csv("train_ndvi_ds.csv")

 56%|████████████████████████████████▉                          | 5831/10429 [01:41<01:19, 58.11it/s]

Skipping hls_ndvi_train/chips/chip_20200501_S30_T37NCC_2020094T073611_5_3.tif.aux.xml: File not found.


100%|██████████████████████████████████████████████████████████| 10429/10429 [03:01<00:00, 57.53it/s]

Computed Mean: [695.5580444335938, 1293.4241943359375, 1792.435546875, 2443.89111328125, 2880.999267578125, 2321.597900390625, 0.062481388449668884]
Computed Std: [3357.875244140625, 3497.693359375, 3630.5771484375, 3794.192138671875, 3892.0703125, 3700.383544921875, 0.11115093529224396]





([695.5580444335938,
  1293.4241943359375,
  1792.435546875,
  2443.89111328125,
  2880.999267578125,
  2321.597900390625,
  0.062481388449668884],
 [3357.875244140625,
  3497.693359375,
  3630.5771484375,
  3794.192138671875,
  3892.0703125,
  3700.383544921875,
  0.11115093529224396])

In [46]:
# Updat locust file
# Load the Locust model configuration file
locust_cfg_path="InstaGeo-E2E-Geospatial-ML/instageo/model/configs/locust-ndvi.yaml"
# Load the YAML configuration into a dictionary
locust_cfg=load_yml(locust_cfg_path)
# Update the mean and standard deviation values in the configuration
locust_cfg["mean"]=[670.5441284179688, 1267.7974853515625, 1772.599365234375, 2415.69091796875, 2879.2431640625, 2337.822509765625,0.062481388449668884]
locust_cfg["std"]=[2146.305419921875, 2203.416259765625, 2247.03515625, 2310.74755859375, 2322.708984375, 2211.968505859375,0.11115093529224396]
# Save the updated configuration back to the YAML file
save_yml(locust_cfg,locust_cfg_path)

Data saved to InstaGeo-E2E-Geospatial-ML/instageo/model/configs/locust-ndvi.yaml.


In [47]:
import os 
cwd = os.getcwd() 
print(cwd)

/home/krschap/foss/geoaihack2024/geo-ai-hack


In [21]:
import torch

print(f"Is CUDA supported by this system? {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")

# Storing ID of current CUDA device
cuda_id = torch.cuda.current_device()
print(f"ID of current CUDA device:{torch.cuda.current_device()}")
	
print(f"Name of current CUDA device:{torch.cuda.get_device_name(cuda_id)}")


Is CUDA supported by this system? True
CUDA version: 12.4
ID of current CUDA device:0
Name of current CUDA device:NVIDIA GeForce RTX 4090 Laptop GPU


In [54]:
# Train the InstaGeo model using the Locust configuration
!python -m instageo.model.run  --config-name=locust-ndvi \
    hydra.run.dir=ndvi \
    root_dir="." \
    train.batch_size=10 \
    train.num_epochs=20 \
    mode=train \
    train_filepath="train_ndvi_split.csv" \
    valid_filepath="validation_ndvi_split.csv"

Seed set to 1042
[2025-02-05 08:28:10,197][__main__][INFO] - Script: /home/krschap/foss/geoaihack2024/geo-ai-hack/InstaGeo-E2E-Geospatial-ML/instageo/model/run.py
[2025-02-05 08:28:10,198][__main__][INFO] - Imported hydra config:
checkpoint_path: null
dataloader:
  bands:
  - 0
  - 1
  - 2
  - 3
  - 4
  - 5
  - 6
  - 7
  - 8
  - 9
  - 10
  - 11
  - 12
  - 13
  - 14
  - 15
  - 16
  - 17
  - 18
  - 19
  - 20
  constant_multiplier: 1.0
  img_size: 256
  mean:
  - 623.2724609375
  - 1247.657958984375
  - 1772.24169921875
  - 2371.256103515625
  - 2862.867431640625
  - 2357.759765625
  - 0.062481388449668884
  no_data_value: -9999
  reduce_to_zero: false
  replace_label:
  - -9999
  - -1
  std:
  - 2182.050048828125
  - 2248.420654296875
  - 2302.53515625
  - 2372.204345703125
  - 2398.52685546875
  - 2292.96435546875
  - 0.11115093529224396
  temporal_dim: 3
mean:
- 670.5441284179688
- 1267.7974853515625
- 1772.599365234375
- 2415.69091796875
- 2879.2431640625
- 2337.822509765625
- 0.06248

### Run Model Evaluation
 To evaluate the model, adjust the `checkpoint_path` argument to point to the desired model checkpoint. The checkpoint file is typically located in the `hydra.run.dir` directory and is named `instageo_best_checkpoint.ckpt`.
For example:
```
/kaggle/working/outputs/first_run/instageo_best_checkpoint.ckpt 
```
Make sure to provide the correct path to the checkpoint file based on your training output directory.

In [5]:
%%bash
python -m instageo.model.run --config-name=locust \
    root_dir="." \
    test_filepath="validation_split.csv" \
    train.batch_size=16 \
    checkpoint_path='cwd/instageo_best_checkpoint-v1.ckpt' \
    mode=eval

Seed set to 1042


[2025-02-04 18:48:40,713][__main__][INFO] - Script: /home/krschap/foss/geoaihack2024/geo-ai-hack/InstaGeo-E2E-Geospatial-ML/instageo/model/run.py
[2025-02-04 18:48:40,714][__main__][INFO] - Imported hydra config:
t_checkpoint-v1.ckpt/instageo_bes
dataloader:
  bands:
  - 0
  - 1
  - 2
  - 3
  - 4
  - 5
  - 6
  - 7
  - 8
9 - 
  - 10
  - 11
  - 12
  - 13
  - 14
  - 15
  - 16
  - 17
  constant_multiplier: 1.0
  img_size: 256
  mean:
  - 623.2724609375
  - 1247.657958984375
  - 1772.24169921875
  - 2371.256103515625
2.867431640625
  - 2357.759765625
  no_data_value: -9999
  reduce_to_zero: false
  replace_label:
- -9999
  - -1
  std:
  - 2182.050048828125
  - 2248.420654296875
  - 2302.53515625
03125372.2043457
  - 2398.52685546875
  - 2292.96435546875
  temporal_dim: 3
mean:
- 670.5441284179688
974853515625
- 1772.599365234375
- 2415.69091796875
- 2879.2431640625
- 2337.822509765625
mode: eval
model:
  freeze_backbone: false
  num_classes: 2
output_dir: null
root_dir: .
std:
875146.305419

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 4090 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/krschap/foss/geoaihack2024/env/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Testing DataLoader 0: 100%|██████████| 196/196 [01:03<00:00,  3.08it/s]
━━━━━━━━━━━━━━━━━━━━━━━┓━━━━┳━━━━
 metric       [0m[1m [0m┃[1m [0m[1m      DataLoader 0       [0m[1m [0m┃
━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━���━━━━━━━━━━━━━━━━━┩
[0m[36m [0m│[35m [0m[35m   0.8265830278396606    [0m[35m [0m│
t_Acc_1_epoch     [0m[36m [0m│[35m [0m[35m   0.6637557744979858    [0m[35m [0m│
m [0m[36m    test_IoU_0_epoch     [0m[36m [0m│[35m [0m[35m   0.6095579862594604    [0m[35m [0m│
0289001    [0m[35m [0m│oU_1_epoch     [0m[36m [0m│[35m [0m[35m   0.572863996
[35m   0.6977682709693909    [0m[35m [0m│m[36m [0m│[35m [0m
[0m│[35m [0m[35m   0.7771565318107605    [0m[35m [0m│
epoch   [0m[36m [0m│[35m [0m[35m   0.8265830278396606    [0m[35m [0m│
m   test_Recall_1_epoch   [0m[36m [0m│[35m [0m[35m   0.6637557744979858    [0m[35m [0m│
 [0m[35m [0m│     test_aAcc_epoch     [0m[36m [0m│[35m [0m[35m   0.7456339597702026   
1127

## Make Submission

We first run inference on test chips to get the predictions

In [8]:
%%bash
python -m instageo.model.run --config-name=locust \
    root_dir="." \
    test_filepath="test_ds.csv" \
    train.batch_size=16 \
    checkpoint_path='cwd/instageo_best_checkpoint-v1.ckpt' \
    output_dir='cwd/predictions' \
    mode=chip_inference

Seed set to 1042


[2025-02-04 18:53:25,152][__main__][INFO] - Script: /home/krschap/foss/geoaihack2024/geo-ai-hack/InstaGeo-E2E-Geospatial-ML/instageo/model/run.py
[2025-02-04 18:53:25,154][__main__][INFO] - Imported hydra config:
t_checkpoint-v1.ckpt/instageo_bes
dataloader:
  bands:
  - 0
  - 1
  - 2
  - 3
  - 4
  - 5
  - 6
  - 7
  - 8
9 - 
  - 10
  - 11
  - 12
  - 13
  - 14
  - 15
  - 16
  - 17
  constant_multiplier: 1.0
  img_size: 256
  mean:
  - 623.2724609375
  - 1247.657958984375
  - 1772.24169921875
  - 2371.256103515625
2.867431640625
  - 2357.759765625
  no_data_value: -9999
  reduce_to_zero: false
  replace_label:
- -9999
  - -1
  std:
  - 2182.050048828125
  - 2248.420654296875
  - 2302.53515625
03125372.2043457
  - 2398.52685546875
  - 2292.96435546875
  temporal_dim: 3
mean:
- 670.5441284179688
974853515625
- 1772.599365234375
- 2415.69091796875
- 2879.2431640625
- 2337.822509765625
_inference
model:
  freeze_backbone: false
  num_classes: 2
output_dir: cwd/predictions
root_dir: .
std:
- 

Running Inference: 100%|██████████| 151/151 [00:58<00:00,  2.57it/s]


After getting the prdictions for each chip, we retrieve the predicted value for each observatio in our test split.

In [6]:
import os 
predictions_directory = "cwd/predictions"
prediction_files = os.listdir(predictions_directory)

def get_prediction_value(row):
    matching_files = [f for f in prediction_files if (str(row['date']) in f) and (row['mgrs_tile_id'] in f)]
    if not matching_files:
        return (np.nan, np.nan)
    for file in matching_files:
        with rasterio.open(f"{predictions_directory}/{file}") as src:
            width, height = src.width, src.height
            affine_transform = rasterio.transform.AffineTransformer(src.transform)
            transformer = Transformer.from_crs(CRS.from_epsg(4326), src.crs, always_xy=True)
            x_chip, y_chip = transformer.transform(row['x'], row['y'])
            x_offset, y_offset = affine_transform.rowcol(x_chip, y_chip)
            
            if 0 <= x_offset < width and 0 <= y_offset < height:
                return src.read(1)[y_offset, x_offset], file
    return (np.nan, np.nan)

In [22]:
!pip install numpy 



In [2]:
import pandas as pd 

In [8]:
submission_df = pd.read_csv("test.csv")

print(submission_df)

submission_df[['prediction', 'filename']] = submission_df.apply(get_prediction_value, axis=1, result_type='expand')
submission_df[["id","prediction"]].to_csv("hls_submission.csv",index=False)

           id          x          y      date mgrs_tile_id
0        ID_0  43.331667   7.276944  20210501        38NLP
1        ID_1  44.902500  15.438056  20210601        38PMC
2        ID_2  44.429167  16.330833  20210601        38QMD
3        ID_3  65.844167  29.278611  20210201        41RQN
4        ID_4  38.735556  17.148056  20210101        37QDU
...       ...        ...        ...       ...          ...
3213  ID_3213  45.377500   7.028333  20211001        38NNN
3214  ID_3214  43.135833  15.619444  20210201        38PLC
3215  ID_3215  57.328611  25.895278  20210101        40REP
3216  ID_3216  62.626667  25.923611  20210101        41RMJ
3217  ID_3217  47.930278  14.035556  20210301        38PRA

[3218 rows x 5 columns]
