In [75]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Install InstaGeo

In [1]:
# Clone the InstaGeo-E2E-Geospatial-ML repository from GitHub
repository_url = "https://github.com/instadeepai/InstaGeo-E2E-Geospatial-ML"
!git clone {repository_url}

Cloning into 'InstaGeo-E2E-Geospatial-ML'...
remote: Enumerating objects: 374, done.[K
remote: Counting objects: 100% (212/212), done.[K
remote: Compressing objects: 100% (142/142), done.[K
remote: Total 374 (delta 127), reused 116 (delta 67), pack-reused 162 (from 1)[K
Receiving objects: 100% (374/374), 1.43 MiB | 3.29 MiB/s, done.
Resolving deltas: 100% (194/194), done.


In [2]:
%%bash
cd InstaGeo-E2E-Geospatial-ML
git pull

Already up to date.


In [79]:
%%capture
%%bash
# Navigate to the cloned InstaGeo-E2E-Geospatial-ML directory
cd /kaggle/working/InstaGeo-E2E-Geospatial-ML
# Stash any local changes to avoid conflicts when switching branches
git stash
#Switch to the 'geo-ai-hack' branch, which likely contains specific code for the Geo AI Hackathon
git checkout geo-ai-hack
# Install the InstaGeo package 
pip install -e .[all]

In [9]:
# Import necessary libraries
import os
import re
import shutil
import yaml
import pandas as pd
import numpy as np
from pathlib import Path
from pyproj import CRS, Transformer
import rasterio
os.environ["HYDRA_FULL_ERROR"] ="1"

In [14]:
def generate_label_mapping(root_dir, input_subdir, output_csv):
    """
    Generate a CSV mapping input chips to corresponding segmentation maps.

    Args:
        root_dir (str or Path): Root directory containing the subdirectories for chips and segmentation maps.
        input_subdir (str): Subdirectory path for chips within the root directory.
        output_csv (str or Path): Output path for the generated CSV file.
    """
    root_dir = Path(root_dir)
    chips_orig = os.listdir(root_dir / input_subdir / "chips")
    if os.path.exists(root_dir / input_subdir / "seg_maps"):
        add_label = True
    else:
        add_label = False

    chips = [chip.replace("chip", f"{input_subdir}/chips/chip") for chip in chips_orig]

    if add_label:
        seg_maps = [chip.replace("chip", f"{input_subdir}/seg_maps/seg_map") for chip in chips_orig]
        df = pd.DataFrame({"Input": chips, "Label": seg_maps})
    else:
        df = pd.DataFrame({"Input": chips})
    df.to_csv(output_csv, index=False)
    
    print(f"Number of rows is: {df.shape[0]}")
    print(f"CSV generated and saved to: {output_csv}")

In [15]:
# set data folder path
import os 

current_path= os.getcwd() 

print(current_path)
# input_dir= os.path.join(current_path,"/kaggle/input/geo-ai-hack")
input_dir = current_path 

/home/krschap/foss/geoaihack2024/geo-ai-hack


In [16]:
# Generate label mappings for the training and testing datasets
generate_label_mapping(input_dir, 'hls_train/hls_train', "train_ds.csv")
generate_label_mapping(input_dir, 'hls_test/hls_test', "test_ds.csv")

Number of rows is: 10429
CSV generated and saved to: train_ds.csv
Number of rows is: 2405
CSV generated and saved to: test_ds.csv


### Validation Set

In [10]:
def split_validation_data(mapping_csv, validation_split=0.3):
    """
    Split data into training and validation sets based on a CSV file mapping `chips` and `seg_maps`.

    Args:
        mapping_csv (str or Path): Path to the CSV file containing the mapping between `chips` and `seg_maps`.
        data_dir (str or Path): Path to the merged directory containing all files.
        validation_dir (str or Path): Path to the new directory for validation files.
        validation_split (float): Fraction of the data to use as the validation set.
    """
    df = pd.read_csv(mapping_csv)
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    num_val = int(len(df) * validation_split)
    train_df = df[num_val:]
    val_df = df[:num_val]
    train_df.to_csv("train_split.csv",index=False)    
    print(f"CSV train split  saved to: train_split.csv")
    val_df.to_csv("validation_split.csv",index=False)    
    print(f"CSV validation split  saved to: validation_split.csv")
    
    return 
    

In [11]:
# Split the training dataset into training and validation sets
split_validation_data(
    mapping_csv="train_ds.csv",
    validation_split=0.3
)

CSV train split  saved to: train_split.csv
CSV validation split  saved to: validation_split.csv


## InstaGeo - Model

After creating our training and validation splits, we can move on to fine-tuning a model that includes a Prithvi backbone paired with a classification head. For regression tasks, the classification head can easily be replaced with a suitable regression head. Additionally, if a completely different model architecture is needed, it can be designed and implemented within this framework.

In [19]:
def load_yml(filepath):
    """Load data from a YAML file.

    Args:
        filepath (str | Path): The path to the YAML file.

    Returns:
        Dict: The loaded data, or None if the file does not exist.
    """
    filepath=Path(filepath)
    with filepath.open() as f:
        return yaml.safe_load(f)
        
def save_yml(data,filepath):
    """Save data to a YAML file.

    Args:
        data (Dict): The data to save.
        filepath (str | Path): The file path to save the YAML to.
    """
    filepath = Path(filepath)
    with filepath.open("w") as f:
        yaml.dump(data, f)
    print(f"Data saved to {filepath}.")


### Launch Training


First, compute the mean and standard deviation for the dataset using the InstaGeo command. Then update the corresponding configuration file [locust.yaml](https://github.com/instadeepai/InstaGeo-E2E-Geospatial-ML/blob/main/instageo/model/configs/locust.yaml). In this case, it has already been done for you. However, if you change the dataset split or modify the training data, you should run the command again to compute the new mean and standard deviation.

In [20]:
# %%bash
# python -m instageo.model.run --config-name=locust \
#     root_dir="/kaggle/input/geo-ai-hack" \
#     train.batch_size=8 \
#     train.num_epochs=5 \
#     mode=stats \
#     train_filepath="train_ds.csv" \

In [21]:
# Updat locust file
# Load the Locust model configuration file
locust_cfg_path="InstaGeo-E2E-Geospatial-ML/instageo/model/configs/locust.yaml"
# Load the YAML configuration into a dictionary
locust_cfg=load_yml(locust_cfg_path)
# Update the mean and standard deviation values in the configuration
locust_cfg["mean"]=[670.5441284179688, 1267.7974853515625, 1772.599365234375, 2415.69091796875, 2879.2431640625, 2337.822509765625]
locust_cfg["std"]=[2146.305419921875, 2203.416259765625, 2247.03515625, 2310.74755859375, 2322.708984375, 2211.968505859375]
# Save the updated configuration back to the YAML file
save_yml(locust_cfg,locust_cfg_path)

Data saved to InstaGeo-E2E-Geospatial-ML/instageo/model/configs/locust.yaml.


In [20]:
import os 
cwd = os.getcwd() 
print(cwd)

/home/krschap/foss/geoaihack2024/geo-ai-hack


In [21]:
import torch

print(f"Is CUDA supported by this system? {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")

# Storing ID of current CUDA device
cuda_id = torch.cuda.current_device()
print(f"ID of current CUDA device:{torch.cuda.current_device()}")
	
print(f"Name of current CUDA device:{torch.cuda.get_device_name(cuda_id)}")


Is CUDA supported by this system? True
CUDA version: 12.4
ID of current CUDA device:0
Name of current CUDA device:NVIDIA GeForce RTX 4090 Laptop GPU


In [27]:
# Train the InstaGeo model using the Locust configuration
!python -m instageo.model.run  --config-name=locust \
    hydra.run.dir=cwd_15 \
    root_dir="." \
    train.batch_size=10 \
    train.num_epochs=20 \
    mode=train \
    train_filepath="train_split.csv" \
    valid_filepath="validation_split.csv"

Seed set to 1042
[2025-02-05 09:11:17,809][__main__][INFO] - Script: /home/krschap/foss/geoaihack2024/geo-ai-hack/InstaGeo-E2E-Geospatial-ML/instageo/model/run.py
[2025-02-05 09:11:17,810][__main__][INFO] - Imported hydra config:
checkpoint_path: null
dataloader:
  bands:
  - 0
  - 1
  - 2
  - 3
  - 4
  - 5
  - 6
  - 7
  - 8
  - 9
  - 10
  - 11
  - 12
  - 13
  - 14
  - 15
  - 16
  - 17
  constant_multiplier: 1.0
  img_size: 256
  mean:
  - 623.2724609375
  - 1247.657958984375
  - 1772.24169921875
  - 2371.256103515625
  - 2862.867431640625
  - 2357.759765625
  no_data_value: -9999
  reduce_to_zero: false
  replace_label:
  - -9999
  - -1
  std:
  - 2182.050048828125
  - 2248.420654296875
  - 2302.53515625
  - 2372.204345703125
  - 2398.52685546875
  - 2292.96435546875
  temporal_dim: 3
mean:
- 670.5441284179688
- 1267.7974853515625
- 1772.599365234375
- 2415.69091796875
- 2879.2431640625
- 2337.822509765625
mode: train
model:
  freeze_backbone: false
  num_classes: 2
output_dir: null
r

### Run Model Evaluation
 To evaluate the model, adjust the `checkpoint_path` argument to point to the desired model checkpoint. The checkpoint file is typically located in the `hydra.run.dir` directory and is named `instageo_best_checkpoint.ckpt`.
For example:
```
/kaggle/working/outputs/first_run/instageo_best_checkpoint.ckpt 
```
Make sure to provide the correct path to the checkpoint file based on your training output directory.

In [28]:
%%bash
python -m instageo.model.run --config-name=locust \
    root_dir="." \
    test_filepath="validation_split.csv" \
    train.batch_size=16 \
    checkpoint_path='cwd_15/instageo_best_checkpoint-v1.ckpt' \
    mode=eval

Seed set to 1042


[2025-02-05 11:52:53,190][__main__][INFO] - Script: /home/krschap/foss/geoaihack2024/geo-ai-hack/InstaGeo-E2E-Geospatial-ML/instageo/model/run.py
[2025-02-05 11:52:53,191][__main__][INFO] - Imported hydra config:
best_checkpoint-v1.ckpt/instageo_
dataloader:
  bands:
  - 0
  - 1
  - 2
  - 3
  - 4
  - 5
  - 6
  - 7
  - 8
 - 9
  - 10
  - 11
  - 12
  - 13
  - 14
  - 15
  - 16
  - 17
  constant_multiplier: 1.0
256mg_size: 
  mean:
  - 623.2724609375
  - 1247.657958984375
  - 1772.24169921875
  - 2371.256103515625
2862.867431640625
  - 2357.759765625
  no_data_value: -9999
  reduce_to_zero: false
  replace_label:
  - -9999
  - -1
  std:
  - 2182.050048828125
  - 2248.420654296875
  - 2302.53515625
45703125.2043
  - 2398.52685546875
  - 2292.96435546875
  temporal_dim: 3
mean:
- 670.5441284179688
7.7974853515625
- 1772.599365234375
- 2415.69091796875
- 2879.2431640625
- 2337.822509765625
vale: e
model:
  freeze_backbone: false
  num_classes: 2
output_dir: null
root_dir: .
std:
921875.305419


ights_path, map_location="cpu")


[2025-02-05 11:52:58,933][root][INFO] - GPU is available. Using GPU...


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 4090 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/krschap/foss/geoaihack2024/env/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Testing DataLoader 0: 100%|██████████| 196/196 [00:55<00:00,  3.51it/s]
━━━━━━━━━━━━━━━━━━━━━━━┓━━━━┳━━━━
 metric       [0m[1m [0m┃[1m [0m[1m      DataLoader 0       [0m[1m [0m┃
━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━���━━━━━━━━━━━━━━━━━┩
[0m[36m [0m│[35m [0m[35m   0.8306564092636108    [0m[35m [0m│
t_Acc_1_epoch     [0m[36m [0m│[35m [0m[35m   0.7949793934822083    [0m[35m [0m│
m [0m[36m    test_IoU_0_epoch     [0m[36m [0m│[35m [0m[35m   0.6850273609161377    [0m[35m [0m│
3095703    [0m[35m [0m│oU_1_epoch     [0m[36m [0m│[35m [0m[35m   0.686998367
[35m   0.7983395457267761    [0m[35m [0m│m[36m [0m│[35m [0m
[0m│[35m [0m[35m   0.8196732401847839    [0m[35m [0m│
epoch   [0m[36m [0m│[35m [0m[35m   0.8306564092636108    [0m[35m [0m│
m   test_Recall_1_epoch   [0m[36m [0m│[35m [0m[35m   0.7949793934822083    [0m[35m [0m│
 [0m[35m [0m│     test_aAcc_epoch     [0m[36m [0m│[35m [0m[35m   0.8183152079582214   
2126

## Make Submission

We first run inference on test chips to get the predictions

In [29]:
%%bash
python -m instageo.model.run --config-name=locust \
    root_dir="." \
    test_filepath="test_ds.csv" \
    train.batch_size=16 \
    checkpoint_path='cwd_15/instageo_best_checkpoint-v1.ckpt' \
    output_dir='cwd_15/predictions' \
    mode=chip_inference

Seed set to 1042


[2025-02-05 11:54:05,835][__main__][INFO] - Script: /home/krschap/foss/geoaihack2024/geo-ai-hack/InstaGeo-E2E-Geospatial-ML/instageo/model/run.py
[2025-02-05 11:54:05,836][__main__][INFO] - Imported hydra config:
best_checkpoint-v1.ckpt/instageo_
dataloader:
  bands:
  - 0
  - 1
  - 2
  - 3
  - 4
  - 5
  - 6
  - 7
  - 8
 - 9
  - 10
  - 11
  - 12
  - 13
  - 14
  - 15
  - 16
  - 17
  constant_multiplier: 1.0
256mg_size: 
  mean:
  - 623.2724609375
  - 1247.657958984375
  - 1772.24169921875
  - 2371.256103515625
2862.867431640625
  - 2357.759765625
  no_data_value: -9999
  reduce_to_zero: false
  replace_label:
  - -9999
  - -1
  std:
  - 2182.050048828125
  - 2248.420654296875
  - 2302.53515625
45703125.2043
  - 2398.52685546875
  - 2292.96435546875
  temporal_dim: 3
mean:
- 670.5441284179688
7.7974853515625
- 1772.599365234375
- 2415.69091796875
- 2879.2431640625
- 2337.822509765625
hip_inference
model:
  freeze_backbone: false
  num_classes: 2
output_dir: cwd_15/predictions
ir: .d
std:

ights_path, map_location="cpu")


[2025-02-05 11:54:10,021][root][INFO] - GPU is available. Using GPU...


Running Inference: 100%|██████████| 151/151 [00:53<00:00,  2.82it/s]


After getting the prdictions for each chip, we retrieve the predicted value for each observatio in our test split.

In [30]:
import os 
predictions_directory = "cwd_15/predictions"
prediction_files = os.listdir(predictions_directory)

def get_prediction_value(row):
    matching_files = [f for f in prediction_files if (str(row['date']) in f) and (row['mgrs_tile_id'] in f)]
    if not matching_files:
        return (np.nan, np.nan)
    for file in matching_files:
        with rasterio.open(f"{predictions_directory}/{file}") as src:
            width, height = src.width, src.height
            affine_transform = rasterio.transform.AffineTransformer(src.transform)
            transformer = Transformer.from_crs(CRS.from_epsg(4326), src.crs, always_xy=True)
            x_chip, y_chip = transformer.transform(row['x'], row['y'])
            x_offset, y_offset = affine_transform.rowcol(x_chip, y_chip)
            
            if 0 <= x_offset < width and 0 <= y_offset < height:
                prediction_value =  src.read(1)[y_offset, x_offset]
                return (round(float(prediction_value), 3), file)  # **Round to 3 decimals**
    return (np.nan, np.nan)

In [31]:
submission_df = pd.read_csv("test.csv")

print(submission_df)

submission_df[['prediction', 'filename']] = submission_df.apply(get_prediction_value, axis=1, result_type='expand')
submission_df[["id","prediction"]].to_csv("hls_submission_cwd_15.csv",index=False)

           id          x          y      date mgrs_tile_id
0        ID_0  43.331667   7.276944  20210501        38NLP
1        ID_1  44.902500  15.438056  20210601        38PMC
2        ID_2  44.429167  16.330833  20210601        38QMD
3        ID_3  65.844167  29.278611  20210201        41RQN
4        ID_4  38.735556  17.148056  20210101        37QDU
...       ...        ...        ...       ...          ...
3213  ID_3213  45.377500   7.028333  20211001        38NNN
3214  ID_3214  43.135833  15.619444  20210201        38PLC
3215  ID_3215  57.328611  25.895278  20210101        40REP
3216  ID_3216  62.626667  25.923611  20210101        41RMJ
3217  ID_3217  47.930278  14.035556  20210301        38PRA

[3218 rows x 5 columns]
