# Partition Map Creation

This notebook uses the DHS cluster data to partition the clusters into train and validation segments.


## File System Structure

## Input

DHS data is used as the basis for creating partition maps for each country based on the location of clusters. 

<pre style="font-family: monospace;">
./GIS-Image-Stack-Processing
    /DHS
        /Country-specific folders containing DHS files
</pre>

## Output
<pre style="font-family: monospace;">
./GIS-Image-Stack-Processing
    /AOI/
        Partitions/
            PK/
                <span style="color: blue;">PK_all.json</span> 
                <span style="color: blue;">PK_train.json</span> 
                <span style="color: blue;">PK_valid.json</span> 
            TD/
                <span style="color: blue;">TD_all.json</span> 
                <span style="color: blue;">TD_train.json</span> 
                <span style="color: blue;">TD_valid.json</span> 
</pre>


## Required Configurations

The following configuration is required for each execution of this notebook: the two-letter country code.

<pre style="font-family: monospace;">
<span style="color: blue;">country_code  = 'PK'</span>      # Set the country code
</pre>

In [59]:
#-------------------------------------------------
# REQUIRED CONFIGURATIONS HERE
#-------------------------------------------------
# Two-letter country code. It is used as the key into `aoi_configurations`, as the key
# inside each partition map, and as the folder/file prefix of the generated JSON files.
country_code  = 'AM'      # Set the country code

In [60]:
import os
import sys
import json

In [61]:
# Make the project package importable (presumably `project_utils` lives under this folder — confirm)
sys.path.append('./GIS-Image-Stack-Processing')  # Adjust path if `project_utils` is moved
# Import module that contains several convenience functions (e.g., gdal wrappers)
# NOTE(review): `extract_cluster_data`, used further down, is presumably supplied by this wildcard import
from project_utils import *

from project_utils.aoi_configurations import aoi_configurations

In [62]:
GIS_ROOT = './GIS-Image-Stack-Processing'
PRT_ROOT = './GIS-Image-Stack-Processing/AOI/Partitions'

# Make sure both working directories are present, reporting each one that had to be created
for required_dir in (GIS_ROOT, PRT_ROOT):
    if os.path.exists(required_dir):
        continue
    os.makedirs(required_dir)
    print(f"Directory '{required_dir}' created.")

# Targets file for the selected country; it is the input to the partition-map generator
json_file = f'./GIS-Image-Stack-Processing/AOI/{country_code}/Targets/targets.json'

In [63]:
# Output files are <PRT_ROOT>/<country_code>/<country_code>_{train,valid,all}.json
train_partition, valid_partition, all_partition = (
    os.path.join(PRT_ROOT, f'{country_code}', f'{country_code}_{split}.json')
    for split in ('train', 'valid', 'all')
)

## DHS Data Configuration

In [64]:
# Shapefile location is stored in the AOI configuration, relative to GIS_ROOT
shapefile_path = os.path.join(GIS_ROOT, aoi_configurations[country_code]['shapefile'])

# DHS column headings (cluster number, latitude, longitude)
dhs_cluster_field, dhs_lat_field, dhs_lon_field = 'DHSCLUST', 'LATNUM', 'LONGNUM'

# More meaningful names that the DHS headings are renamed to
cluster_id, cluster_lat, cluster_lon = 'cluster_id', 'lat', 'lon'

# DHS heading -> new name, in the order cluster / lat / lon
cluster_column_mapping = dict(zip(
    (dhs_cluster_field, dhs_lat_field, dhs_lon_field),
    (cluster_id, cluster_lat, cluster_lon),
))

## Extract DHS Cluster Data

In [65]:
# Read the cluster table from the DHS shapefile; the helper returns the frame plus a second
# value that this notebook treats as a list of problematic cluster IDs.
# NOTE(review): the pandas SettingWithCopyWarning shown in this cell's output is raised inside
# `extract_cluster_data` (the `cluster_data[cluster_field] = ...` line), so the fix belongs in
# the helper module, not in this notebook.
cluster_df, erroneous_cluster_ids = extract_cluster_data(shapefile_path, dhs_cluster_field, dhs_lat_field, dhs_lon_field)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_data[cluster_field] =         cluster_data[cluster_field].astype(float).astype(int)


In [66]:
# Show the second value returned by extract_cluster_data; an empty list was printed for this run
print(erroneous_cluster_ids)

[]


In [67]:
# Use the mapping to select and rename columns
cluster_df = cluster_df[list(cluster_column_mapping.keys())].rename(columns=cluster_column_mapping)

print(cluster_df.head())
print(cluster_df.shape[0])

   cluster_id        lat        lon
0           1  40.208171  44.471346
1           2  40.214641  44.474167
2           3  40.205280  44.452329
3           4  40.223645  44.487648
4           5  40.220146  44.480563
313


# Create Partition Maps

The function below writes three partition map files (train, valid, and all), each listing the cluster IDs that belong to 
that partition. A longitude threshold is currently used to split the data: clusters whose longitude is below the threshold go to train, and all others go to validation.

In [68]:
def generate_partition_maps_from_json(json_file, country_code, longitude_threshold, output_train='train.json', output_valid='valid.json', output_all='all.json'):
    """
    Split the clusters of a targets JSON file into train / valid / all partition maps.

    Each partition map is written as a JSON object with a single key (the country
    code) whose value is a list of integer cluster IDs.

    Parameters:
        json_file (str): Path to the targets JSON file. Its "clusters" entry maps
            cluster IDs to records that contain a 'lon' value.
        country_code (str): Key under which the cluster IDs are stored in every map.
        longitude_threshold (float): Clusters with lon < threshold go to train,
            all remaining clusters go to valid.
        output_train (str): Destination of the train partition map. Default is 'train.json'.
        output_valid (str): Destination of the valid partition map. Default is 'valid.json'.
        output_all (str): Destination of the map holding every cluster ID. Default is 'all.json'.

    Returns:
        None. The three files are written to disk and a confirmation line is printed for each.
    """
    with open(json_file, 'r') as source:
        payload = json.load(source)

    # Only the "clusters" section is used; other top-level keys (e.g. metadata) are ignored
    cluster_records = payload.get("clusters", {})

    # Every cluster ID as an integer, independent of where the cluster lies
    every_id = [int(key) for key in cluster_records]

    # Strict comparison: a cluster exactly on the threshold ends up in the valid list
    west_ids, east_ids = [], []
    for key, record in cluster_records.items():
        bucket = west_ids if record['lon'] < longitude_threshold else east_ids
        bucket.append(int(key))

    # (confirmation prefix, destination path, JSON content), in the order they are written
    map_key = f"{country_code}"
    outputs = (
        ("Train partition map saved to: ", output_train, {map_key: west_ids}),
        ("Valid partition map saved to: ", output_valid, {map_key: east_ids}),
        ("All partition map saved to: ", output_all, {map_key: every_id}),
    )

    # Create any missing parent folders up front, before the first file is written
    for _, destination, _ in outputs:
        parent = os.path.dirname(destination)
        if parent and not os.path.exists(parent):
            os.makedirs(parent, exist_ok=True)

    for message, destination, partition_map in outputs:
        with open(destination, 'w') as sink:
            json.dump(partition_map, sink, indent=4)
        print(f"{message}{destination}")


In [69]:
# Reference longitudes for the selected country, read from the AOI configuration
crs_lon, aoi_lon_east = (
    aoi_configurations[country_code][field] for field in ('crs_lon', 'lon_east')
)

# Per-country offset added to `crs_lon` to position the train/valid dividing longitude
country_longitude_offsets = dict(
    AM=0.2,
    IN=3.5,
    JO=-1.0,
    MA=4.0,
    MB=0.87,
    ML=0.3,
    MR=0.3,
    NI=1.15,
    PK=4.75,
    SN=-0.15,
    TD=2.2,
)

# Countries without an entry fall back to an offset of 0, i.e. the split is at `crs_lon`
longitude_threshold = crs_lon + country_longitude_offsets.get(country_code, 0)

generate_partition_maps_from_json(
    json_file=json_file,
    country_code=country_code,
    longitude_threshold=longitude_threshold,
    output_train=train_partition,
    output_valid=valid_partition,
    output_all=all_partition,
)

Train partition map saved to: ./GIS-Image-Stack-Processing/AOI/Partitions/AM/AM_train.json
Valid partition map saved to: ./GIS-Image-Stack-Processing/AOI/Partitions/AM/AM_valid.json
All partition map saved to: ./GIS-Image-Stack-Processing/AOI/Partitions/AM/AM_all.json
