# Create Target Files

## File System Structure


## Input

DHS data is used as the basis for creating the per-cluster target values for each country; each target is aggregated from the survey records that belong to a DHS cluster.

<pre style="font-family: monospace;">
./GIS-Image-Stack-Processing
    /DHS
        /Country-specific folders containing DHS files
</pre>

## Output
<pre style="font-family: monospace;">
./GIS-Image-Stack-Processing
    /AOI/
        PK/
            Targets/
                <span style="color: blue;">targets.json</span> 
        TD/
            Targets/
                <span style="color: blue;">targets.json</span> 

</pre>

## Required Configurations

<pre style="font-family: monospace;">
<span style="color: blue;">country_code  = 'PK'</span>      # Set the country code
</pre>

In [1]:
#-------------------------------------------------
# REQUIRED CONFIGURATIONS HERE
#-------------------------------------------------
# The country code selects the entry read from `aoi_configurations` and the
# AOI/<country_code>/Targets folder that targets.json is written to.
country_code  = 'TD'      # Set the country code
#-------------------------------------------------

In [5]:
import os
# import time
import sys
# import re
# import copy
# import numpy as np
# import math
# import random
# import glob as glb
import json
from enum import Enum

# from collections import defaultdict

import pandas as pd
import geopandas as gpd

from dataclasses import dataclass

In [6]:
sys.path.append('./GIS-Image-Stack-Processing')  # Adjust path if `gist_utils` is moved
# Import module that contains several convenience functions (e.g., gdal wrappers)
# NOTE(review): the wildcard import hides where names come from. `extract_cluster_data`,
# called further down, is not defined in this notebook and is presumably supplied here --
# consider importing it by name once its location in `gist_utils` is confirmed.
from gist_utils import *

# Per-country settings, indexed by country code; this notebook reads the
# 'shapefile', 'recode_hr' and 'recode_kr' entries.
from gist_utils.aoi_configurations import aoi_configurations

## Dataset and Results Configuration

In [7]:
# Enumerated list of DHS target values
class TargetType(Enum):
    """DHS-derived target variables produced by this notebook.

    Each member's value is the string used both as the DataFrame column name
    and as the key inside every cluster entry of targets.json.

    This is deliberately a plain Enum. Decorating an Enum with @dataclass is
    not supported: the generated field-less __eq__/__hash__ make every member
    compare equal to (and hash the same as) every other member.
    """
    FRACTION_DPT3_VACCINATED  = "fraction_dpt3_vaccinated"
    FRACTION_WITH_ELECTRICITY = "fraction_with_electricity"
    FRACTION_WITH_FRESH_WATER = "fraction_with_fresh_water"
    MEAN_WEALTH_INDEX         = "mean_wealth_index"

In [8]:
# Root folder of the GIS processing tree; all input and output paths hang off it
GIS_ROOT = './GIS-Image-Stack-Processing'
# NOTE(review): PRT_ROOT is not referenced by any cell visible in this notebook --
# confirm whether it is still needed.
PRT_ROOT = './GIS-Image-Stack-Processing/AOI/Partitions'

# Output file for the selected country: <GIS_ROOT>/AOI/<country_code>/Targets/targets.json
target_json_path = os.path.join(GIS_ROOT, f'AOI/{country_code}/Targets/targets.json')

## DHS Data Configuration

In [9]:
# Country-specific DHS inputs (cluster shapefile, household recode, child recode),
# each resolved relative to the GIS root
shapefile_path, recode_hr_path, recode_kr_path = (
    os.path.join(GIS_ROOT, aoi_configurations[country_code][entry])
    for entry in ('shapefile', 'recode_hr', 'recode_kr')
)

# Field names as they appear in the DHS cluster shapefile
dhs_cluster_field = 'DHSCLUST'
dhs_lat_field = 'LATNUM'
dhs_lon_field = 'LONGNUM'

# Column names used for those fields in the rest of the notebook
cluster_id = 'cluster_id'
cluster_lat = 'lat'
cluster_lon = 'lon'

# DHS shapefile field -> notebook column name
cluster_column_mapping = dict(zip(
    (dhs_cluster_field, dhs_lat_field, dhs_lon_field),
    (cluster_id, cluster_lat, cluster_lon),
))

# DHS household recode (HR) variable -> notebook column name
hr_column_mapping = {
    'HV001': cluster_id,
    'HV201': 'water_access',
    'HV206': 'electricity_access',
    'HV208': 'radio_access',
    'HV209': 'television_access',
    'HV270': 'wealth_index',
}

# DHS child recode (KR) variable -> notebook column name
kr_column_mapping = {
    'V001': cluster_id,
    'H7': 'dpt1',
    'H8': 'dpt2',
    'H9': 'dpt3',
}

## Extract DHS Cluster Data

In [10]:
# Extract the per-cluster records from the DHS shapefile. The second return value is the
# collection of cluster IDs treated as erroneous; it is passed to remove_clusters() for
# every per-cluster table built below.
# NOTE(review): `extract_cluster_data` is not defined in this notebook -- presumably it
# comes from the `gist_utils` wildcard import; confirm its return contract there.
cluster_df, erroneous_cluster_ids = extract_cluster_data(shapefile_path, dhs_cluster_field, dhs_lat_field, dhs_lon_field)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_data[cluster_field] =         cluster_data[cluster_field].astype(float).astype(int)


In [11]:
# Show which cluster IDs were flagged as erroneous during extraction
print(erroneous_cluster_ids)

[]


In [12]:
# Keep only the DHS fields named in the mapping, then switch to the notebook's column names
cluster_df = (
    cluster_df[list(cluster_column_mapping)]
    .rename(columns=cluster_column_mapping)
)

# Quick look at the result: first rows, then the number of clusters
print(cluster_df.head())
print(len(cluster_df))

   cluster_id        lat        lon
0           1   9.453506  18.944837
1           2  13.466001  22.196232
2           3  15.749017  18.285138
3           4  12.135490  15.206105
4           5   9.264128  16.400491
624


In [13]:
# Compare the number of cluster rows with the ID found in the final row;
# a last ID larger than the row count means the cluster numbering has gaps.
print(cluster_df.shape[0])

last_cluster_id = cluster_df[cluster_id].iloc[-1]
print(last_cluster_id)

624
626


In [14]:
# Store cluster IDs as strings. The same cast is applied again immediately before the
# merge further down, where every per-cluster table is joined on string IDs.
cluster_df[cluster_id] = cluster_df[cluster_id].astype(str)  # Convert to string
# Prints the whole frame (pandas abbreviates the middle rows in the rendered output)
print(cluster_df)

    cluster_id        lat        lon
0            1   9.453506  18.944837
1            2  13.466001  22.196232
2            3  15.749017  18.285138
3            4  12.135490  15.206105
4            5   9.264128  16.400491
..         ...        ...        ...
619        622  17.939057  19.013390
620        623  12.457152  21.067211
621        624  17.917014  19.102875
622        625  21.822564  17.449141
623        626  16.070734  22.876269

[624 rows x 3 columns]


## Load DHS Household Recode Data

In [15]:
def load_hr_data():
    """Read the DHS household recode SPSS file at `recode_hr_path` into a DataFrame."""
    return pd.read_spss(recode_hr_path)

In [16]:
# Read the household recode, keep only the mapped variables, and rename them
hr_df = load_hr_data()[list(hr_column_mapping)].rename(columns=hr_column_mapping)

# Cluster IDs are cast to float first and then to int
# (presumably because the SPSS file stores them as float-like values -- TODO confirm)
hr_df[cluster_id] = hr_df[cluster_id].astype(float).astype(int)

print(hr_df.head())

   cluster_id        water_access electricity_access radio_access  \
0           1    Unprotected well                 No           No   
1           1  Piped to yard/plot                 No          Yes   
2           1  Piped to yard/plot                 No          Yes   
3           1    Unprotected well                 No           No   
4           1  Piped to yard/plot                 No           No   

  television_access wealth_index  
0                No      Richest  
1                No      Richest  
2                No      Richest  
3                No      Richest  
4                No      Richest  


In [17]:
# Counts the distinct row-index labels of the household frame (not the distinct cluster IDs)
print(hr_df.index.nunique())

17233


In [18]:
def remove_clusters(df, cluster_ids_to_remove, cluster_id_column='cluster_id'):
    """Return the rows of `df` whose cluster ID is not listed for removal.

    Args:
        df: DataFrame holding the cluster ID either as a column or as an index level.
        cluster_ids_to_remove: Cluster IDs whose rows are dropped.
        cluster_id_column: Name of the column (or index level) holding the cluster ID.

    Returns:
        The filtered DataFrame. When the cluster ID was an index level, the index is
        reset first so the ID is returned as a regular column.
    """
    # Move the cluster ID out of the index when that is where it currently lives
    frame = df.reset_index() if cluster_id_column in df.index.names else df

    # Keep every row whose ID is absent from the removal list
    keep_mask = ~frame[cluster_id_column].isin(cluster_ids_to_remove)
    return frame[keep_mask]

## DHS: Fresh Water Access

In [19]:
# Water-source labels that count as "fresh water" for this target
fresh_water_categories = [
    'Protected spring',
    'Public tap/standpipe',
    'Tube well or borehole',
    'Piped to yard/plot',
    'Bottled water',
    'Filtration plant',
    'Rainwater'
]

# Per cluster: share of households whose water source is one of the labels above.
# The value column is named after the target directly.
fraction_with_fresh_water_df = (
    hr_df.groupby(cluster_id)['water_access']
    .apply(lambda sources: sources.isin(fresh_water_categories).mean())
    .reset_index(name=TargetType.FRACTION_WITH_FRESH_WATER.value)
)

# Drop the clusters flagged as erroneous during extraction
fraction_with_fresh_water_df = remove_clusters(
    fraction_with_fresh_water_df,
    erroneous_cluster_ids,
    cluster_id_column=cluster_id,
)

# Show the table, its row count, and the number of distinct index labels
print(fraction_with_fresh_water_df)
print(len(fraction_with_fresh_water_df))
print(fraction_with_fresh_water_df.index.nunique())

     cluster_id  fraction_with_fresh_water
0             1                   0.400000
1             2                   1.000000
2             3                   0.966667
3             4                   0.900000
4             5                   0.965517
..          ...                        ...
619         622                   0.280000
620         623                   0.964286
621         624                   0.571429
622         625                   0.583333
623         626                   0.217391

[624 rows x 2 columns]
624
624


## DHS: Electricity Access

In [20]:
# Per cluster: share of households that answered 'Yes' to having electricity.
# The value column is named after the target directly.
fraction_with_electricity_df = (
    hr_df.groupby(cluster_id)['electricity_access']
    .apply(lambda answers: (answers == 'Yes').mean())
    .reset_index(name=TargetType.FRACTION_WITH_ELECTRICITY.value)
)

# Drop the clusters flagged as erroneous during extraction
fraction_with_electricity_df = remove_clusters(
    fraction_with_electricity_df,
    erroneous_cluster_ids,
    cluster_id_column=cluster_id,
)

# Show the table, its row count, and the number of distinct index labels
print(fraction_with_electricity_df)
print(len(fraction_with_electricity_df))
print(fraction_with_electricity_df.index.nunique())

     cluster_id  fraction_with_electricity
0             1                   0.040000
1             2                   0.095238
2             3                   0.000000
3             4                   0.000000
4             5                   0.034483
..          ...                        ...
619         622                   0.720000
620         623                   0.000000
621         624                   0.761905
622         625                   0.416667
623         626                   0.130435

[624 rows x 2 columns]
624
624


In [21]:
# Sanity check before the wealth index is converted: wealth_index still holds its text labels here
print(hr_df.head())  # Check if the columns were renamed correctly

   cluster_id        water_access electricity_access radio_access  \
0           1    Unprotected well                 No           No   
1           1  Piped to yard/plot                 No          Yes   
2           1  Piped to yard/plot                 No          Yes   
3           1    Unprotected well                 No           No   
4           1  Piped to yard/plot                 No           No   

  television_access wealth_index  
0                No      Richest  
1                No      Richest  
2                No      Richest  
3                No      Richest  
4                No      Richest  


## DHS: Wealth Index

In [22]:
# Define the mapping for wealth index categories to floating-point values
# (five evenly spaced steps from 0.0 for 'Poorest' to 1.0 for 'Richest')
wealth_index_mapping = {
    'Poorest': 0.0,
    'Poorer': 0.25,
    'Middle': 0.5,
    'Richer': 0.75,
    'Richest': 1.0
}

# Replace original wealth index categories with corresponding floating-point values.
# The conversion only runs while the column still holds the original labels: without
# this guard, re-running the cell would look the already-numeric values up in the label
# dictionary, find no match, and silently turn the whole column into NaN.
# Labels that are missing from the mapping become NaN.
if not pd.api.types.is_numeric_dtype(hr_df['wealth_index']):
    hr_df['wealth_index'] = hr_df['wealth_index'].map(wealth_index_mapping).astype(float)
print(hr_df.head())

   cluster_id        water_access electricity_access radio_access  \
0           1    Unprotected well                 No           No   
1           1  Piped to yard/plot                 No          Yes   
2           1  Piped to yard/plot                 No          Yes   
3           1    Unprotected well                 No           No   
4           1  Piped to yard/plot                 No           No   

  television_access  wealth_index  
0                No           1.0  
1                No           1.0  
2                No           1.0  
3                No           1.0  
4                No           1.0  


In [23]:
# Per cluster: average of the numeric wealth index over its households.
# The value column is named after the target directly.
mean_wealth_by_cluster_df = (
    hr_df.groupby(cluster_id)['wealth_index']
    .mean()
    .reset_index(name=TargetType.MEAN_WEALTH_INDEX.value)
)

# Drop the clusters flagged as erroneous during extraction
mean_wealth_by_cluster_df = remove_clusters(
    mean_wealth_by_cluster_df,
    erroneous_cluster_ids,
    cluster_id_column=cluster_id,
)

# Show the table, its row count, and the number of distinct index labels
print(mean_wealth_by_cluster_df)
print(len(mean_wealth_by_cluster_df))
print(mean_wealth_by_cluster_df.index.nunique())

     cluster_id  mean_wealth_index
0             1           0.920000
1             2           0.547619
2             3           0.425000
3             4           0.558333
4             5           0.258621
..          ...                ...
619         622           0.980000
620         623           0.160714
621         624           0.619048
622         625           0.760417
623         626           0.163043

[624 rows x 2 columns]
624
624


## Load DHS Child Recode Data

In [24]:
def load_kr_data():
    """Read the DHS child recode SPSS file at `recode_kr_path` into a DataFrame."""
    return pd.read_spss(recode_kr_path)

In [25]:
# Read the child recode, keep only the mapped variables, and rename them
kr_df = load_kr_data()[list(kr_column_mapping)].rename(columns=kr_column_mapping)

# Cluster IDs are cast to float first and then to int
# (presumably because the SPSS file stores them as float-like values -- TODO confirm)
kr_df[cluster_id] = kr_df[cluster_id].astype(float).astype(int)

print(kr_df.head())

   cluster_id                      dpt1                      dpt2  \
0           1  Vaccination date on card  Vaccination date on card   
1           1        Reported by mother        Reported by mother   
2           1        Reported by mother        Reported by mother   
3           1        Reported by mother        Reported by mother   
4           1        Reported by mother                       NaN   

                       dpt3  
0  Vaccination date on card  
1                        No  
2        Reported by mother  
3        Reported by mother  
4        Reported by mother  


## DHS: DPT3 Vaccination

In [26]:
# A child is kept as "fully vaccinated" unless at least one of the three doses is recorded as 'No'.
# NOTE(review): a missing (NaN) response never equals 'No', so children with missing dose
# data are counted as vaccinated as well -- confirm that this is the intended definition.
fully_vaccinated = kr_df[
    ~((kr_df['dpt1'] == 'No') | (kr_df['dpt2'] == 'No') | (kr_df['dpt3'] == 'No'))
]

# Per cluster: children kept above divided by all children in the child recode
fraction_dpt3_vaccinated = (
    fully_vaccinated.groupby(cluster_id).size()
    / kr_df.groupby(cluster_id).size()
)

# Turn the series into a two-column table named after the target. Clusters with no
# child kept above come out of the division as NaN and are set to 0.
fraction_dpt3_vaccinated_df = (
    fraction_dpt3_vaccinated
    .reset_index(name=TargetType.FRACTION_DPT3_VACCINATED.value)
    .fillna(0)
)

# Drop the clusters flagged as erroneous during extraction
fraction_dpt3_vaccinated_df = remove_clusters(
    fraction_dpt3_vaccinated_df,
    erroneous_cluster_ids,
    cluster_id_column=cluster_id,
)

# Show the table, its row count, and the number of distinct index labels
print(fraction_dpt3_vaccinated_df)
print(len(fraction_dpt3_vaccinated_df))
print(fraction_dpt3_vaccinated_df.index.nunique())

     cluster_id  fraction_dpt3_vaccinated
0             1                  0.428571
1             2                  0.269231
2             3                  0.062500
3             4                  0.354839
4             5                  0.333333
..          ...                       ...
619         622                  0.086957
620         623                  0.100000
621         624                  0.470588
622         625                  0.055556
623         626                  0.000000

[624 rows x 2 columns]
624
624


## Add DHS Target Values to Data Frame

In [27]:
# Key every table on string cluster IDs so the joins below compare values of the same type
cluster_df[cluster_id] = cluster_df[cluster_id].astype(str)
fraction_dpt3_vaccinated_df[cluster_id] = fraction_dpt3_vaccinated_df[cluster_id].astype(str)
fraction_with_electricity_df[cluster_id] = fraction_with_electricity_df[cluster_id].astype(str)
fraction_with_fresh_water_df[cluster_id] = fraction_with_fresh_water_df[cluster_id].astype(str)
mean_wealth_by_cluster_df[cluster_id] = mean_wealth_by_cluster_df[cluster_id].astype(str)

# Attach the four target columns to the cluster locations, one join at a time.
# These are inner joins (the pandas default), so a cluster missing from any one
# table is dropped from the result.
dhs_df = (
    cluster_df
    .merge(fraction_dpt3_vaccinated_df, on=cluster_id)
    .merge(fraction_with_electricity_df, on=cluster_id)
    .merge(fraction_with_fresh_water_df, on=cluster_id)
    .merge(mean_wealth_by_cluster_df, on=cluster_id)
)

In [28]:
# Inspect the merged table: cluster ID, coordinates, and the four target columns
print(dhs_df)

    cluster_id        lat        lon  fraction_dpt3_vaccinated  \
0            1   9.453506  18.944837                  0.428571   
1            2  13.466001  22.196232                  0.269231   
2            3  15.749017  18.285138                  0.062500   
3            4  12.135490  15.206105                  0.354839   
4            5   9.264128  16.400491                  0.333333   
..         ...        ...        ...                       ...   
619        622  17.939057  19.013390                  0.086957   
620        623  12.457152  21.067211                  0.100000   
621        624  17.917014  19.102875                  0.470588   
622        625  21.822564  17.449141                  0.055556   
623        626  16.070734  22.876269                  0.000000   

     fraction_with_electricity  fraction_with_fresh_water  mean_wealth_index  
0                     0.040000                   0.400000           0.920000  
1                     0.095238                   

## Create `targets.json` to Store Cluster Data and Target Values

In [29]:
def generate_target_values_json(target_values_dict, output_file='targets.json', ndigits=3):
    """Write per-cluster target values to a JSON file.

    Args:
        target_values_dict: Mapping of cluster ID -> {target name: numeric value}.
            Each cluster ID is normalised with int(float(...)), so 12, 12.0,
            '12' and '12.0' all end up as the same key.
        output_file: Destination path. A missing parent directory is created.
        ndigits: Number of decimal places each target value is rounded to.

    Notes:
        The json module writes the integer cluster IDs as string keys, and a
        NaN value is emitted as the bare token NaN, which strict JSON parsers
        reject.
    """
    # Ensure the directory exists. exist_ok avoids a failure when the directory
    # appears between the existence check and the creation call.
    output_dir = os.path.dirname(output_file)
    if output_dir != '' and not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)
        print(f"Created directory {output_dir}")

    # Convert cluster IDs to integers and round target values to `ndigits` decimal places.
    # A separate name is used for the key so the notebook-level `cluster_id` is not shadowed.
    rounded_targets = {
        int(float(raw_id)): {name: round(value, ndigits) for name, value in targets.items()}
        for raw_id, targets in target_values_dict.items()
    }

    # Save the JSON file
    with open(output_file, 'w') as f:
        json.dump(rounded_targets, f, indent=4)

    print(f"Target values JSON saved to {output_file}")

In [30]:
# Column names of the four targets, in the order they are written for each cluster
target_types = [
    target.value
    for target in (
        TargetType.FRACTION_DPT3_VACCINATED,
        TargetType.FRACTION_WITH_ELECTRICITY,
        TargetType.FRACTION_WITH_FRESH_WATER,
        TargetType.MEAN_WEALTH_INDEX,
    )
]

# One entry per cluster: {cluster_id: {target name: value}}
target_values_dict = (
    dhs_df
    .set_index(cluster_id)[target_types]
    .to_dict(orient='index')
)

# Write the targets for the selected country
generate_target_values_json(target_values_dict, target_json_path)

Target values JSON saved to ./GIS-Image-Stack-Processing/AOI/TD/Targets/targets.json
