In [36]:
# set autoreload
%load_ext autoreload
%autoreload 1
%aimport cgf_utils
%aimport mf
%aimport paths
%aimport income_funcs

import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import rasterio as rio
import rasterra as rt
from pathlib import Path
import geopandas as gpd
from pymer4 import Lmer
import glob
import logging
import pickle
import sys
import logging
from scipy.special import expit

from location_mapping import load_fhs_lsae_mapping
#from income_funcs import load_binned_income_distribution_proportions


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [37]:
# Path to the global GHS-POP 2020 gridded population GeoTIFF (file name indicates EPSG:4326, 30 arc-seconds).
GLOBAL_POPULATION_FILEPATH = (
    '/mnt/team/rapidresponse/pub/population/data/01-raw-data'
    '/other-gridded-pop-projects/global-human-settlement-layer/2020'
    '/GHS_POP_E2020_GLOBE_R2023A_4326_30ss_V1_0.tif'
)

In [38]:
# Run configuration: outcome, location, scenario, year and demographic group to predict for.
measure = 'stunting'
fhs_location_id = 97  # previously tried: 98 (original author's note: "Chile")
scenario = 'ssp119'
year_id = 2075
sex_id = 2
age_group_id = 4
model_identifier = 're_grid_o30'

# Parameterized alternative for the model location (currently not used):
#   paths.MODEL_ROOTS / model_identifier / f'model_{measure}_{age_group_id}_{sex_id}.pkl'
model_filepath = '/mnt/team/rapidresponse/pub/population/modeling/climate_malnutrition/output/models/model.pkl'

# NOTE: unpickling can execute arbitrary code; only load model files from a trusted location.
with open(model_filepath, 'rb') as model_file:
    model = pickle.load(model_file)

In [39]:
# Mapping rows between the FHS location and its LSAE admin units.
loc_mapping = load_fhs_lsae_mapping(fhs_location_id)

# FHS-level fields are read from the first row only
# (presumably identical on every row — TODO confirm).
first_mapping_row = loc_mapping.iloc[0]
fhs_shapefile = first_mapping_row.fhs_shape
location_iso3 = first_mapping_row.worldpop_iso3

# Id-only view of the mapping.
simple_loc_mapping = loc_mapping[['fhs_location_id', 'lsae_location_id']]

# Binned income distribution proportions for this location, measure and year.
income_df = income_funcs.load_binned_income_distribution_proportions(
    fhs_location_id=fhs_location_id,
    measure=measure,
    year_id=year_id,
)

In [40]:
# Load the global population raster (passing the FHS shape's bounds to the loader),
# use NaN as the no-data value, then clip to the FHS shape.
fhs_pop_raster = (
    rt.load_raster(GLOBAL_POPULATION_FILEPATH, fhs_shapefile.bounds)
    .set_no_data_value(np.nan)
    .clip(fhs_shapefile)
)

In [41]:
# Climate variables this notebook knows how to fetch.
possible_climate_variables = ['temp', 'precip', 'over_30']

# Climate variables that the model bins.
climate_vars_to_match_bins = [
    name for name in model.vars_to_bin if name in possible_climate_variables
]

# Climate variables that are in the model but are not binned.
continuous_climate_vars = [
    name
    for name in ['temp', 'over_30']
    if (name in possible_climate_variables)
    and (name not in climate_vars_to_match_bins)
    and (name in model.model_vars)
]

# Every climate variable the model references, in the order of possible_climate_variables.
climate_vars = [
    name
    for name in possible_climate_variables
    if name in set(model.model_vars + model.vars_to_bin)
]


In [42]:
# Rename the key column of the location coefficient table (in place, on the loaded model)
# so that it can be merged on 'worldpop_iso3'.
location_coefs = model.var_info['ihme_loc_id']['coefs']
location_coefs.rename(columns={'ihme_loc_id': 'worldpop_iso3'}, inplace=True)

In [43]:
# One raster per climate variable, resampled onto the population grid and clipped to the FHS shape.
climate_rasters = {}
for var in climate_vars:
    raw_climate_raster = mf.get_climate_variable_raster(scenario, year_id, var, None, None, untreated=True)
    climate_rasters[var] = raw_climate_raster.resample_to(fhs_pop_raster).clip(fhs_shapefile)

In [44]:
# Build one pixel-level prediction table per LSAE admin unit of the FHS location,
# then stack them into `fhs_df`.
#
# For every admin unit: gather population and climate pixels, attach the model's
# bins / coefficients by merging, sum the coefficients into the logistic input and
# apply the inverse logit to get the predicted prevalence per row.
admin_dfs = []
for _, admin2_row in loc_mapping.iterrows():
    lsae_location_id = admin2_row.lsae_location_id
    lsae_shapefile = admin2_row.lsae_shape

    # Population pixels of this admin unit, with NaN as the no-data value.
    pop_raster = fhs_pop_raster.clip(lsae_shapefile).mask(lsae_shapefile)
    pop_array = pop_raster.set_no_data_value(np.nan).to_numpy()

    rasters = {'population': pop_array.flatten()}

    # Binned climate variables: store the index of the model bin each pixel falls into.
    for var in climate_vars_to_match_bins:
        climate_raster = climate_rasters[var].clip(lsae_shapefile).mask(lsae_shapefile)
        climate_array = climate_raster.to_numpy()
        assert pop_array.shape == climate_array.shape
        binned_climate_array = np.digitize(climate_array, model.var_info[var]['bin_edges'], right=False) - 1
        rasters[var+'_bin_idx'] = binned_climate_array.flatten()

    # Continuous climate variables: keep the raw pixel values.
    for var in continuous_climate_vars:
        climate_raster = climate_rasters[var].clip(lsae_shapefile).mask(lsae_shapefile)
        climate_array = climate_raster.to_numpy()
        assert pop_array.shape == climate_array.shape
        rasters[var] = climate_array.flatten()

    # Rows stay at pixel level (an alternative would be to aggregate pixels per climate bin
    # within the admin unit). Pixels whose population is NaN are dropped here so they are not
    # carried through the merges below; previously the dropna result was assigned to an
    # unused variable and therefore had no effect.
    temp_df = (
        pd.DataFrame(rasters)
        .dropna(subset=['population'])
        .assign(
            lsae_pop=np.nansum(pop_array),
            lsae_location_id=lsae_location_id,
            worldpop_iso3=admin2_row.worldpop_iso3,
        )
    )

    local_income_df = income_df.query('lsae_location_id == @lsae_location_id')
    for var in climate_vars_to_match_bins:
        # Inner merge: pixels whose bin index has no entry in the model's bin table are dropped.
        temp_df = temp_df.merge(model.var_info[var]['bins'], left_on=var+'_bin_idx', right_index=True, how='inner')
    # Attaches the income rows of this admin unit to every pixel row.
    temp_df = temp_df.merge(local_income_df, on='lsae_location_id', how='left')

    # From here on the merges bring in coefficients, so the result is specific to the
    # model's age group and sex. These steps could be precomputed per model or parallelized.
    if model.has_grid:
        temp_df = temp_df.merge(model.grid_spec['grid_definition'], how='left')
        temp_df = temp_df.merge(model.var_info['grid_cell']['coefs'], how='left')

    for var in continuous_climate_vars:
        temp_df = temp_df.merge(model.var_info[var]['coefs'], how='left')

    # Locations without a location coefficient contribute 0.
    temp_df = temp_df.merge(model.var_info['ihme_loc_id']['coefs'], how='left', on='worldpop_iso3')
    temp_df['ihme_loc_id_coef'] = temp_df['ihme_loc_id_coef'].fillna(0)

    # Build the logistic input one term at a time.
    temp_df['logistic_input'] = model.var_info['intercept']['coef']
    temp_df['logistic_input'] += temp_df['ihme_loc_id_coef']
    for var in climate_vars_to_match_bins:
        # Variables that are part of the grid are covered by the grid-cell coefficient below.
        # NOTE(review): grid_spec is read here without checking model.has_grid — confirm it is
        # always populated.
        if var in model.grid_spec['grid_order']:
            continue
        temp_df['logistic_input'] += temp_df[var+'_bin_coef']
    for var in continuous_climate_vars:
        temp_df['logistic_input'] += temp_df[var+'_coef']
    if model.has_grid:
        temp_df['logistic_input'] += temp_df['grid_cell_coef']
    temp_df['prediction'] = expit(temp_df['logistic_input'])

    admin_dfs.append(temp_df)

fhs_df = pd.concat(admin_dfs)
# Timing note from the original author: 24s to 92s

In [50]:
# Population of each row that falls into the row's income bin.
fhs_df['population_at_income'] = fhs_df['population'] * fhs_df['proportion_at_income']
# Share of the admin unit's population represented by the row, with every admin unit
# given the same weight (1 / number of rows in loc_mapping).
# NOTE(review): equal weighting ignores differences in admin-unit population — confirm this is intended.
fhs_df['population_proportion_at_income'] = fhs_df['population_at_income'] / fhs_df['lsae_pop'] / len(loc_mapping)
fhs_df['affected_proportion'] = fhs_df['population_proportion_at_income'] * fhs_df['prediction']

# Single-row summary: predicted prevalence for the configured location / year / group.
result = pd.DataFrame({
    'fhs_location_id': [fhs_location_id],
    'year_id': [year_id],
    'age_group_id': [age_group_id],
    'scenario': [scenario],
    'sex_id': [sex_id],
    'prevalence': [fhs_df.affected_proportion.sum()],
})

# `return` is a SyntaxError outside a function; ending the cell with the expression displays it.
result


Unnamed: 0,fhs_location_id,year_id,age_group_id,scenario,sex_id,prevalence
0,97,2075,4,ssp119,2,0.110334


In [None]:
# Inspect rows whose population share exceeds 0.15.
high_share_mask = fhs_df['population_proportion_at_income'] > 0.15
fhs_df.loc[high_share_mask]

Unnamed: 0,population,over_30_bin_idx,lsae_pop,lsae_location_id,worldpop_iso3,over_30_bin,fhs_location_id,year_id,ldi_pc_pd_bin,proportion_at_income,grid_cell,grid_cell_coef,ihme_loc_id_coef,logistic_input,prediction,population_at_income,population_proportion_at_income
4919,133581.807251,0,506632.504043,91990,IRN,"[0, 1)",142.0,2075,"(15.9, 179.6]",0.592751,"(15.9, 179.6]_[0, 1)",-0.517082,0.0,-2.17691,0.101843,79180.711706,0.156288
57719,16162.063662,14,57349.208756,91747,IRN,"[120, 250)",142.0,2075,"(15.9, 179.6]",0.610944,"(15.9, 179.6]_[120, 250)",-0.435418,0.0,-2.095246,0.10956,9874.117363,0.172175
10789,19283.831181,1,66307.100636,91854,IRN,"[1, 5)",142.0,2075,"(15.9, 179.6]",0.530045,"(15.9, 179.6]_[1, 5)",-0.668446,0.0,-2.328274,0.088808,10221.306542,0.154151
6729,16900.524475,0,50785.225142,91864,IRN,"[0, 1)",142.0,2075,"(15.9, 179.6]",0.541286,"(15.9, 179.6]_[0, 1)",-0.517082,0.0,-2.17691,0.101843,9148.01851,0.180131
12539,8125.774643,14,32590.389466,91827,IRN,"[120, 250)",142.0,2075,"(15.9, 179.6]",0.608687,"(15.9, 179.6]_[120, 250)",-0.435418,0.0,-2.095246,0.10956,4946.053777,0.151764
12549,10686.940258,14,32590.389466,91827,IRN,"[120, 250)",142.0,2075,"(15.9, 179.6]",0.608687,"(15.9, 179.6]_[120, 250)",-0.435418,0.0,-2.095246,0.10956,6505.002113,0.199599
12969,32764.290103,12,111803.991574,82024,IRN,"[100, 110)",142.0,2075,"(15.9, 179.6]",0.520428,"(15.9, 179.6]_[100, 110)",-0.531375,0.0,-2.191203,0.100543,17051.440202,0.152512


In [143]:
# Distinct values in pop_array and how often each occurs (NaN appears as its own entry).
unique_population_values, population_value_counts = np.unique(pop_array, return_counts=True)
(unique_population_values, population_value_counts)

(array([0.00000000e+00, 1.48849115e-03, 1.90012440e-03, ...,
        1.86089746e+04, 2.19263581e+04,            nan]),
 array([2065,    1,    1, ...,    1,    1, 6210]))