# Sensitive analysis

In [2]:
import pandas as pd
import numpy as np
import pickle

import util

## Loading data

In [3]:
df_deaths = pd.read_csv('data/output/df_mortality.csv', index_col=0)
df_labeled_cluster = pd.read_csv('data/output/df_labeled_cluster.csv', index_col=0)
df_deaths['cluster_label'] = df_labeled_cluster['cluster_label']

## Variable Sensitivity Analysis

In [4]:
df_difference_statistics = pd.DataFrame()
df_difference_coefficients = pd.DataFrame()

list_periods = ['2020_1','2020', '2021', '2022', '2020_2022']

for model_id in [10]:
    for i in range(5):
        period = list_periods[i]
        print('\n*** Period: ', period)
 
        print('\*** Model',model_id)
        print('===>Full model:')
        with open('models/model_'+str(model_id)+'_'+period+'.pkl', 'rb') as file:
            model_reference = pickle.load(file)
        # summarize_results(model_reference)
        model_reference_r2_cs = model_reference.pseudo_rsquared()
                
        for removed_variable in model_reference.params[1:].index:
            print('\n*** Removed variable: ', removed_variable)        
            with open('models/sensitivity_analysis/variable/model_' + str(model_id) +'_' + period +'_' + removed_variable + '.pkl', 'rb') as file:
                model_variable = pickle.load(file)        
            # summarize_results(model_variable)
            model_variable_r2 = model_variable.pseudo_rsquared()
            
            r2_cs_difference = model_variable_r2 - model_reference_r2_cs
            statistic_row = {'model_id':model_id, 'period': period, 'variable':removed_variable, 'r2_difference':r2_cs_difference}
            new_row = pd.DataFrame([statistic_row])
            df_difference_statistics = pd.concat([df_difference_statistics, new_row], ignore_index=True)
            
            for variable in model_variable.params.index:
                coefficient_difference = model_variable.params[variable] - model_reference.params[variable]
                coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
                coefficient_row = {'model_id': model_id, 'period': period, 'removed_variable': removed_variable, 'variable': variable, 'coefficient_difference': coefficient_difference, 'coefficient_relative_difference': coefficient_relative_difference}
                new_row = pd.DataFrame([coefficient_row])
                df_difference_coefficients = pd.concat([df_difference_coefficients, new_row], ignore_index=True)


*** Period:  2020_1
\*** Model 2
===>Full model:

*** Removed variable:  cluster_label_Urbanized with informal settlements

*** Removed variable:  cluster_label_Semi-urbanized

*** Removed variable:  cluster_label_Rural with high human development

*** Removed variable:  cluster_label_Rural with low human development

*** Removed variable:  percentage_population_age_range_60_more

*** Removed variable:  percentage_urban_population

*** Removed variable:  demographic_density

*** Removed variable:  percentage_male_population

*** Removed variable:  percentage_indigenous_population

*** Removed variable:  per_capita_income

*** Removed variable:  gini

*** Removed variable:  percentage_estimated_households_in_informal_settlements

*** Removed variable:  demographic_density_in_informal_settlements

*** Removed variable:  percentage_hospitalizations_diseases_inadequate_sanitation

*** Removed variable:  percentage_self_employed_workers

*** Removed variable:  unemployment_rate

*** Remove

In [5]:
df_difference_statistics['r2_difference_absolute'] = df_difference_statistics['r2_difference'].abs()
df_difference_statistics.sort_values(['model_id', 'period', 'r2_difference'], ascending=[True, True, True])[['model_id', 'period', 'variable', 'r2_difference']].round(3)

Unnamed: 0,model_id,period,variable,r2_difference
41,2,2020,delta_first_death_period,-0.242
25,2,2020,percentage_population_age_range_60_more,-0.006
26,2,2020,percentage_urban_population,-0.004
38,2,2020,percentage_workers_services,-0.004
29,2,2020,percentage_indigenous_population,-0.004
...,...,...,...,...
123,3,2022,Semi-urbanized,0.000
124,3,2022,Rural with high human development,0.000
125,3,2022,Urbanized with informal settlements,0.000
126,3,2022,Rural with low human development,0.000


In [6]:
df_difference_statistics[df_difference_statistics['model_id'] == 2].groupby(['period'])['r2_difference'].describe().round(3)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020,21.0,-0.013,0.053,-0.242,-0.003,-0.001,-0.0,-0.0
2020_1,21.0,-0.017,0.067,-0.31,-0.002,-0.001,-0.0,-0.0
2020_2022,21.0,-0.006,0.016,-0.074,-0.005,-0.001,-0.0,-0.0
2021,21.0,-0.012,0.024,-0.101,-0.006,-0.001,-0.0,-0.0
2022,21.0,-0.019,0.06,-0.266,-0.002,-0.001,-0.0,-0.0


In [7]:
df_difference_statistics[df_difference_statistics['model_id'] == 3].groupby(['period'])['r2_difference'].describe().round(3)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020,6.0,-0.049,0.12,-0.295,0.0,0.0,0.0,0.0
2020_1,6.0,-0.063,0.146,-0.361,-0.004,-0.004,-0.004,-0.004
2020_2022,6.0,-0.014,0.035,-0.085,0.0,0.0,0.0,0.0
2021,6.0,-0.013,0.031,-0.076,0.0,0.0,0.0,0.0
2022,6.0,-0.042,0.103,-0.251,0.0,0.0,0.0,0.0


In [8]:
df_difference_coefficients['coefficient_relative_difference_absolute'] = df_difference_coefficients['coefficient_relative_difference'].abs()

In [9]:
df_difference_coefficients.groupby(['model_id','period','removed_variable'])[['coefficient_relative_difference_absolute']].mean().round(3).reset_index().sort_values(by=['model_id','period','coefficient_relative_difference_absolute'], ascending=[True, True, False])

Unnamed: 0,model_id,period,removed_variable,coefficient_relative_difference_absolute
4,2,2020,delta_first_death_period,1.225
14,2,2020,percentage_population_age_range_60_more,0.495
2,2,2020,cluster_label_Semi-urbanized,0.388
9,2,2020,per_capita_income,0.362
19,2,2020,percentage_workers_services,0.359
...,...,...,...,...
129,3,2022,Rural with high human development,1.255
133,3,2022,Urbanized with informal settlements,0.571
132,3,2022,Urbanized,0.444
131,3,2022,Semi-urbanized,0.371


In [10]:
df_difference_coefficients.groupby(['model_id','period','removed_variable'])[['coefficient_relative_difference_absolute']].median().round(3).reset_index().sort_values(by=['model_id','period','coefficient_relative_difference_absolute'], ascending=[True, True, False])

Unnamed: 0,model_id,period,removed_variable,coefficient_relative_difference_absolute
4,2,2020,delta_first_death_period,0.651
16,2,2020,percentage_urban_population,0.241
19,2,2020,percentage_workers_services,0.241
14,2,2020,percentage_population_age_range_60_more,0.215
18,2,2020,percentage_workers_industry,0.180
...,...,...,...,...
129,3,2022,Rural with high human development,1.280
133,3,2022,Urbanized with informal settlements,0.423
132,3,2022,Urbanized,0.351
131,3,2022,Semi-urbanized,0.309


## Parameter sensitivity analysis

In [20]:
df_difference_statistics = pd.DataFrame()
df_difference_coefficients = pd.DataFrame()

list_periods = ['2020_1','2020', '2021', '2022', '2020_2022']

for model_id in [9,10]:
    for i in range(5):
        period = list_periods[i]
        
        print('\n*** Period: ', period)
        print('\*** Model',model_id)
        print('===>Full model:')
        with open('models/model_'+str(model_id)+'_'+period+'.pkl', 'rb') as file:
            model_reference = pickle.load(file)        
        # summarize_results(model_reference)
        model_reference_r2_cs = model_reference.pseudo_rsquared('cs')
        model_reference_r2_mcf = model_reference.pseudo_rsquared('mcf')
        model_reference_llf = model_reference.llf
        model_reference_aic = model_reference.aic
        model_reference_bic = model_reference.bic_llf
                
        for sample in range(30):
            print('\n*** Sample: ', sample)        
            with open('models/sensitivity_analysis/parameter/model_' + str(model_id) +'_' + period +'_sample_' + str(sample) + '.pkl', 'rb') as file:
                model_sample = pickle.load(file)        
            # summarize_results(model_variable)
            model_sample_r2_cs = model_sample.pseudo_rsquared('cs')
            model_sample_r2_mcf = model_sample.pseudo_rsquared('mcf')
            model_sample_llf = model_sample.llf
            model_sample_aic = model_sample.aic
            model_sample_bic = model_sample.bic_llf
            
            r2_cs_difference = model_sample_r2_cs - model_reference_r2_cs
            r2_mcf_difference = model_sample_r2_mcf - model_reference_r2_mcf
            llf_difference = model_sample_llf - model_reference_llf
            aic_difference = model_sample_aic - model_reference_aic
            bic_difference = model_sample_bic - model_reference_bic
            statistic_row = {'model_id':model_id, 'period': period, 'sample':sample, 'r2_cs_difference':r2_cs_difference, 'r2_mcf_difference': r2_mcf_difference, 'llf_difference': llf_difference, 'aic_difference': aic_difference, 'bic_difference': bic_difference}
            new_row = pd.DataFrame([statistic_row])
            df_difference_statistics = pd.concat([df_difference_statistics, new_row], ignore_index=True)
            
            for variable in model_sample.params.index:
                coefficient_difference = model_sample.params[variable] - model_reference.params[variable]
                coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
                rate_ratio_difference = np.exp(model_sample.params[variable]) - np.exp(model_reference.params[variable])
                coefficient_row = {'model_id': model_id, 'period': period, 'sample': sample, 'variable': variable, 'coefficient_difference': coefficient_difference, 'coefficient_relative_difference': coefficient_relative_difference, 'rate_ratio_difference': rate_ratio_difference}
                new_row = pd.DataFrame([coefficient_row])
                df_difference_coefficients = pd.concat([df_difference_coefficients, new_row], ignore_index=True)


*** Period:  2020_1
\*** Model 9
===>Full model:

*** Sample:  0

*** Sample:  1

*** Sample:  2

*** Sample:  3

*** Sample:  4

*** Sample:  5

*** Sample:  6

*** Sample:  7

*** Sample:  8

*** Sample:  9

*** Sample:  10

*** Sample:  11

*** Sample:  12

*** Sample:  13

*** Sample:  14

*** Sample:  15

*** Sample:  16

*** Sample:  17

*** Sample:  18

*** Sample:  19

*** Sample:  20

*** Sample:  21

*** Sample:  22

*** Sample:  23

*** Sample:  24

*** Sample:  25

*** Sample:  26

*** Sample:  27

*** Sample:  28

*** Sample:  29

*** Period:  2020
\*** Model 9
===>Full model:

*** Sample:  0

*** Sample:  1

*** Sample:  2

*** Sample:  3

*** Sample:  4

*** Sample:  5

*** Sample:  6

*** Sample:  7

*** Sample:  8

*** Sample:  9

*** Sample:  10

*** Sample:  11

*** Sample:  12

*** Sample:  13

*** Sample:  14

*** Sample:  15

*** Sample:  16

*** Sample:  17

*** Sample:  18

*** Sample:  19

*** Sample:  20

*** Sample:  21

*** Sample:  22

*** Sample:  23

***

  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]



*** Sample:  1

*** Sample:  2

*** Sample:  3


  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]



*** Sample:  4

*** Sample:  5

*** Sample:  6


  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]



*** Sample:  7

*** Sample:  8

*** Sample:  9

*** Sample:  10


  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]



*** Sample:  11

*** Sample:  12

*** Sample:  13

*** Sample:  14


  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]



*** Sample:  15

*** Sample:  16

*** Sample:  17

*** Sample:  18


  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]



*** Sample:  19

*** Sample:  20

*** Sample:  21

*** Sample:  22


  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]



*** Sample:  23

*** Sample:  24

*** Sample:  25

*** Sample:  26


  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]



*** Sample:  27

*** Sample:  28

*** Sample:  29

*** Period:  2020
\*** Model 10
===>Full model:

*** Sample:  0

*** Sample:  1

*** Sample:  2


  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]



*** Sample:  3

*** Sample:  4

*** Sample:  5

*** Sample:  6


  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]



*** Sample:  7

*** Sample:  8

*** Sample:  9


  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]



*** Sample:  10

*** Sample:  11

*** Sample:  12


  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]



*** Sample:  13

*** Sample:  14

*** Sample:  15

*** Sample:  16


  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]



*** Sample:  17

*** Sample:  18


  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]



*** Sample:  19

*** Sample:  20


  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]



*** Sample:  21

*** Sample:  22

*** Sample:  23


  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]



*** Sample:  24

*** Sample:  25

*** Sample:  26

*** Sample:  27

*** Sample:  28


  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]
  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]



*** Sample:  29

*** Period:  2021
\*** Model 10
===>Full model:

*** Sample:  0


  coefficient_relative_difference = coefficient_difference / model_reference.params[variable]



*** Sample:  1

*** Sample:  2

*** Sample:  3

*** Sample:  4

*** Sample:  5

*** Sample:  6

*** Sample:  7

*** Sample:  8

*** Sample:  9

*** Sample:  10

*** Sample:  11

*** Sample:  12

*** Sample:  13

*** Sample:  14

*** Sample:  15

*** Sample:  16

*** Sample:  17

*** Sample:  18

*** Sample:  19

*** Sample:  20

*** Sample:  21

*** Sample:  22

*** Sample:  23

*** Sample:  24

*** Sample:  25

*** Sample:  26

*** Sample:  27

*** Sample:  28

*** Sample:  29

*** Period:  2022
\*** Model 10
===>Full model:

*** Sample:  0

*** Sample:  1

*** Sample:  2

*** Sample:  3

*** Sample:  4

*** Sample:  5

*** Sample:  6

*** Sample:  7

*** Sample:  8

*** Sample:  9

*** Sample:  10

*** Sample:  11

*** Sample:  12

*** Sample:  13

*** Sample:  14

*** Sample:  15

*** Sample:  16

*** Sample:  17

*** Sample:  18

*** Sample:  19

*** Sample:  20

*** Sample:  21

*** Sample:  22

*** Sample:  23

*** Sample:  24

*** Sample:  25

*** Sample:  26

*** Sample:  27



In [22]:
df_difference_statistics['r2_cs_difference_absolute'] = df_difference_statistics['r2_cs_difference'].abs()
df_difference_statistics.groupby(['model_id','period'])['r2_cs_difference'].describe().round(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
model_id,period,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
9,2020,30.0,-0.0,0.005,-0.009,-0.004,0.0,0.004,0.005
9,2020_1,30.0,-0.001,0.008,-0.017,-0.006,-0.001,0.006,0.013
9,2020_2022,30.0,0.003,0.016,-0.026,-0.004,0.002,0.013,0.053
9,2021,30.0,0.003,0.01,-0.018,-0.001,0.003,0.011,0.022
9,2022,30.0,-0.001,0.012,-0.029,-0.007,-0.001,0.006,0.023
10,2020,30.0,0.002,0.008,-0.017,-0.004,0.002,0.008,0.026
10,2020_1,30.0,0.003,0.009,-0.015,-0.003,0.002,0.009,0.019
10,2020_2022,30.0,0.0,0.012,-0.018,-0.006,-0.003,0.003,0.028
10,2021,30.0,-0.004,0.014,-0.036,-0.012,-0.001,0.006,0.015
10,2022,30.0,-0.005,0.016,-0.035,-0.015,-0.008,0.007,0.03


In [24]:
list_period_labels = ['2020 (first half)','2020','2021','2022','2020-2022']

In [30]:
final_summary_df = pd.DataFrame()
for statistic_difference in ['r2_cs_difference','r2_mcf_difference', 'llf_difference', 'aic_difference', 'bic_difference']:
    summary_df = df_difference_statistics.groupby(['model_id', 'period'])[statistic_difference].apply(util.calculate_95_ci).apply(pd.Series).reset_index()
    summary_df.columns = ['model_id', 'period', 'mean', 'lower_ci', 'upper_ci']
    
    summary_df['mean'] = summary_df['mean'].round(3)
    summary_df['lower_ci'] = summary_df['lower_ci'].round(3)
    summary_df['upper_ci'] = summary_df['upper_ci'].round(3)
    
    summary_df['CI'] = summary_df.apply(lambda row: f"{row['mean']}\n({row['lower_ci']}, {row['upper_ci']})", axis=1)
    
    summary_df_pivoted = summary_df[['model_id', 'period','CI']].sort_values('period').pivot(index='model_id', columns='period', values='CI')
    
    summary_df_pivoted = summary_df_pivoted.rename(columns={'2020_1': '2020 (first half)', '2020_2022': '2020-2022'})
    
    summary_df_pivoted = summary_df_pivoted[list_period_labels].reset_index(drop=False)
    summary_df_pivoted['statistic_difference'] = statistic_difference
    
    final_summary_df = pd.concat([final_summary_df, summary_df_pivoted], ignore_index=True)
final_summary_df = final_summary_df[['model_id','statistic_difference', '2020 (first half)', '2020', '2021', '2022', '2020-2022']]
final_summary_df.loc[final_summary_df['statistic_difference'] == 'r2_cs_difference', 'statistic_difference'] = '$R_{CS}^{2}$'
final_summary_df.loc[final_summary_df['statistic_difference'] == 'r2_mcf_difference', 'statistic_difference'] = '$R_{McF}^{2}$'
final_summary_df.loc[final_summary_df['statistic_difference'] == 'llf_difference', 'statistic_difference'] = 'LL'
final_summary_df.loc[final_summary_df['statistic_difference'] == 'aic_difference', 'statistic_difference'] = 'AIC'
final_summary_df.loc[final_summary_df['statistic_difference'] == 'bic_difference', 'statistic_difference'] = 'BIC'
final_summary_df.to_csv('data/output/df_sensitivity_analysis_bootstrap_statistic_difference.csv', index=False)

In [31]:
final_summary_df

period,model_id,statistic_difference,2020 (first half),2020,2021,2022,2020-2022
0,9,$R_{CS}^{2}$,"-0.001\n(-0.004, 0.002)","-0.0\n(-0.002, 0.001)","0.003\n(-0.001, 0.007)","-0.001\n(-0.005, 0.004)","0.003\n(-0.003, 0.009)"
1,10,$R_{CS}^{2}$,"0.003\n(-0.001, 0.006)","0.002\n(-0.001, 0.005)","-0.004\n(-0.009, 0.001)","-0.005\n(-0.011, 0.001)","0.0\n(-0.004, 0.004)"
2,9,$R_{McF}^{2}$,"-0.0\n(-0.001, 0.001)","-0.0\n(-0.0, 0.0)","0.001\n(-0.0, 0.001)","-0.0\n(-0.001, 0.001)","0.001\n(-0.001, 0.002)"
3,10,$R_{McF}^{2}$,"0.001\n(-0.0, 0.002)","0.0\n(-0.0, 0.001)","-0.001\n(-0.003, 0.0)","-0.002\n(-0.004, 0.0)","0.0\n(-0.001, 0.001)"
4,9,LL,"40.952\n(-15.481, 97.384)","2.851\n(-43.847, 49.55)","-37.886\n(-70.282, -5.49)","14.376\n(-28.49, 57.241)","-2.734\n(-45.532, 40.063)"
5,10,LL,"-5.764\n(-53.605, 42.077)","11.607\n(-29.671, 52.885)","-23.684\n(-56.712, 9.344)","-17.178\n(-49.803, 15.446)","43.365\n(9.003, 77.728)"
6,9,AIC,"-81.903\n(-194.768, 30.961)","-5.703\n(-99.1, 87.694)","75.773\n(10.981, 140.565)","-28.751\n(-114.483, 56.98)","5.468\n(-80.127, 91.063)"
7,10,AIC,"11.528\n(-84.155, 107.21)","-23.214\n(-105.77, 59.342)","47.369\n(-18.688, 113.425)","34.357\n(-30.893, 99.606)","-86.731\n(-155.456, -18.006)"
8,9,BIC,"-81.903\n(-194.768, 30.961)","-5.703\n(-99.1, 87.694)","75.773\n(10.981, 140.565)","-28.751\n(-114.483, 56.98)","5.468\n(-80.127, 91.063)"
9,10,BIC,"11.528\n(-84.155, 107.21)","-23.214\n(-105.77, 59.342)","47.369\n(-18.688, 113.425)","34.357\n(-30.893, 99.606)","-86.731\n(-155.456, -18.006)"


In [32]:
df_difference_coefficients['coefficient_relative_difference_absolute'] = df_difference_coefficients['coefficient_relative_difference'].abs()

In [33]:
df_difference_coefficients.groupby(['model_id','period','variable'])[['coefficient_relative_difference_absolute']].mean().round(3).reset_index().sort_values(by=['model_id','period','coefficient_relative_difference_absolute'], ascending=[True, True, False])['variable'].unique()

array(['Semi-urbanized', 'Rural with low human development',
       'Urbanized with informal settlements',
       'Rural with high human development', 'const',
       'percentage_workers_industry', 'percentage_male_population',
       'expected_years_of_schooling_at_age_18',
       'percentage_hospitalizations_diseases_inadequate_sanitation',
       'demographic_density_in_informal_settlements',
       'percentage_urban_population', 'gini',
       'percentage_workers_services', 'percentage_indigenous_population',
       'percentage_self_employed_workers', 'percentage_workers_commerce',
       'percentage_estimated_households_in_informal_settlements',
       'percentage_votes_for_bolsonaro',
       'density_median_effectively_domiciled_area', 'unemployment_rate',
       'percentage_population_age_range_60_more',
       '% people fully vaccinated'], dtype=object)

In [34]:
df_difference_coefficients['variable'].unique()

array(['const', 'Semi-urbanized', 'Rural with high human development',
       'Urbanized with informal settlements',
       'Rural with low human development',
       'percentage_population_age_range_60_more',
       'percentage_urban_population', 'percentage_male_population',
       'percentage_indigenous_population',
       'density_median_effectively_domiciled_area', 'gini',
       'percentage_estimated_households_in_informal_settlements',
       'demographic_density_in_informal_settlements',
       'percentage_hospitalizations_diseases_inadequate_sanitation',
       'percentage_self_employed_workers', 'unemployment_rate',
       'percentage_workers_commerce', 'percentage_workers_services',
       'percentage_workers_industry',
       'expected_years_of_schooling_at_age_18',
       'percentage_votes_for_bolsonaro', '% people fully vaccinated'],
      dtype=object)

In [36]:
summary_df

Unnamed: 0,period,variable,mean,lower_ci,upper_ci,CI
0,2020,% people fully vaccinated,0.000,0.000,0.000,"0.000\n(0.000, 0.000)"
1,2020,const,-0.002,-0.006,0.003,"-0.002\n(-0.006, 0.003)"
2,2020,demographic_density_in_informal_settlements,0.001,-0.001,0.003,"0.001\n(-0.001, 0.003)"
3,2020,density_median_effectively_domiciled_area,-0.003,-0.006,-0.001,"-0.003\n(-0.006, -0.001)"
4,2020,expected_years_of_schooling_at_age_18,-0.003,-0.007,0.001,"-0.003\n(-0.007, 0.001)"
...,...,...,...,...,...,...
85,2022,percentage_votes_for_bolsonaro,-0.003,-0.006,0.001,"-0.003\n(-0.006, 0.001)"
86,2022,percentage_workers_commerce,0.000,-0.004,0.005,"0.000\n(-0.004, 0.005)"
87,2022,percentage_workers_industry,0.003,-0.002,0.007,"0.003\n(-0.002, 0.007)"
88,2022,percentage_workers_services,-0.001,-0.005,0.002,"-0.001\n(-0.005, 0.002)"


In [38]:
summary_df = df_difference_coefficients[df_difference_coefficients['model_id']==10].groupby(['period','variable'])['coefficient_difference'].apply(util.calculate_95_ci).apply(pd.Series).reset_index()

summary_df.columns = ['period','variable', 'mean', 'lower_ci', 'upper_ci']

summary_df['mean'] = summary_df['mean'].apply(lambda x: f"{x:.3f}")
summary_df['lower_ci'] = summary_df['lower_ci'].apply(lambda x: f"{x:.3f}")
summary_df['upper_ci'] = summary_df['upper_ci'].apply(lambda x: f"{x:.3f}")

summary_df['CI'] = summary_df.apply(lambda row: f"{row['mean']}\n({row['lower_ci']}, {row['upper_ci']})", axis=1)

final_summary_df = summary_df[['period','variable','CI']].sort_values('period').pivot(index='variable', columns='period', values='CI')

final_summary_df = final_summary_df.rename(columns={'2020_1': '2020 (first half)', '2020_2022': '2020-2022'})

final_summary_df = final_summary_df[list_period_labels].reset_index()

final_summary_df = final_summary_df.set_index('variable')

final_summary_df = final_summary_df.loc[['const','percentage_population_age_range_60_more',
       'percentage_urban_population', 'density_median_effectively_domiciled_area',
       'percentage_male_population', 'percentage_indigenous_population', 'gini',
       'percentage_estimated_households_in_informal_settlements',
       'demographic_density_in_informal_settlements',
       'percentage_hospitalizations_diseases_inadequate_sanitation',
       'percentage_self_employed_workers', 'unemployment_rate',
       'percentage_workers_commerce', 'percentage_workers_services',
       'percentage_workers_industry', 'expected_years_of_schooling_at_age_18',
       '% people fully vaccinated','percentage_votes_for_bolsonaro']].copy()

final_summary_df.index = ['Intercept', '% population 60+ years', '% urban population', 'Median density of effectively domiciled areas (inhabitants/km²)', '% male population', '% indigenous population', 'Gini coefficient', '% informal settlement households', 'Population density in informal settlement (inhabitants/ha)', '% sanitation-related hospitalizations', '% self-employed workers', 'Unemployment rate', '% commerce workers', '% service workers', '% industry workers', 'Expected years of schooling at age 18','% people fully vaccinated','% votes for Bolsonaro']

final_summary_df.to_csv('data/df_sensitivity_analysis_bootstrap_coefficients_difference_model_10.csv', index=True)

In [42]:
summary_df = df_difference_coefficients[df_difference_coefficients['model_id']==9].groupby(['period','variable'])['coefficient_difference'].apply(util.calculate_95_ci).apply(pd.Series).reset_index()

summary_df.columns = ['period','variable', 'mean', 'lower_ci', 'upper_ci']

summary_df['mean'] = summary_df['mean'].apply(lambda x: f"{x:.3f}")
summary_df['lower_ci'] = summary_df['lower_ci'].apply(lambda x: f"{x:.3f}")
summary_df['upper_ci'] = summary_df['upper_ci'].apply(lambda x: f"{x:.3f}")

summary_df['CI'] = summary_df.apply(lambda row: f"{row['mean']}\n({row['lower_ci']}, {row['upper_ci']})", axis=1)

final_summary_df = summary_df[['period','variable','CI']].sort_values('period').pivot(index='variable', columns='period', values='CI')

final_summary_df = final_summary_df.rename(columns={'2020_1': '2020 (first half)', '2020_2022': '2020-2022'})

final_summary_df = final_summary_df[list_period_labels].reset_index()

final_summary_df = final_summary_df.set_index('variable')

final_summary_df = final_summary_df.loc[['const','Semi-urbanized', 'Rural with high human development', 'Urbanized with informal settlements', 'Rural with low human development']].copy()

final_summary_df.index = ['Intercept', 'Semi-urbanized', 'Rural with high human development', 'Urbanized with informal settlements', 'Rural with low human development']

final_summary_df.to_csv('data/df_sensitivity_analysis_bootstrap_coefficients_difference_model_9.csv', index=True)

In [100]:
summary_df = df_difference_coefficients[df_difference_coefficients['model_id']==3].groupby(['period','variable'])['rate_ratio_difference'].apply(util.calculate_95_ci).apply(pd.Series).reset_index()

In [102]:

summary_df.columns = ['period','variable', 'mean', 'lower_ci', 'upper_ci']

summary_df['mean'] = summary_df['mean'].round(3)
summary_df['lower_ci'] = summary_df['lower_ci'].round(3)
summary_df['upper_ci'] = summary_df['upper_ci'].round(3)

summary_df['CI'] = summary_df.apply(lambda row: f"{row['mean']}\n({row['lower_ci']}, {row['upper_ci']})", axis=1)

final_summary_df = summary_df[['period','variable','CI']].sort_values('period').pivot(index='variable', columns='period', values='CI')

final_summary_df = final_summary_df.rename(columns={'2020_1': '2020 (first half)', '2020_2022': '2020-2022'})

final_summary_df = final_summary_df[list_period_labels].reset_index()

final_summary_df = final_summary_df.set_index('variable')

In [103]:
final_summary_df

period,2020 (first half),2020,2021,2022,2020-2022
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Rural with high human development,"-0.003\n(-0.01, 0.005)","0.003\n(-0.001, 0.007)","0.001\n(-0.002, 0.003)","-0.002\n(-0.006, 0.003)","0.001\n(-0.001, 0.004)"
Rural with low human development,"-0.005\n(-0.009, -0.0)","-0.0\n(-0.002, 0.002)","0.0\n(-0.002, 0.002)","0.003\n(-0.001, 0.006)","0.002\n(-0.0, 0.004)"
Semi-urbanized,"0.002\n(-0.003, 0.007)","-0.001\n(-0.003, 0.001)","0.001\n(-0.001, 0.002)","-0.001\n(-0.003, 0.001)","-0.0\n(-0.001, 0.0)"
Urbanized,"0.003\n(-0.001, 0.007)","-0.001\n(-0.003, 0.001)","-0.001\n(-0.003, 0.001)","0.0\n(-0.003, 0.003)","-0.001\n(-0.002, 0.0)"
Urbanized with informal settlements,"-0.001\n(-0.005, 0.004)","-0.001\n(-0.003, 0.001)","-0.0\n(-0.002, 0.001)","0.0\n(-0.002, 0.003)","-0.001\n(-0.003, 0.001)"
const,"-0.0\n(-0.0, 0.0)","0.0\n(-0.0, 0.0)","0.0\n(-0.0, 0.0)","0.0\n(-0.0, 0.0)","0.0\n(-0.0, 0.0)"
delta_first_death_period,"0.017\n(-0.0, 0.035)","0.003\n(-0.005, 0.012)","0.003\n(-0.002, 0.008)","-0.015\n(-0.03, 0.0)","0.001\n(-0.002, 0.004)"


In [104]:
final_summary_df = final_summary_df.loc[['const',
                                         'Urbanized',
                                         'Urbanized with informal settlements',
                                         'Semi-urbanized',
                                         'Rural with high human development',
                                         'Rural with low human development',
                                         'delta_first_death_period']].copy()

final_summary_df.to_csv('data/df_sensitivity_analysis_bootstrap_rate_ratio_difference_model_3.csv', index=True)

## Outlier Sensitivity Analysis

### Model 9: Mortality rate ratio using 'Urbanized' as the reference group

In [49]:
df_analysis_cluster = pd.DataFrame()
df_analysis_cluster['Sociodemographic cluster'] = ['Urbanized','Urbanized with informal settlements','Semi-urbanized','Rural with high human development','Rural with low human development','DF','Residual DF','Deviance',"\chi^2",'$R_{CS}^{2}$','$R_{McF}^{2}$','LL','AIC', 'BIC']
df_analysis_cluster = df_analysis_cluster.set_index('Sociodemographic cluster')

list_periods = ['2020_1','2020', '2021', '2022', '2020_2022']
list_death_rate_columns = ['Death rate (1/2020)', 'Death rate (2020)', 'Death rate (2021)', 'Death rate (2022)', 'Death rate (accumulated period)']
list_period_labels = ['2020 (first half)','2020','2021','2022','2020-2022']

for i in range(len(list_periods)):
    period = list_periods[i]
    period_label = list_period_labels[i]
    death_rate_column = list_death_rate_columns[i]

    model = 9
    model_label = str(model)
    model_file = str(model)
    with open('models/sensitivity_analysis/outliers/model_'+model_file+'_'+period+'.pkl', 'rb') as file:
        model = pickle.load(file)

    # Extract the coefficients and standard errors
    params = model.params[1:5]
    conf = model.conf_int()[1:5]
    conf.columns = ['Lower CI', 'Upper CI']
    
    # Calculate the rate ratios and their confidence intervals
    rate_ratios = np.exp(params)
    conf['Lower CI'] = np.exp(conf['Lower CI'])
    conf['Upper CI'] = np.exp(conf['Upper CI'])
    
    # Combine into a single DataFrame
    rate_ratio_df = pd.DataFrame({
        'Rate Ratio': rate_ratios,
        'Lower CI': conf['Lower CI'],
        'Upper CI': conf['Upper CI']
    }) 
    
    rate_ratio_df.index = rate_ratio_df.index.astype(str).str.replace('cluster_label_','')
    
    rate_ratio_df = rate_ratio_df.round(2)
    
    column_rate_ratio = period_label+' - RR (95% CI) - Model '+model_label
    df_analysis_cluster[column_rate_ratio] = rate_ratio_df['Rate Ratio'].astype(str) + '\n' + '('+rate_ratio_df['Lower CI'].astype(str) + '-'+rate_ratio_df['Upper CI'].astype(str) + ')'        
    
    df_analysis_cluster.loc['Residual DF', column_rate_ratio] = model.df_resid
    df_analysis_cluster.loc['Deviance', column_rate_ratio] = round(model.deviance,2)
    df_analysis_cluster.loc["\chi^2", column_rate_ratio] = round(model.pearson_chi2,2)
    df_analysis_cluster.loc['$R_{CS}^{2}$', column_rate_ratio] = round(model.pseudo_rsquared('cs'),2)
    df_analysis_cluster.loc['$R_{McF}^{2}$', column_rate_ratio] = round(model.pseudo_rsquared('mcf'),2)
    df_analysis_cluster.loc['$R_{McF}^{2}$', column_rate_ratio] = round(model.pseudo_rsquared('mcf'),2)
    df_analysis_cluster.loc['LL', column_rate_ratio] = int(round(model.llf,0))
    df_analysis_cluster.loc['AIC', column_rate_ratio] = int(round(model.aic,0))
    df_analysis_cluster.loc['BIC', column_rate_ratio] = int(round(model.bic_llf,0))
    df_analysis_cluster.loc['DF', column_rate_ratio] = round(model.df_model)

    df_analysis_cluster[column_rate_ratio] = df_analysis_cluster[column_rate_ratio].fillna('1 [Reference]')

df_analysis_cluster.to_csv('data/output/df_analysis_cluster_outlier_sensitivity_9.csv', index=True)

### Model 10: Variables

In [52]:
df_results = pd.DataFrame()
df_results['Variable'] = ['const','percentage_population_age_range_60_more',
       'percentage_urban_population', 'density_median_effectively_domiciled_area',
       'percentage_male_population', 'percentage_indigenous_population', 'gini',
       'percentage_estimated_households_in_informal_settlements',
       'demographic_density_in_informal_settlements',
       'percentage_hospitalizations_diseases_inadequate_sanitation',
       'percentage_self_employed_workers', 'unemployment_rate',
       'percentage_workers_commerce', 'percentage_workers_services',
       'percentage_workers_industry', 'expected_years_of_schooling_at_age_18',
       '% people fully vaccinated','percentage_votes_for_bolsonaro','DF','Residual DF','Deviance',"\chi^2",'$R_{CS}^{2}$','$R_{McF}^{2}$','LL','AIC', 'BIC']
df_results = df_results.set_index('Variable')

list_periods = ['2020_1','2020', '2021', '2022', '2020_2022']
list_death_rate_columns = ['Death rate (1/2020)', 'Death rate (2020)', 'Death rate (2021)', 'Death rate (2022)', 'Death rate (accumulated period)']
list_period_labels = ['2020 (first half)','2020','2021','2022','2020-2022']

for i in range(len(list_periods)):
    period = list_periods[i]
    period_label = list_period_labels[i]
    death_rate_column = list_death_rate_columns[i]

    model = 10
    model_label = str(model)
    model_file = str(model)
    with open('models/sensitivity_analysis/outliers/model_'+model_file+'_'+period+'.pkl', 'rb') as file:
        model = pickle.load(file)

    # Extract the coefficients and standard errors
    params = model.params[:]
    conf = model.conf_int()[:]
    conf.columns = ['Lower CI', 'Upper CI']

    # Calculate the rate ratios and their confidence intervals
    rate_ratios = np.exp(params)
    conf['Lower CI'] = np.exp(conf['Lower CI'])
    conf['Upper CI'] = np.exp(conf['Upper CI'])

    # Combine into a single DataFrame
    rate_ratio_df = pd.DataFrame({
        'Rate Ratio': rate_ratios,
        'Lower CI': conf['Lower CI'],
        'Upper CI': conf['Upper CI']
    })

    rate_ratio_df.index = rate_ratio_df.index.astype(str).str.replace('cluster_label_','')

    rate_ratio_df = rate_ratio_df.round(2)

    column_rate_ratio = period_label+' - RR (95% CI) - Model '+model_label
    df_results[column_rate_ratio] = rate_ratio_df['Rate Ratio'].astype(str) + '\n' + '('+rate_ratio_df['Lower CI'].astype(str) + '-'+rate_ratio_df['Upper CI'].astype(str) + ')'

    df_results.loc['Residual DF', column_rate_ratio] = model.df_resid
    df_results.loc['Deviance', column_rate_ratio] = round(model.deviance,2)
    df_results.loc["\chi^2", column_rate_ratio] = round(model.pearson_chi2,2)
    df_results.loc['$R_{CS}^{2}$', column_rate_ratio] = round(model.pseudo_rsquared('cs'),2)
    df_results.loc['$R_{McF}^{2}$', column_rate_ratio] = round(model.pseudo_rsquared('mcf'),2)
    df_results.loc['$R_{McF}^{2}$', column_rate_ratio] = round(model.pseudo_rsquared('mcf'),2)
    df_results.loc['LL', column_rate_ratio] = int(round(model.llf,0))
    df_results.loc['AIC', column_rate_ratio] = int(round(model.aic,0))
    df_results.loc['BIC', column_rate_ratio] = int(round(model.bic_llf,0))
    df_results.loc['DF', column_rate_ratio] = round(model.df_model)

df_results.index = ['Intercept', '% population 60+ years', '% urban population', 'Median density of effectively domiciled areas (inhabitants/km²)', '% male population', '% indigenous population', 'Gini coefficient', '% informal settlement households', 'Population density in informal settlement (inhabitants/ha)', '% sanitation-related hospitalizations', '% self-employed workers', 'Unemployment rate', '% commerce workers', '% service workers', '% industry workers', 'Expected years of schooling at age 18','% people fully vaccinated','% votes for Bbolsonaro','DF','Residual DF','Deviance',"\chi^2",'$R_{CS}^{2}$','$R_{McF}^{2}$','LL','AIC', 'BIC']

df_results.to_csv('data/output/df_analysis_model_10_sensitivity_analysis.csv', index=True)