# Statistical analysis

In [1]:
import pandas as pd
import numpy as np
import pickle

from util import calculate_95_ci

## Loading data

In [2]:
df_deaths = pd.read_csv('data/output/df_mortality.csv', index_col=0)
df_labeled_cluster = pd.read_csv('data/output/df_labeled_cluster.csv', index_col=0)
df_deaths['cluster_label'] = df_labeled_cluster['cluster_label']

## Mortality analysis by cluster

### Mortality rate ratio using 'Urbanized' as the reference group

In [3]:
df_analysis_cluster = pd.DataFrame()
df_analysis_cluster['Sociodemographic cluster'] = ['Urbanized','Urbanized with informal settlements','Semi-urbanized','Rural with high human development','Rural with low human development','Residual degrees of freedom','Deviance','Pearson chi-squared','$R_{CU}^{2}$','AIC']
df_analysis_cluster = df_analysis_cluster.set_index('Sociodemographic cluster')

list_periods = ['2020_1','2020', '2021', '2022', '2020_2022']
list_death_rate_columns = ['Death rate (1/2020)', 'Death rate (2020)', 'Death rate (2021)', 'Death rate (2022)', 'Death rate (accumulated period)']
list_period_labels = ['2020 (first half)','2020','2021','2022','2020-2022']

for i in range(len(list_periods)):
    period = list_periods[i]
    period_label = list_period_labels[i]
    death_rate_column = list_death_rate_columns[i]
    
    mortality_rate = df_deaths.groupby('cluster_label')[death_rate_column].mean()
    ci_data = df_deaths.groupby('cluster_label')[death_rate_column].apply(calculate_95_ci)
    df_analysis_cluster[['Mean', 'Lower CI', 'Upper CI']] = pd.DataFrame(ci_data.tolist(), index=ci_data.index).round(2)
    df_analysis_cluster[period_label+' - Death rate (95% CI)'] = df_analysis_cluster['Mean'].astype(str) + '\n' + '('+df_analysis_cluster['Lower CI'].astype(str) + '-'+df_analysis_cluster['Upper CI'].astype(str) + ')'
    df_analysis_cluster = df_analysis_cluster.drop(columns=['Mean', 'Lower CI', 'Upper CI'])
    
    for model in [1,2]:
        model_label = str(model)
        model_file = str(model)
        with open('models/model_'+model_file+'_'+period+'.pkl', 'rb') as file:
            model = pickle.load(file)

        # Extract the coefficients and standard errors
        params = model.params[1:5]
        conf = model.conf_int()[1:5]
        conf.columns = ['Lower CI', 'Upper CI']
        
        # Calculate the rate ratios and their confidence intervals
        rate_ratios = np.exp(params)
        conf['Lower CI'] = np.exp(conf['Lower CI'])
        conf['Upper CI'] = np.exp(conf['Upper CI'])
        
        # Combine into a single DataFrame
        rate_ratio_df = pd.DataFrame({
            'Rate Ratio': rate_ratios,
            'Lower CI': conf['Lower CI'],
            'Upper CI': conf['Upper CI']
        }) 
        
        rate_ratio_df.index = rate_ratio_df.index.astype(str).str.replace('cluster_label_','')
        
        rate_ratio_df = rate_ratio_df.round(2)
        
        column_rate_ratio = period_label+' - RR (95% CI) - Model '+model_label
        df_analysis_cluster[column_rate_ratio] = rate_ratio_df['Rate Ratio'].astype(str) + '\n' + '('+rate_ratio_df['Lower CI'].astype(str) + '-'+rate_ratio_df['Upper CI'].astype(str) + ')'        
        
        df_analysis_cluster.loc['Residual degrees of freedom', column_rate_ratio] = model.df_resid
        df_analysis_cluster.loc['Deviance', column_rate_ratio] = round(model.deviance,2)
        df_analysis_cluster.loc['Pearson chi-squared', column_rate_ratio] = round(model.pearson_chi2,2)
        df_analysis_cluster.loc['$R_{CU}^{2}$', column_rate_ratio] = round(model.pseudo_rsquared('cs'),2)
        df_analysis_cluster.loc['AIC', column_rate_ratio] = round(model.aic,2)
        
        
        df_analysis_cluster[column_rate_ratio] = df_analysis_cluster[column_rate_ratio].fillna('1 [Reference]')

df_analysis_cluster.to_csv('data/output/df_analysis_cluster.csv', index=True)

In [4]:
df_analysis_cluster

Unnamed: 0_level_0,2020 (first half) - Death rate (95% CI),2020 (first half) - RR (95% CI) - Model 1,2020 (first half) - RR (95% CI) - Model 2,2020 - Death rate (95% CI),2020 - RR (95% CI) - Model 1,2020 - RR (95% CI) - Model 2,2021 - Death rate (95% CI),2021 - RR (95% CI) - Model 1,2021 - RR (95% CI) - Model 2,2022 - Death rate (95% CI),2022 - RR (95% CI) - Model 1,2022 - RR (95% CI) - Model 2,2020-2022 - Death rate (95% CI),2020-2022 - RR (95% CI) - Model 1,2020-2022 - RR (95% CI) - Model 2
Sociodemographic cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Urbanized,9.24\n(8.56-9.92),1 [Reference],1 [Reference],68.86\n(66.9-70.83),1 [Reference],1 [Reference],243.29\n(239.23-247.35),1 [Reference],1 [Reference],41.82\n(40.58-43.07),1 [Reference],1 [Reference],353.97\n(348.7-359.25),1 [Reference],1 [Reference]
Urbanized with informal settlements,47.19\n(42.74-51.64),1.36\n(1.13-1.63),1.01\n(0.87-1.16),115.8\n(110.31-121.3),1.06\n(0.96-1.16),1.04\n(0.96-1.13),192.68\n(184.22-201.14),0.79\n(0.73-0.84),0.81\n(0.76-0.86),27.89\n(26.19-29.6),0.9\n(0.83-0.97),0.95\n(0.89-1.01),336.37\n(323.17-349.58),0.85\n(0.8-0.9),0.85\n(0.8-0.9)
Semi-urbanized,17.88\n(16.83-18.93),1.87\n(1.63-2.14),1.46\n(1.29-1.65),63.41\n(61.57-65.26),1.23\n(1.15-1.31),1.13\n(1.07-1.2),123.63\n(120.71-126.54),0.73\n(0.7-0.76),0.76\n(0.72-0.79),22.93\n(22.13-23.74),0.84\n(0.79-0.89),0.88\n(0.83-0.93),209.97\n(206.09-213.85),0.84\n(0.81-0.87),0.82\n(0.79-0.85)
Rural with high human development,4.58\n(3.72-5.43),1.01\n(0.83-1.23),1.21\n(1.0-1.46),52.26\n(48.76-55.77),1.08\n(1.0-1.18),1.22\n(1.13-1.32),193.64\n(186.67-200.62),0.95\n(0.9-1.0),0.98\n(0.93-1.03),37.97\n(35.51-40.43),0.93\n(0.86-1.01),1.03\n(0.95-1.11),283.88\n(275.03-292.72),1.0\n(0.95-1.04),1.03\n(0.99-1.08)
Rural with low human development,30.19\n(27.0-33.39),2.65\n(2.12-3.32),1.47\n(1.21-1.79),64.4\n(59.79-69.01),1.29\n(1.15-1.44),1.06\n(0.95-1.17),83.94\n(78.38-89.51),0.58\n(0.54-0.63),0.61\n(0.56-0.66),11.35\n(10.2-12.5),0.64\n(0.57-0.73),0.74\n(0.66-0.83),159.69\n(150.64-168.75),0.74\n(0.69-0.79),0.7\n(0.65-0.74)
Residual degrees of freedom,nan\n(nan-nan),5539,5538,nan\n(nan-nan),5539,5538,nan\n(nan-nan),5539,5538,nan\n(nan-nan),5539,5538,nan\n(nan-nan),5539,5538
Deviance,nan\n(nan-nan),5337.7,4527.98,nan\n(nan-nan),6233.4,5658.02,nan\n(nan-nan),5867.71,5789.12,nan\n(nan-nan),6275.74,4681.19,nan\n(nan-nan),5715.17,5637.35
Pearson chi-squared,nan\n(nan-nan),6892.1,5781.98,nan\n(nan-nan),5856.69,5790.51,nan\n(nan-nan),5831.72,5930.56,nan\n(nan-nan),5739.46,5094.09,nan\n(nan-nan),5655.8,5728.63
$R_{CU}^{2}$,nan\n(nan-nan),0.33,0.64,nan\n(nan-nan),0.2,0.44,nan\n(nan-nan),0.53,0.63,nan\n(nan-nan),0.35,0.62,nan\n(nan-nan),0.5,0.57
AIC,nan\n(nan-nan),20391.05,18297.25,nan\n(nan-nan),32933.61,31365.19,nan\n(nan-nan),40064.49,39342.44,nan\n(nan-nan),24906.0,22587.67,nan\n(nan-nan),43358.52,42839.08


### Rate ratio for the cluster probabilities

#### Model 3

In [5]:
df_analysis_cluster_probabilities = pd.DataFrame()
df_analysis_cluster_probabilities['Variable'] = ['Urbanized (probability)','Urbanized with informal settlements (probability)','Semi-urbanized (probability)','Rural with high human development (probability)','Rural with low human development (probability)','Days since first death','Residual degrees of freedom','Deviance','Pearson chi-squared','$R_{CU}^{2}$','AIC']
df_analysis_cluster_probabilities = df_analysis_cluster_probabilities.set_index('Variable')

for i in range(len(list_periods)):
    period = list_periods[i]
    period_label = list_period_labels[i]
    
    with open('models/model_3_'+period+'.pkl', 'rb') as file:
        model = pickle.load(file)

    # Extract the coefficients and standard errors
    params = model.params[1:]
    conf = model.conf_int()[1:]
    conf.columns = ['Lower CI', 'Upper CI']
    
    # Calculate the rate ratios and their confidence intervals
    rate_ratios = np.exp(params)
    conf['Lower CI'] = np.exp(conf['Lower CI'])
    conf['Upper CI'] = np.exp(conf['Upper CI'])
    
    # Combine into a single DataFrame
    rate_ratio_df = pd.DataFrame({
        'Rate Ratio': rate_ratios,
        'Lower CI': conf['Lower CI'],
        'Upper CI': conf['Upper CI']
    })
    
    rate_ratio_df = rate_ratio_df.round(2)
    
    rate_ratio_df.index = rate_ratio_df.index.astype(str) + ' (probability)'
    index_list = rate_ratio_df.index.tolist()
    index_list[-1] = 'Days since first death'
    rate_ratio_df.index = index_list
    
    df_analysis_cluster_probabilities[period_label] = rate_ratio_df['Rate Ratio'].astype(str) + '\n' + '('+rate_ratio_df['Lower CI'].astype(str) + '-'+rate_ratio_df['Upper CI'].astype(str) + ')'
    
    df_analysis_cluster_probabilities.loc['Residual degrees of freedom', period_label] = model.df_resid
    df_analysis_cluster_probabilities.loc['Deviance', period_label] = round(model.deviance,2)
    df_analysis_cluster_probabilities.loc['Pearson chi-squared', period_label] = round(model.pearson_chi2,2)    
    df_analysis_cluster_probabilities.loc['$R_{CU}^{2}$', period_label] = round(model.pseudo_rsquared('cs'),2)
    df_analysis_cluster_probabilities.loc['AIC', period_label] = round(model.aic,2)
    
df_analysis_cluster_probabilities.to_csv('data/output/df_analysis_cluster_probabilities.csv', index=True)

In [6]:
df_analysis_cluster_probabilities

Unnamed: 0_level_0,2020 (first half),2020,2021,2022,2020-2022
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Urbanized (probability),0.82\n(0.8-0.84),1.0\n(0.99-1.01),1.17\n(1.16-1.18),1.11\n(1.1-1.12),1.14\n(1.13-1.14)
Urbanized with informal settlements (probability),1.02\n(1.0-1.04),1.03\n(1.02-1.04),1.0\n(1.0-1.01),0.94\n(0.93-0.95),1.02\n(1.01-1.03)
Semi-urbanized (probability),1.16\n(1.14-1.19),0.98\n(0.97-0.99),0.89\n(0.88-0.89),0.91\n(0.89-0.92),0.9\n(0.9-0.91)
Rural with high human development (probability),0.93\n(0.89-0.98),1.05\n(1.03-1.07),1.07\n(1.06-1.08),1.2\n(1.18-1.23),1.06\n(1.05-1.07)
Rural with low human development (probability),1.15\n(1.12-1.17),0.95\n(0.94-0.97),0.86\n(0.85-0.87),0.82\n(0.8-0.83),0.88\n(0.87-0.88)
Days since first death,2.49\n(2.41-2.58),1.67\n(1.63-1.71),1.26\n(1.24-1.28),2.26\n(2.16-2.37),1.19\n(1.17-1.21)
Residual degrees of freedom,5554,5554,5554,5554,5554
Deviance,4559.35,5748.75,5715.11,4665.64,5579.56
Pearson chi-squared,5650.48,5978.82,5825.26,5304.86,5642.19
$R_{CU}^{2}$,0.6,0.4,0.59,0.52,0.51


In [7]:
df_analysis_cluster

Unnamed: 0_level_0,2020 (first half) - Death rate (95% CI),2020 (first half) - RR (95% CI) - Model 1,2020 (first half) - RR (95% CI) - Model 2,2020 - Death rate (95% CI),2020 - RR (95% CI) - Model 1,2020 - RR (95% CI) - Model 2,2021 - Death rate (95% CI),2021 - RR (95% CI) - Model 1,2021 - RR (95% CI) - Model 2,2022 - Death rate (95% CI),2022 - RR (95% CI) - Model 1,2022 - RR (95% CI) - Model 2,2020-2022 - Death rate (95% CI),2020-2022 - RR (95% CI) - Model 1,2020-2022 - RR (95% CI) - Model 2
Sociodemographic cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Urbanized,9.24\n(8.56-9.92),1 [Reference],1 [Reference],68.86\n(66.9-70.83),1 [Reference],1 [Reference],243.29\n(239.23-247.35),1 [Reference],1 [Reference],41.82\n(40.58-43.07),1 [Reference],1 [Reference],353.97\n(348.7-359.25),1 [Reference],1 [Reference]
Urbanized with informal settlements,47.19\n(42.74-51.64),1.36\n(1.13-1.63),1.01\n(0.87-1.16),115.8\n(110.31-121.3),1.06\n(0.96-1.16),1.04\n(0.96-1.13),192.68\n(184.22-201.14),0.79\n(0.73-0.84),0.81\n(0.76-0.86),27.89\n(26.19-29.6),0.9\n(0.83-0.97),0.95\n(0.89-1.01),336.37\n(323.17-349.58),0.85\n(0.8-0.9),0.85\n(0.8-0.9)
Semi-urbanized,17.88\n(16.83-18.93),1.87\n(1.63-2.14),1.46\n(1.29-1.65),63.41\n(61.57-65.26),1.23\n(1.15-1.31),1.13\n(1.07-1.2),123.63\n(120.71-126.54),0.73\n(0.7-0.76),0.76\n(0.72-0.79),22.93\n(22.13-23.74),0.84\n(0.79-0.89),0.88\n(0.83-0.93),209.97\n(206.09-213.85),0.84\n(0.81-0.87),0.82\n(0.79-0.85)
Rural with high human development,4.58\n(3.72-5.43),1.01\n(0.83-1.23),1.21\n(1.0-1.46),52.26\n(48.76-55.77),1.08\n(1.0-1.18),1.22\n(1.13-1.32),193.64\n(186.67-200.62),0.95\n(0.9-1.0),0.98\n(0.93-1.03),37.97\n(35.51-40.43),0.93\n(0.86-1.01),1.03\n(0.95-1.11),283.88\n(275.03-292.72),1.0\n(0.95-1.04),1.03\n(0.99-1.08)
Rural with low human development,30.19\n(27.0-33.39),2.65\n(2.12-3.32),1.47\n(1.21-1.79),64.4\n(59.79-69.01),1.29\n(1.15-1.44),1.06\n(0.95-1.17),83.94\n(78.38-89.51),0.58\n(0.54-0.63),0.61\n(0.56-0.66),11.35\n(10.2-12.5),0.64\n(0.57-0.73),0.74\n(0.66-0.83),159.69\n(150.64-168.75),0.74\n(0.69-0.79),0.7\n(0.65-0.74)
Residual degrees of freedom,nan\n(nan-nan),5539,5538,nan\n(nan-nan),5539,5538,nan\n(nan-nan),5539,5538,nan\n(nan-nan),5539,5538,nan\n(nan-nan),5539,5538
Deviance,nan\n(nan-nan),5337.7,4527.98,nan\n(nan-nan),6233.4,5658.02,nan\n(nan-nan),5867.71,5789.12,nan\n(nan-nan),6275.74,4681.19,nan\n(nan-nan),5715.17,5637.35
Pearson chi-squared,nan\n(nan-nan),6892.1,5781.98,nan\n(nan-nan),5856.69,5790.51,nan\n(nan-nan),5831.72,5930.56,nan\n(nan-nan),5739.46,5094.09,nan\n(nan-nan),5655.8,5728.63
$R_{CU}^{2}$,nan\n(nan-nan),0.33,0.64,nan\n(nan-nan),0.2,0.44,nan\n(nan-nan),0.53,0.63,nan\n(nan-nan),0.35,0.62,nan\n(nan-nan),0.5,0.57
AIC,nan\n(nan-nan),20391.05,18297.25,nan\n(nan-nan),32933.61,31365.19,nan\n(nan-nan),40064.49,39342.44,nan\n(nan-nan),24906.0,22587.67,nan\n(nan-nan),43358.52,42839.08


#### Model 4 and 5

In [27]:
df_analysis_models_4_5 = pd.DataFrame()
df_analysis_models_4_5['Variables'] = ['const','Urbanized','Urbanized with informal settlements','Semi-urbanized','Rural with high human development','Rural with low human development','percentage_population_age_range_60_more',
       'percentage_urban_population', 'demographic_density',
       'percentage_male_population', 'percentage_indigenous_population',
       'per_capita_income', 'gini',
       'percentage_estimated_households_in_informal_settlements',
       'demographic_density_in_informal_settlements',
       'percentage_hospitalizations_diseases_inadequate_sanitation',
       'percentage_self_employed_workers', 'unemployment_rate',
       'percentage_workers_commerce', 'percentage_workers_services',
       'percentage_workers_industry', 'expected_years_of_schooling_at_age_18',
       'delta_first_death_period','Residual degrees of freedom','Deviance',"Pearson's chi-squared",'Cragg-Uhler R²', 'McFadden R²','Akaike Information Criterion (AIC)']
df_analysis_models_4_5 = df_analysis_models_4_5.set_index('Variables')

list_periods = ['2020_1','2020', '2021', '2022', '2020_2022']
list_death_rate_columns = ['Death rate (1/2020)', 'Death rate (2020)', 'Death rate (2021)', 'Death rate (2022)', 'Death rate (accumulated period)']
list_period_labels = ['2020 (first half)','2020','2021','2022','2020-2022']

for i in range(len(list_periods)):
    period = list_periods[i]
    period_label = list_period_labels[i]
    death_rate_column = list_death_rate_columns[i]
    
    mortality_rate = df_deaths.groupby('cluster_label')[death_rate_column].mean()
    ci_data = df_deaths.groupby('cluster_label')[death_rate_column].apply(calculate_95_ci)
    df_analysis_models_4_5[['Mean', 'Lower CI', 'Upper CI']] = pd.DataFrame(ci_data.tolist(), index=ci_data.index).round(2)
    df_analysis_models_4_5[period_label+' - Death rate (95% CI)'] = df_analysis_models_4_5['Mean'].astype(str) + '\n' + '('+df_analysis_models_4_5['Lower CI'].astype(str) + '-'+df_analysis_models_4_5['Upper CI'].astype(str) + ')'
    df_analysis_models_4_5 = df_analysis_models_4_5.drop(columns=['Mean', 'Lower CI', 'Upper CI'])
    
    for model in [6,4,5]:
        model_label = str(model)
        model_file = str(model)
        with open('models/model_'+model_file+'_'+period+'.pkl', 'rb') as file:
            model = pickle.load(file)

        # Extract the coefficients and standard errors
        params = model.params[0:]
        conf = model.conf_int()[0:]
        conf.columns = ['Lower CI', 'Upper CI']
        
        # Calculate the rate ratios and their confidence intervals
        rate_ratios = np.exp(params)
        conf['Lower CI'] = np.exp(conf['Lower CI'])
        conf['Upper CI'] = np.exp(conf['Upper CI'])
        
        # Combine into a single DataFrame
        rate_ratio_df = pd.DataFrame({
            'Rate Ratio': rate_ratios,
            'Lower CI': conf['Lower CI'],
            'Upper CI': conf['Upper CI']
        }) 
        
        rate_ratio_df = rate_ratio_df.round(2)
        
        column_rate_ratio = period_label+' - RR (95% CI) - Model '+model_label
        df_analysis_models_4_5[column_rate_ratio] = rate_ratio_df['Rate Ratio'].astype(str) + '\n' + '('+rate_ratio_df['Lower CI'].astype(str) + '-'+rate_ratio_df['Upper CI'].astype(str) + ')'        
        
        df_analysis_models_4_5.loc['Residual degrees of freedom', column_rate_ratio] = model.df_resid
        df_analysis_models_4_5.loc['Deviance', column_rate_ratio] = round(model.deviance,2)
        df_analysis_models_4_5.loc["Pearson's chi-squared", column_rate_ratio] = round(model.pearson_chi2,2)
        df_analysis_models_4_5.loc['Cragg-Uhler R²', column_rate_ratio] = round(model.pseudo_rsquared('cs'),2)
        df_analysis_models_4_5.loc['McFadden R²', column_rate_ratio] = round(model.pseudo_rsquared('mcf'),2)
        df_analysis_models_4_5.loc['Akaike Information Criterion (AIC)', column_rate_ratio] = round(model.aic,2)

df_analysis_models_4_5.index = ['Intercept','Urbanized','Urbanized with informal settlements','Semi-urbanized','Rural with high human development','Rural with low human development','% population 60+ years','% urban population','Population density (inhabitants/km²)','% male population','% indigenous population','Per capita income (BRL)','Gini coefficient','% informal settlement households', 'Population density in informal settlement (inhabitants/ha)', '% sanitation-related hospitalizations', '% self-employed workers', 'Unemployment rate', '% commerce workers', '% service workers', '% industry workers', 'Expected years of schooling at age 18', 'Days since first death','Residual degrees of freedom','Deviance',"Pearson's chi-squared",'Cragg-Uhler R²', 'McFadden R²','Akaike Information Criterion (AIC)']

df_analysis_models_4_5.to_csv('data/output/df_analysis_models_4_5.csv', index=True)

In [24]:
params

const                                                        -5.953647
Semi-urbanized                                               -0.052944
Rural with high human development                             0.077273
Urbanized with informal settlements                          -0.037258
Rural with low human development                             -0.090941
Urbanized                                                     0.065097
percentage_population_age_range_60_more                       0.062248
percentage_urban_population                                   0.119591
demographic_density                                           0.005254
percentage_male_population                                    0.026923
percentage_indigenous_population                              0.041404
per_capita_income                                            -0.010168
gini                                                          0.009866
percentage_estimated_households_in_informal_settlements       0.019257
demogr

## Analysis of the sociodemographic variables

In [9]:
df_analysis_variables = pd.DataFrame()

for i in range(len(list_periods)):
    period = list_periods[i]
    period_label = list_period_labels[i]
    
    with open('models/model_2_'+period+'.pkl', 'rb') as file:
        model = pickle.load(file)

    # Extract the coefficients and standard errors
    params = model.params[5:]
    conf = model.conf_int()[5:]
    conf.columns = ['Lower CI', 'Upper CI']
    
    # Calculate the rate ratios and their confidence intervals
    rate_ratios = np.exp(params)
    conf['Lower CI'] = np.exp(conf['Lower CI'])
    conf['Upper CI'] = np.exp(conf['Upper CI'])
    
    # Combine into a single DataFrame
    rate_ratio_df = pd.DataFrame({
        'Rate Ratio': rate_ratios,
        'Lower CI': conf['Lower CI'],
        'Upper CI': conf['Upper CI']
    })
    
    rate_ratio_df = rate_ratio_df.round(2)
    
    df_analysis_variables[period_label] = rate_ratio_df['Rate Ratio'].astype(str) + '\n' + '('+rate_ratio_df['Lower CI'].astype(str) + '-'+rate_ratio_df['Upper CI'].astype(str) + ')'
    
df_analysis_variables['Variable'] = ['% population 60+ years','% urban population','Population density (inhabitants/km²)','% male population','% indigenous population','Per capita income (BRL)','Gini coefficient','% informal settlement households', 'Population density in informal settlement (inhabitants/ha)', '% sanitation-related hospitalizations', '% self-employed workers', 'Unemployment rate', '% commerce workers', '% service workers', '% industry workers', 'Expected years of schooling at age 18', 'Days since first death']

df_analysis_variables = df_analysis_variables.set_index('Variable')  

df_analysis_variables.to_csv('data/output/df_analysis_variables.csv', index=True)

In [10]:
df_analysis_variables

Unnamed: 0_level_0,2020 (first half),2020,2021,2022,2020-2022
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
% population 60+ years,0.92\n(0.88-0.97),1.09\n(1.07-1.12),1.09\n(1.07-1.11),1.35\n(1.32-1.37),1.1\n(1.08-1.11)
% urban population,1.07\n(1.01-1.14),1.11\n(1.07-1.14),1.13\n(1.1-1.15),1.09\n(1.06-1.13),1.13\n(1.11-1.15)
Population density (inhabitants/km²),1.03\n(1.01-1.06),1.01\n(0.99-1.02),0.99\n(0.98-1.01),0.99\n(0.98-1.0),1.0\n(0.99-1.01)
% male population,0.99\n(0.95-1.03),1.02\n(1.0-1.04),1.06\n(1.04-1.07),1.07\n(1.05-1.09),1.04\n(1.02-1.05)
% indigenous population,1.02\n(1.0-1.05),1.05\n(1.03-1.06),1.03\n(1.02-1.04),1.03\n(1.01-1.05),1.04\n(1.03-1.05)
Per capita income (BRL),0.74\n(0.7-0.78),0.94\n(0.92-0.97),1.02\n(1.0-1.04),1.03\n(1.01-1.06),1.01\n(0.99-1.02)
Gini coefficient,1.03\n(0.98-1.07),0.98\n(0.96-1.0),0.99\n(0.98-1.01),0.97\n(0.95-0.99),0.99\n(0.98-1.01)
% informal settlement households,1.09\n(1.06-1.12),1.04\n(1.02-1.05),1.0\n(0.99-1.01),0.99\n(0.98-1.0),1.01\n(1.0-1.02)
Population density in informal settlement (inhabitants/ha),0.99\n(0.96-1.01),1.0\n(0.98-1.01),1.01\n(1.0-1.02),1.0\n(0.98-1.01),1.01\n(1.0-1.02)
% sanitation-related hospitalizations,1.0\n(0.97-1.03),0.98\n(0.97-1.0),1.0\n(0.99-1.01),0.98\n(0.96-1.0),0.98\n(0.97-0.99)


In [11]:
with open('models/model_1_2020_1.pkl', 'rb') as file:
    model = pickle.load(file)
print(model.summary())

                           Generalized Linear Model Regression Results                            
Dep. Variable:     deaths_accumulated_first_semester_2020   No. Observations:                 5560
Model:                                                GLM   Df Residuals:                     5539
Model Family:                            NegativeBinomial   Df Model:                           20
Link Function:                                        Log   Scale:                          1.0000
Method:                                              IRLS   Log-Likelihood:                -10175.
Date:                                    Mon, 17 Jun 2024   Deviance:                       5337.7
Time:                                            17:08:33   Pearson chi2:                 6.89e+03
No. Iterations:                                        13   Pseudo R-squ. (CS):             0.3332
Covariance Type:                                nonrobust                                         
          

In [12]:
with open('models/model_2_2020_1.pkl', 'rb') as file:
    model = pickle.load(file)
print(model.summary())

                           Generalized Linear Model Regression Results                            
Dep. Variable:     deaths_accumulated_first_semester_2020   No. Observations:                 5560
Model:                                                GLM   Df Residuals:                     5538
Model Family:                            NegativeBinomial   Df Model:                           21
Link Function:                                        Log   Scale:                          1.0000
Method:                                              IRLS   Log-Likelihood:                -9126.6
Date:                                    Mon, 17 Jun 2024   Deviance:                       4528.0
Time:                                            17:08:34   Pearson chi2:                 5.78e+03
No. Iterations:                                        13   Pseudo R-squ. (CS):             0.6433
Covariance Type:                                nonrobust                                         
          

In [13]:
with open('models/model_1_2020.pkl', 'rb') as file:
    model = pickle.load(file)
print(model.summary())

                    Generalized Linear Model Regression Results                    
Dep. Variable:     deaths_accumulated_2020   No. Observations:                 5560
Model:                                 GLM   Df Residuals:                     5539
Model Family:             NegativeBinomial   Df Model:                           20
Link Function:                         Log   Scale:                          1.0000
Method:                               IRLS   Log-Likelihood:                -16446.
Date:                     Mon, 17 Jun 2024   Deviance:                       6233.4
Time:                             17:08:34   Pearson chi2:                 5.86e+03
No. Iterations:                          8   Pseudo R-squ. (CS):             0.1972
Covariance Type:                 nonrobust                                         
                                                                 coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------

In [14]:
with open('models/model_1_2021.pkl', 'rb') as file:
    model = pickle.load(file)
print(model.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:            deaths_2021   No. Observations:                 5560
Model:                            GLM   Df Residuals:                     5539
Model Family:        NegativeBinomial   Df Model:                           20
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -20011.
Date:                Mon, 17 Jun 2024   Deviance:                       5867.7
Time:                        17:08:34   Pearson chi2:                 5.83e+03
No. Iterations:                    10   Pseudo R-squ. (CS):             0.5301
Covariance Type:            nonrobust                                         
                                                                 coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------

In [16]:
df_deaths['deaths'].sum()

701736.0

In [17]:
df_deaths[df_deaths['cluster_label']=='Urbanized with informal settlements']['deaths'].sum()

363523.0

In [18]:
df_deaths[df_deaths['cluster_label']=='Urbanized']['deaths'].sum()

236607.0

In [21]:
df_deaths['population'].sum()

202760341

In [22]:
df_deaths[df_deaths['cluster_label']=='Urbanized with informal settlements']['population'].sum()

92394320

In [23]:
df_deaths[df_deaths['cluster_label']=='Urbanized']['population'].sum()

63741508

In [25]:
df_deaths['deaths_2021'].sum()

423977.0