# Statistical analysis

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import pickle
from scipy.stats import pearsonr, spearmanr

import util

## Loading data

In [2]:
df_deaths = pd.read_csv('data/output/df_mortality.csv', index_col=0)
df_labeled_cluster = pd.read_csv('data/output/df_labeled_cluster.csv', index_col=0)
df_deaths['cluster_label'] = df_labeled_cluster['cluster_label']

In [3]:
df_deaths

Unnamed: 0,code_municipality_6,code_municipality_7,population,date,region,deaths,new_deaths,deaths_accumulated_first_semester_2020,delta_first_death_2020-06-30,deaths_accumulated_2020,...,"Death rate (July-August, 2021)","Death rate (September-October, 2021)","Death rate (November-December, 2021)","Death rate (January-February, 2022)","Death rate (March-April, 2022)","Death rate (May-June, 2022)","Death rate (July-August, 2022)","Death rate (September-October, 2022)","Death rate (November-December, 2022)",cluster_label
0,110001,1100015,21495,2022-12-31,North,93.0,0.0,0.0,0.0,16.0,...,9.304489,9.304489,23.261224,27.913468,9.304489,4.652245,0.000000,4.652245,4.652245,Semi-urbanized
1,110002,1100023,96833,2022-12-31,North,540.0,0.0,20.0,44.0,122.0,...,22.719527,7.228941,29.948468,17.555998,5.163529,0.000000,14.457881,0.000000,1.032706,Urbanized
2,110003,1100031,5363,2022-12-31,North,15.0,0.0,1.0,0.0,4.0,...,18.646280,18.646280,0.000000,0.000000,0.000000,18.646280,0.000000,0.000000,0.000000,Rural with high human development
3,110004,1100049,86895,2022-12-31,North,351.0,0.0,6.0,85.0,60.0,...,20.714656,10.357328,19.563841,13.809770,5.754071,1.150814,6.904885,0.000000,1.150814,Urbanized
4,110005,1100056,15890,2022-12-31,North,76.0,0.0,2.0,41.0,8.0,...,12.586532,18.879799,12.586532,44.052863,0.000000,6.293266,6.293266,0.000000,6.293266,Urbanized
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5565,522200,5222005,14956,2022-12-31,Midwestern,47.0,0.0,0.0,0.0,7.0,...,20.058839,13.372560,0.000000,33.431399,0.000000,6.686280,6.686280,0.000000,6.686280,Urbanized
5566,522205,5222054,8768,2022-12-31,Midwestern,34.0,0.0,1.0,20.0,8.0,...,0.000000,0.000000,0.000000,11.405109,0.000000,0.000000,0.000000,0.000000,0.000000,Urbanized
5567,522220,5222203,4215,2022-12-31,Midwestern,7.0,0.0,0.0,0.0,2.0,...,0.000000,0.000000,0.000000,23.724792,0.000000,0.000000,23.724792,0.000000,0.000000,Semi-urbanized
5568,522230,5222302,5815,2022-12-31,Midwestern,11.0,0.0,1.0,2.0,3.0,...,51.590714,0.000000,0.000000,0.000000,0.000000,0.000000,17.196905,0.000000,0.000000,Semi-urbanized


In [4]:
df_cluster_probabilities = pd.read_csv('data/df_standardized_pca_2spherical_5_probability.csv', index_col=0)
df_cluster_probabilities.columns = ['Semi-urbanized', 'Urbanized', 'Rural with high human development', 'Urbanized with informal settlements', 'Rural with low human development']

In [5]:
df_without_collinearity_standardized = pd.read_csv('data/output/df_without_collinearity_standardized.csv', index_col=0)

In [6]:
df_vaccination = pd.read_csv('data/output/df_vaccination.csv', index_col=0)
df_political = pd.read_csv('data/output/df_political_without_missing_points.csv', index_col=0)[['percentual_votes_for_bolsonaro']]

In [7]:
df_deaths.columns

Index(['code_municipality_6', 'code_municipality_7', 'population', 'date',
       'region', 'deaths', 'new_deaths',
       'deaths_accumulated_first_semester_2020',
       'delta_first_death_2020-06-30', 'deaths_accumulated_2020',
       'delta_first_death_2020-12-31', 'deaths_accumulated_2021',
       'delta_first_death_2021-12-31', 'delta_first_death_2022-12-31',
       'delta_first_death_general_period',
       'death_rate_accumulated_first_semester_2020',
       'death_rate_accumulated_2020', 'death_rate_accumulated_2021',
       'Death rate (accumulated period)', 'Death rate (2022)',
       'Death rate (2021)', 'Death rate (2020)', 'Death rate (1/2020)',
       'deaths_2022', 'deaths_2021', 'death_accumulated_2020_04',
       'death_accumulated_2020_04_rate', 'death_accumulated_2020_06',
       'death_accumulated_2020_06_rate', 'death_accumulated_2020_08',
       'death_accumulated_2020_08_rate', 'death_accumulated_2020_10',
       'death_accumulated_2020_10_rate', 'death_accumula

## Correlation between political and vaccination data

In [8]:
votes_bolsonaro = df_political['percentual_votes_for_bolsonaro']

for column in ['% people fully vaccinated (2020-2022)', '% people fully vaccinated (1/2020)', '% people fully vaccinated (2020)', '% people fully vaccinated (2020-2021)']:
    print("\n"+column)    
    column_data = df_vaccination[column]
    
    # Calculate Pearson correlation
    pearson_corr, pearson_p_value = pearsonr(column_data, votes_bolsonaro)
    
    # Calculate Spearman correlation
    spearman_corr, spearman_p_value = spearmanr(column_data, votes_bolsonaro)
    
    # Print results
    print(f"Pearson correlation: {pearson_corr:.3f} (p-value: {pearson_p_value:.3f})")
    print(f"Spearman correlation: {spearman_corr:.3f} (p-value: {spearman_p_value:.3f})")


% people fully vaccinated (2020-2022)
Pearson correlation: 0.037 (p-value: 0.006)
Spearman correlation: 0.043 (p-value: 0.001)

% people fully vaccinated (1/2020)
Pearson correlation: nan (p-value: nan)
Spearman correlation: nan (p-value: nan)

% people fully vaccinated (2020)
Pearson correlation: nan (p-value: nan)
Spearman correlation: nan (p-value: nan)

% people fully vaccinated (2020-2021)
Pearson correlation: 0.181 (p-value: 0.000)
Spearman correlation: 0.200 (p-value: 0.000)




## Correlation between political and sociodemographic data

In [9]:
votes_bolsonaro = df_political['percentual_votes_for_bolsonaro']

for column in df_without_collinearity_standardized.columns:
    print("\n"+column)    
    column_data = df_without_collinearity_standardized[column]
    
    # Calculate Pearson correlation
    pearson_corr, pearson_p_value = pearsonr(column_data, votes_bolsonaro)
    
    # Calculate Spearman correlation
    spearman_corr, spearman_p_value = spearmanr(column_data, votes_bolsonaro)
    
    # Print results
    print(f"Pearson correlation: {pearson_corr:.3f} (p-value: {pearson_p_value:.3f})")
    print(f"Spearman correlation: {spearman_corr:.3f} (p-value: {spearman_p_value:.3f})")


percentage_population_age_range_60_more
Pearson correlation: 0.225 (p-value: 0.000)
Spearman correlation: 0.234 (p-value: 0.000)

percentage_urban_population
Pearson correlation: 0.351 (p-value: 0.000)
Spearman correlation: 0.367 (p-value: 0.000)

percentage_male_population
Pearson correlation: 0.035 (p-value: 0.008)
Spearman correlation: 0.003 (p-value: 0.800)

percentage_indigenous_population
Pearson correlation: -0.049 (p-value: 0.000)
Spearman correlation: -0.028 (p-value: 0.036)

density_median_effectively_domiciled_area
Pearson correlation: -0.087 (p-value: 0.000)
Spearman correlation: -0.106 (p-value: 0.000)

gini
Pearson correlation: -0.374 (p-value: 0.000)
Spearman correlation: -0.395 (p-value: 0.000)

percentage_estimated_households_in_informal_settlements
Pearson correlation: 0.041 (p-value: 0.002)
Spearman correlation: 0.107 (p-value: 0.000)

demographic_density_in_informal_settlements
Pearson correlation: 0.050 (p-value: 0.000)
Spearman correlation: 0.080 (p-value: 0.000)

## Correlation sociodemographic and vaccination data

In [10]:
for socio_column in df_without_collinearity_standardized.columns:
    print("\n"+socio_column)    
    socio_column_data = df_without_collinearity_standardized[socio_column]

    for column in ['% people fully vaccinated (2020-2022)', '% people fully vaccinated (1/2020)', '% people fully vaccinated (2020)', '% people fully vaccinated (2020-2021)']:
        print(column)    
        column_data = df_vaccination[column]
        
        # Calculate Pearson correlation
        df_combined = pd.DataFrame({'col1': socio_column_data, 'col2': column_data})
        df_combined = df_combined.dropna()
        pearson_corr, pearson_p_value = pearsonr(df_combined['col1'], df_combined['col2'])
        
        # Calculate Spearman correlation
        spearman_corr, spearman_p_value = spearmanr(column_data, socio_column_data)
        
        # Print results
        print(f"Pearson correlation: {pearson_corr:.3f} (p-value: {pearson_p_value:.3f})")
        # print(f"Spearman correlation: {spearman_corr:.3f} (p-value: {spearman_p_value:.3f})")


percentage_population_age_range_60_more
% people fully vaccinated (2020-2022)
Pearson correlation: 0.564 (p-value: 0.000)
% people fully vaccinated (1/2020)
Pearson correlation: nan (p-value: nan)
% people fully vaccinated (2020)
Pearson correlation: nan (p-value: nan)
% people fully vaccinated (2020-2021)
Pearson correlation: 0.664 (p-value: 0.000)

percentage_urban_population
% people fully vaccinated (2020-2022)
Pearson correlation: 0.079 (p-value: 0.000)
% people fully vaccinated (1/2020)
Pearson correlation: nan (p-value: nan)
% people fully vaccinated (2020)
Pearson correlation: nan (p-value: nan)
% people fully vaccinated (2020-2021)
Pearson correlation: 0.119 (p-value: 0.000)

percentage_male_population
% people fully vaccinated (2020-2022)
Pearson correlation: -0.175 (p-value: 0.000)
% people fully vaccinated (1/2020)
Pearson correlation: nan (p-value: nan)
% people fully vaccinated (2020)
Pearson correlation: nan (p-value: nan)
% people fully vaccinated (2020-2021)
Pearson c



## Util

In [11]:
def pseudo_rsquared_cragg_uhler(model):
    r2_cox_snell = model.pseudo_rsquared('cs')
    print(2/model.nobs)
    r2_cragg_uhler = r2_cox_snell / (1 - model.llnull**(2/model.nobs))
    return r2_cragg_uhler

## Mortality analysis

### Goodness-of-fit statistics

In [12]:
dict_params = {}
dict_cov_params = {}

df_statistics = pd.DataFrame()
df_statistics['Statistic'] = ['Degrees of Freedom', 'Residual degrees of freedom', 'Deviance', "Pearson's chi-squared", '$R_{CS}^{2}$', '$R_{McF}^{2}$', '$LL_{\text{model}}$', '$LL_{\text{null}}$', 'Akaike Information Criterion (AIC)', 'Bayesian information criterion (BIC)']
df_statistics = df_statistics.set_index('Statistic')

list_periods = ['2020_1','2020', '2021', '2022', '2020_2022']
list_death_rate_columns = ['Death rate (1/2020)', 'Death rate (2020)', 'Death rate (2021)', 'Death rate (2022)', 'Death rate (accumulated period)']
list_period_labels = ['2020 (first half)','2020','2021','2022','2020-2022']

for i in range(len(list_periods)):
    period = list_periods[i]
    period_label = list_period_labels[i]
    death_rate_column = list_death_rate_columns[i]

    for model in [0,6,9,10]:
        model_label = str(model)
        model_file = str(model)
        with open('models/model_'+model_file+'_'+period+'.pkl', 'rb') as file:
            model = pickle.load(file)

        column_rate_ratio = period_label+' - Model '+model_label

        df_statistics.loc['Residual degrees of freedom', column_rate_ratio] = model.df_resid
        df_statistics.loc['Deviance', column_rate_ratio] = round(model.deviance, 2)
        df_statistics.loc["Pearson's chi-squared", column_rate_ratio] = round(model.pearson_chi2, 2)
        df_statistics.loc['$R_{CS}^{2}$', column_rate_ratio] = round(model.pseudo_rsquared('cs'), 2)
        df_statistics.loc['$R_{McF}^{2}$', column_rate_ratio] = round(model.pseudo_rsquared('mcf'), 2)
        df_statistics.loc['$LL_{\text{model}}$', column_rate_ratio] = round(model.llf, 2)
        df_statistics.loc['$LL_{\text{null}}$', column_rate_ratio] = round(model.llnull, 2)
        df_statistics.loc['Akaike Information Criterion (AIC)', column_rate_ratio] = round(model.aic, 2)
        df_statistics.loc['Bayesian information criterion (BIC)', column_rate_ratio] = round(model.bic_llf, 2)
        df_statistics.loc['Degrees of Freedom', column_rate_ratio] = round(model.df_model)

        dict_params[period_label+' - '+model_label] = model.params
        dict_cov_params[period_label+' - '+model_label] = model.cov_params()

        df_statistics[column_rate_ratio] = df_statistics[column_rate_ratio].fillna('1 [Reference]')

df_statistics.index = ['Degrees of Freedom', 'Residual degrees of freedom', 'Deviance', "Pearson's chi-squared", '$R_{CS}^{2}$', '$R_{McF}^{2}$', '$LL_{\text{model}}$', '$LL_{\text{null}}$', 'Akaike Information Criterion (AIC)', 'Bayesian information criterion (BIC)']

df_statistics.to_csv('data/output/df_statistics.csv', index=True)

### Mortality rate ratio using 'Urbanized' as the reference group

In [13]:
dict_params = {}
dict_cov_params = {}

df_cluster_model_coefficients = pd.DataFrame()
df_cluster_model_coefficients['Sociodemographic cluster'] = ['const', 'Urbanized', 'Urbanized with informal settlements', 'Semi-urbanized', 'Rural with high human development', 'Rural with low human development']
df_cluster_model_coefficients = df_cluster_model_coefficients.set_index('Sociodemographic cluster')

list_periods = ['2020_1','2020', '2021', '2022', '2020_2022']
list_death_rate_columns = ['Death rate (1/2020)', 'Death rate (2020)', 'Death rate (2021)', 'Death rate (2022)', 'Death rate (accumulated period)']
list_period_labels = ['2020 (first half)','2020','2021','2022','2020-2022']

for i in range(len(list_periods)):
    period = list_periods[i]
    period_label = list_period_labels[i]
    death_rate_column = list_death_rate_columns[i]

    mortality_rate = df_deaths.groupby('cluster_label')[death_rate_column].mean()
    ci_data = df_deaths.groupby('cluster_label')[death_rate_column].apply(calculate_95_ci)
    df_cluster_model_coefficients[['Mean', 'Lower CI', 'Upper CI']] = pd.DataFrame(ci_data.tolist(), index=ci_data.index).round(2)
    df_cluster_model_coefficients[period_label + ' - Death rate (95% CI)'] = df_cluster_model_coefficients['Mean'].astype(str) + '\n' + '(' + df_cluster_model_coefficients['Lower CI'].astype(str) + '-' + df_cluster_model_coefficients['Upper CI'].astype(str) + ')'
    df_cluster_model_coefficients = df_cluster_model_coefficients.drop(columns=['Mean', 'Lower CI', 'Upper CI'])

    for model in [9]:
        model_label = str(model)
        model_file = str(model)
        with open('models/model_'+model_file+'_'+period+'.pkl', 'rb') as file:
            model = pickle.load(file)

        # Extract the coefficients and standard errors
        params = model.params[0:]
        conf = model.conf_int()[0:]
        conf.columns = ['Lower CI', 'Upper CI']

        # Calculate the rate ratios and their confidence intervals
        rate_ratios = np.exp(params)
        conf['Lower CI'] = np.exp(conf['Lower CI'])
        conf['Upper CI'] = np.exp(conf['Upper CI'])

        # Combine into a single DataFrame
        rate_ratio_df = pd.DataFrame({
            'Rate Ratio': rate_ratios,
            'Lower CI': conf['Lower CI'],
            'Upper CI': conf['Upper CI']
        })

        rate_ratio_df.index = rate_ratio_df.index.astype(str).str.replace('cluster_label_','')

        rate_ratio_df = rate_ratio_df.round(2)

        column_rate_ratio = period_label+' - RR (95% CI) - Model '+model_label
        df_cluster_model_coefficients[column_rate_ratio] = (
            rate_ratio_df['Rate Ratio'].apply(lambda x: f"{x:.2f}") + '\n' +
            '(' +
            rate_ratio_df['Lower CI'].apply(lambda x: f"{x:.2f}") + '-' +
            rate_ratio_df['Upper CI'].apply(lambda x: f"{x:.2f}") +
            ')'
        )

        dict_params[period_label+' - '+model_label] = model.params
        dict_cov_params[period_label+' - '+model_label] = model.cov_params()

        df_cluster_model_coefficients[column_rate_ratio] = df_cluster_model_coefficients[column_rate_ratio].fillna('1 [Reference]')

df_cluster_model_coefficients.index = ['Intercept', 'Urbanized', 'Urbanized with informal settlements', 'Semi-urbanized', 'Rural with high human development', 'Rural with low human development']

df_cluster_model_coefficients.to_csv('data/output/df_analysis_cluster.csv', index=True)

### Mortality rate ratios for variables

In [14]:
dict_params = {}
dict_cov_params = {}

df_results = pd.DataFrame()
df_results['Variable'] = ['const','percentage_population_age_range_60_more',
       'percentage_urban_population', 'density_median_effectively_domiciled_area',
       'percentage_male_population', 'percentage_indigenous_population', 'gini',
       'percentage_estimated_households_in_informal_settlements',
       'demographic_density_in_informal_settlements',
       'percentage_hospitalizations_diseases_inadequate_sanitation',
       'percentage_self_employed_workers', 'unemployment_rate',
       'percentage_workers_commerce', 'percentage_workers_services',
       'percentage_workers_industry', 'expected_years_of_schooling_at_age_18',
       '% people fully vaccinated','percentage_votes_for_bolsonaro']
df_results = df_results.set_index('Variable')

list_periods = ['2020_1','2020', '2021', '2022', '2020_2022']
list_death_rate_columns = ['Death rate (1/2020)', 'Death rate (2020)', 'Death rate (2021)', 'Death rate (2022)', 'Death rate (accumulated period)']
list_period_labels = ['2020 (first half)','2020','2021','2022','2020-2022']

for i in range(len(list_periods)):
    period = list_periods[i]
    period_label = list_period_labels[i]
    death_rate_column = list_death_rate_columns[i]

    for model in [10]:
        model_label = str(model)
        model_file = str(model)
        with open('models/model_'+model_file+'_'+period+'.pkl', 'rb') as file:
            model = pickle.load(file)

        # Extract the coefficients and standard errors
        params = model.params[0:]
        conf = model.conf_int()[0:]
        conf.columns = ['Lower CI', 'Upper CI']

        # Calculate the rate ratios and their confidence intervals
        rate_ratios = np.exp(params)
        conf['Lower CI'] = np.exp(conf['Lower CI'])
        conf['Upper CI'] = np.exp(conf['Upper CI'])

        # Combine into a single DataFrame
        rate_ratio_df = pd.DataFrame({
            'Rate Ratio': rate_ratios,
            'Lower CI': conf['Lower CI'],
            'Upper CI': conf['Upper CI']
        })

        rate_ratio_df = rate_ratio_df.round(2)

        column_rate_ratio = period_label+' - RR (95% CI) - Model '+model_label
        df_results[column_rate_ratio] = (
            rate_ratio_df['Rate Ratio'].apply(lambda x: f"{x:.2f}") + '\n' +
            '(' +
            rate_ratio_df['Lower CI'].apply(lambda x: f"{x:.2f}") + '-' +
            rate_ratio_df['Upper CI'].apply(lambda x: f"{x:.2f}") +
            ')'
        )

        dict_params[period_label+' - '+model_label] = model.params
        dict_cov_params[period_label+' - '+model_label] = model.cov_params()

df_results.index = ['Intercept', '% population 60+ years', '% urban population', 'Median density of effectively domiciled areas (inhabitants/km²)', '% male population', '% indigenous population', 'Gini coefficient', '% informal settlement households', 'Population density in informal settlement (inhabitants/ha)', '% sanitation-related hospitalizations', '% self-employed workers', 'Unemployment rate', '% commerce workers', '% service workers', '% industry workers', 'Expected years of schooling at age 18','% people fully vaccinated','% votes for Bolsonaro']

df_results.to_csv('data/output/df_analysis_variables_rate_ratios.csv', index=True)

### Mortality coefficients and statistics

#### Model 0: regions

In [21]:
dict_params = {}
dict_cov_params = {}

df_results = pd.DataFrame()
df_results['Variable'] = ['const','Sudeste', 'Norte', 'Nordeste', 'Centro-Oeste', 'Sul','Degrees of Freedom', 'Residual degrees of freedom', 'Deviance', "Pearson's chi-squared", '$R_{CS}^{2}$', '$R_{McF}^{2}$', '$LL_{\text{model}}$', '$LL_{\text{null}}$', 'Akaike Information Criterion (AIC)', 'Bayesian information criterion (BIC)']
df_results = df_results.set_index('Variable')

list_periods = ['2020_1','2020', '2021', '2022', '2020_2022']
list_death_rate_columns = ['Death rate (1/2020)', 'Death rate (2020)', 'Death rate (2021)', 'Death rate (2022)', 'Death rate (accumulated period)']
list_period_labels = ['2020 (first half)','2020','2021','2022','2020-2022']

for i in range(len(list_periods)):
    period = list_periods[i]
    period_label = list_period_labels[i]
    death_rate_column = list_death_rate_columns[i]

    for model in [0]:
        model_label = str(model)
        model_file = str(model)
        with open('models/model_'+model_file+'_'+period+'.pkl', 'rb') as file:
            model = pickle.load(file)

        # Extract the coefficients and standard errors
        params = model.params[0:]
        conf = model.conf_int()[0:]
        conf.columns = ['Lower CI', 'Upper CI']

        # Combine into a single DataFrame
        coefficient_df = pd.DataFrame({
            'Coefficient': params,
            'Lower CI': conf['Lower CI'],
            'Upper CI': conf['Upper CI']
        })

        coefficient_df.index = coefficient_df.index.astype(str).str.replace('region_','')

        coefficient_df = coefficient_df.round(2)

        column_coefficient = period_label+' - Coefficient (95% CI) - Model '+model_label
        df_results[column_coefficient] = (
            coefficient_df['Coefficient'].apply(lambda x: f"{x:.2f}") + '\n' +
            '(' +
            coefficient_df['Lower CI'].apply(lambda x: f"{x:.2f}") + '-' +
            coefficient_df['Upper CI'].apply(lambda x: f"{x:.2f}") +
            ')'
        )

        df_results.loc['Residual degrees of freedom', column_coefficient] = model.df_resid
        df_results.loc['Deviance', column_coefficient] = round(model.deviance, 2)
        df_results.loc["Pearson's chi-squared", column_coefficient] = round(model.pearson_chi2, 2)
        df_results.loc['$R_{CS}^{2}$', column_coefficient] = round(model.pseudo_rsquared('cs'), 2)
        df_results.loc['$R_{McF}^{2}$', column_coefficient] = round(model.pseudo_rsquared('mcf'), 2)
        df_results.loc['$LL_{\text{model}}$', column_coefficient] = round(model.llf, 2)
        df_results.loc['$LL_{\text{null}}$', column_coefficient] = round(model.llnull, 2)
        df_results.loc['Akaike Information Criterion (AIC)', column_coefficient] = round(model.aic, 2)
        df_results.loc['Bayesian information criterion (BIC)', column_coefficient] = round(model.bic_llf, 2)
        df_results.loc['Degrees of Freedom', column_coefficient] = round(model.df_model)

        dict_params[period_label+' - '+model_label] = model.params
        dict_cov_params[period_label+' - '+model_label] = model.cov_params()

        df_results[column_coefficient] = df_results[column_coefficient].fillna('1 [Reference]')

df_results.index = ['Intercept', 'Southeast', 'North', 'Northeast', 'Midwestern', 'South','Degrees of Freedom', 'Residual degrees of freedom', 'Deviance', "Pearson's chi-squared", '$R_{CS}^{2}$', '$R_{McF}^{2}$', '$LL_{\text{model}}$', '$LL_{\text{null}}$', 'Akaike Information Criterion (AIC)', 'Bayesian information criterion (BIC)']

df_results.to_csv('data/output/df_analysis_coefficients_statistics_model_0.csv', index=True)

#### Model 6 and 9: sociodemographic clusters

In [23]:
dict_params = {}
dict_cov_params = {}

df_results = pd.DataFrame()
df_results['Variable'] = ['const','Urbanized', 'Urbanized with informal settlements', 'Semi-urbanized', 'Rural with high human development', 'Rural with low human development','Degrees of Freedom', 'Residual degrees of freedom', 'Deviance', "Pearson's chi-squared", '$R_{CS}^{2}$', '$R_{McF}^{2}$', '$LL_{\text{model}}$', '$LL_{\text{null}}$', 'Akaike Information Criterion (AIC)', 'Bayesian information criterion (BIC)']
df_results = df_results.set_index('Variable')

list_periods = ['2020_1','2020', '2021', '2022', '2020_2022']
list_death_rate_columns = ['Death rate (1/2020)', 'Death rate (2020)', 'Death rate (2021)', 'Death rate (2022)', 'Death rate (accumulated period)']
list_period_labels = ['2020 (first half)','2020','2021','2022','2020-2022']

for i in range(len(list_periods)):
    period = list_periods[i]
    period_label = list_period_labels[i]
    death_rate_column = list_death_rate_columns[i]

    for model in [6,9]:
        model_label = str(model)
        model_file = str(model)
        with open('models/model_'+model_file+'_'+period+'.pkl', 'rb') as file:
            model = pickle.load(file)

        # Extract the coefficients and standard errors
        params = model.params[0:]
        conf = model.conf_int()[0:]
        conf.columns = ['Lower CI', 'Upper CI']

        # Combine into a single DataFrame
        coefficient_df = pd.DataFrame({
            'Coefficient': params,
            'Lower CI': conf['Lower CI'],
            'Upper CI': conf['Upper CI']
        })

        coefficient_df.index = coefficient_df.index.astype(str).str.replace('cluster_label_','')

        coefficient_df = coefficient_df.round(2)

        column_coefficient = period_label+' - Coefficient (95% CI) - Model '+model_label
        df_results[column_coefficient] = (
            coefficient_df['Coefficient'].apply(lambda x: f"{x:.2f}") + '\n' +
            '(' +
            coefficient_df['Lower CI'].apply(lambda x: f"{x:.2f}") + '-' +
            coefficient_df['Upper CI'].apply(lambda x: f"{x:.2f}") +
            ')'
        )

        df_results.loc['Residual degrees of freedom', column_coefficient] = model.df_resid
        df_results.loc['Deviance', column_coefficient] = round(model.deviance, 2)
        df_results.loc["Pearson's chi-squared", column_coefficient] = round(model.pearson_chi2, 2)
        df_results.loc['$R_{CS}^{2}$', column_coefficient] = round(model.pseudo_rsquared('cs'), 2)
        df_results.loc['$R_{McF}^{2}$', column_coefficient] = round(model.pseudo_rsquared('mcf'), 2)
        df_results.loc['$LL_{\text{model}}$', column_coefficient] = round(model.llf, 2)
        df_results.loc['$LL_{\text{null}}$', column_coefficient] = round(model.llnull, 2)
        df_results.loc['Akaike Information Criterion (AIC)', column_coefficient] = round(model.aic, 2)
        df_results.loc['Bayesian information criterion (BIC)', column_coefficient] = round(model.bic_llf, 2)
        df_results.loc['Degrees of Freedom', column_coefficient] = round(model.df_model)

        dict_params[period_label+' - '+model_label] = model.params
        dict_cov_params[period_label+' - '+model_label] = model.cov_params()

        df_results[column_coefficient] = df_results[column_coefficient].fillna('1 [Reference]')

df_results.index = ['Intercept', 'Urbanized', 'Urbanized with informal settlements', 'Semi-urbanized', 'Rural with high human development', 'Rural with low human development','Degrees of Freedom', 'Residual degrees of freedom', 'Deviance', "Pearson's chi-squared", '$R_{CS}^{2}$', '$R_{McF}^{2}$', '$LL_{\text{model}}$', '$LL_{\text{null}}$', 'Akaike Information Criterion (AIC)', 'Bayesian information criterion (BIC)']

df_results.to_csv('data/output/df_analysis_coefficients_statistics_model_6_9.csv', index=True)

#### Model 10: variables

In [15]:
dict_params = {}
dict_cov_params = {}

df_results = pd.DataFrame()
df_results['Variable'] = ['const','percentage_population_age_range_60_more',
       'percentage_urban_population', 'density_median_effectively_domiciled_area',
       'percentage_male_population', 'percentage_indigenous_population', 'gini',
       'percentage_estimated_households_in_informal_settlements',
       'demographic_density_in_informal_settlements',
       'percentage_hospitalizations_diseases_inadequate_sanitation',
       'percentage_self_employed_workers', 'unemployment_rate',
       'percentage_workers_commerce', 'percentage_workers_services',
       'percentage_workers_industry', 'expected_years_of_schooling_at_age_18',
       '% people fully vaccinated','percentage_votes_for_bolsonaro','Degrees of Freedom', 'Residual degrees of freedom', 'Deviance', "Pearson's chi-squared", '$R_{CS}^{2}$', '$R_{McF}^{2}$', '$LL_{\text{model}}$', '$LL_{\text{null}}$', 'Akaike Information Criterion (AIC)', 'Bayesian information criterion (BIC)']
df_results = df_results.set_index('Variable')

list_periods = ['2020_1','2020', '2021', '2022', '2020_2022']
list_death_rate_columns = ['Death rate (1/2020)', 'Death rate (2020)', 'Death rate (2021)', 'Death rate (2022)', 'Death rate (accumulated period)']
list_period_labels = ['2020 (first half)','2020','2021','2022','2020-2022']

for i in range(len(list_periods)):
    period = list_periods[i]
    period_label = list_period_labels[i]
    death_rate_column = list_death_rate_columns[i]

    for model in [10]:
        model_label = str(model)
        model_file = str(model)
        with open('models/model_'+model_file+'_'+period+'.pkl', 'rb') as file:
            model = pickle.load(file)

        # Extract the coefficients and standard errors
        params = model.params[0:]
        conf = model.conf_int()[0:]
        conf.columns = ['Lower CI', 'Upper CI']
        
        # Combine into a single DataFrame
        coefficient_df = pd.DataFrame({
            'Coefficient': params,
            'Lower CI': conf['Lower CI'],
            'Upper CI': conf['Upper CI']
        }) 
        
        coefficient_df.index = coefficient_df.index.astype(str).str.replace('cluster_label_','')
        
        coefficient_df = coefficient_df.round(2)
        
        column_coefficient = period_label+' - Coefficient (95% CI) - Model '+model_label
        df_results[column_coefficient] = (
            coefficient_df['Coefficient'].apply(lambda x: f"{x:.2f}") + '\n' +
            '(' +
            coefficient_df['Lower CI'].apply(lambda x: f"{x:.2f}") + '-' +
            coefficient_df['Upper CI'].apply(lambda x: f"{x:.2f}") +
            ')'
        )

        df_results.loc['Residual degrees of freedom', column_coefficient] = model.df_resid
        df_results.loc['Deviance', column_coefficient] = round(model.deviance, 2)
        df_results.loc["Pearson's chi-squared", column_coefficient] = round(model.pearson_chi2, 2)
        df_results.loc['$R_{CS}^{2}$', column_coefficient] = round(model.pseudo_rsquared('cs'), 2)
        df_results.loc['$R_{McF}^{2}$', column_coefficient] = round(model.pseudo_rsquared('mcf'), 2)
        df_results.loc['$LL_{\text{model}}$', column_coefficient] = round(model.llf, 2)
        df_results.loc['$LL_{\text{null}}$', column_coefficient] = round(model.llnull, 2)
        df_results.loc['Akaike Information Criterion (AIC)', column_coefficient] = round(model.aic, 2)
        df_results.loc['Bayesian information criterion (BIC)', column_coefficient] = round(model.bic_llf, 2)
        df_results.loc['Degrees of Freedom', column_coefficient] = round(model.df_model)

        dict_params[period_label+' - '+model_label] = model.params
        dict_cov_params[period_label+' - '+model_label] = model.cov_params()       

df_results.index = ['Intercept', '% population 60+ years', '% urban population', 'Median density of effectively domiciled areas (inhabitants/km²)', '% male population', '% indigenous population', 'Gini coefficient', '% informal settlement households', 'Population density in informal settlement (inhabitants/ha)', '% sanitation-related hospitalizations', '% self-employed workers', 'Unemployment rate', '% commerce workers', '% service workers', '% industry workers', 'Expected years of schooling at age 18','% people fully vaccinated','% votes for Bolsonaro','Degrees of Freedom', 'Residual degrees of freedom', 'Deviance', "Pearson's chi-squared", '$R_{CS}^{2}$', '$R_{McF}^{2}$', '$LL_{\text{model}}$', '$LL_{\text{null}}$', 'Akaike Information Criterion (AIC)', 'Bayesian information criterion (BIC)']

df_results.to_csv('data/output/df_analysis_coefficients_statistics_model_10.csv', index=True)