# Variable sensitivity analysis

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
import pickle

from util import save_model, summarize_results

## Loading data

In [2]:
# Table of regressors (concatenated into the Model 2 design matrix below);
# the first CSV column becomes the row index.
df_variables = pd.read_csv(
    'data/output/df_without_collinearity_standardized.csv',
    index_col=0,
)

In [3]:
# Outcome table: supplies the death-count columns, 'population' and the
# 'delta_first_death_*' columns used below; first CSV column is the row index.
df_y = pd.read_csv(
    'data/output/df_mortality.csv',
    index_col=0,
)

In [4]:
# Population per row and its natural log; the log is passed as the GLM
# ``offset`` when the models are fitted.
list_population = df_y.loc[:, 'population']
list_offset = np.log(list_population)

In [5]:
# Ordered categorical dtype that fixes the order of the five cluster labels.
# The first category ('Urbanized') is the level dropped by
# ``get_dummies(..., drop_first=True)`` when the Model 2 design matrix is built.
custom_order = pd.CategoricalDtype(
    ordered=True,
    categories=[
        'Urbanized',
        'Urbanized with informal settlements',
        'Semi-urbanized',
        'Rural with high human development',
        'Rural with low human development',
    ],
)

# Keep only the label column, cast it to the ordered dtype and sort the rows
# by that category order.
df_cluster_labels = (
    pd.read_csv('data/output/df_labeled_cluster.csv', index_col=0)
    .loc[:, ['cluster_label']]
    .astype({'cluster_label': custom_order})
    .sort_values(by='cluster_label')
)

In [6]:
# Per-row cluster membership probabilities; the five columns are renamed to
# the human-readable cluster names.
# NOTE(review): the hard-coded name order presumably matches the component
# order stored in the CSV -- confirm against the clustering notebook.
df_cluster_probabilities = pd.read_csv(
    'data/output/df_standardized_pca_all_spherical_5_probability.csv',
    index_col=0,
).set_axis(
    [
        'Semi-urbanized',
        'Rural with high human development',
        'Urbanized with informal settlements',
        'Rural with low human development',
        'Urbanized',
    ],
    axis=1,
)

## Variable Sensitivity Analysis

In [7]:
def tunning_negative_binomial_model(x, y, list_offset):
    """Grid-search the negative binomial ``alpha`` and return the best GLM fit.

    A ``sm.GLM`` with a ``NegativeBinomial`` family is fitted for every alpha
    in ``np.arange(0.01, 2.1, 0.01)`` and the fit with the highest
    log-likelihood (``llf``) is kept. The selected alpha is printed.

    The previous implementation picked the fit whose ``abs(llf)`` was
    smallest; that is the same fit whenever all log-likelihoods are
    non-positive, which is the case for a count likelihood.

    Parameters
    ----------
    x : design matrix passed to ``sm.GLM`` as ``exog``.
    y : response passed to ``sm.GLM`` as ``endog``.
    list_offset : offset passed to ``sm.GLM`` (the caller supplies
        log-population).

    Returns
    -------
    The fitted results object of the selected model.

    Raises
    ------
    ValueError
        If no alpha on the grid produced a non-NaN log-likelihood.
    """
    list_alpha = np.arange(0.01, 2.1, 0.01)

    # Track only the best fit instead of accumulating every fitted model:
    # avoids quadratic ``np.append`` copying and keeps memory flat.
    best_model = None
    best_alpha = None
    best_llf = None

    for alpha in list_alpha:
        nb_glm_model = sm.GLM(y, x, family=sm.families.NegativeBinomial(alpha=alpha), offset=list_offset).fit()
        llf = nb_glm_model.llf
        if np.isnan(llf):
            # A NaN log-likelihood cannot be ranked; skip this alpha.
            continue
        # Strict '>' keeps the first alpha on ties, as the original did.
        if best_model is None or llf > best_llf:
            best_model = nb_glm_model
            best_alpha = alpha
            best_llf = llf

    if best_model is None:
        raise ValueError('No alpha value produced a valid log-likelihood.')

    print('Selected alpha:', best_alpha)
    return best_model

In [8]:
# Parallel lists: outcome column, period tag used in model file names, and the
# matching 'delta_first_death_*' column for each analysed period.
list_columns_y = ['deaths_accumulated_first_semester_2020', 'deaths_accumulated_2020', 'deaths_2021', 'deaths_2022', 'deaths']
list_periods = ['2020_1','2020', '2021', '2022', '2020_2022']
list_delta_first_death_columns = ['delta_first_death_2020-06-30', 'delta_first_death_2020-12-31', 'delta_first_death_2021-12-31', 'delta_first_death_2022-12-31', 'delta_first_death_general_period']


def _print_full_model(model_name, period):
    """Load the pickled full model for ``model_name``/``period`` and summarize it."""
    # NOTE: pickle.load can execute arbitrary code; only load model files
    # produced by this project.
    with open('models/' + model_name + '_' + period + '.pkl', 'rb') as file:
        full_model = pickle.load(file)
    summarize_results(full_model)


def _fit_without_each_variable(x, y, model_name, period):
    """Refit the model once per column of ``x``, leaving that column out.

    Each refit adds a constant, tunes alpha with
    ``tunning_negative_binomial_model``, hands the result to ``save_model``
    under 'models/sensitivity_analysis/variable' and prints its summary.
    """
    for variable in x.columns:
        print('\n*** Removed variable: ',variable)
        x_analysis = sm.add_constant(x.drop(columns=[variable]))
        reduced_model = tunning_negative_binomial_model(x_analysis, y, list_offset)
        filename = model_name + '_' + period + '_' + variable
        save_model(reduced_model, filename, 'models/sensitivity_analysis/variable')
        summarize_results(reduced_model)


for column_y, period, column_delta_first_death in zip(
        list_columns_y, list_periods, list_delta_first_death_columns):
    print('\n*** Period: ', period)

    y = df_y[column_y]

    # Model 2: cluster-label dummies + explanatory variables + standardized delta.
    # The backslash is escaped so the printed text is unchanged while avoiding
    # the invalid '\*' escape sequence.
    # NOTE(review): '\n*** Model 2' may have been intended -- confirm.
    print('\\*** Model 2')
    print('===>Full model:')
    _print_full_model('model_2', period)

    x = pd.get_dummies(df_cluster_labels.copy(), columns=['cluster_label'], drop_first=True, dtype=int)
    x = x.sort_index()
    x = pd.concat([x, df_variables], axis=1)
    scaler = StandardScaler()
    delta_standardized = scaler.fit_transform(df_y[[column_delta_first_death]])
    # Attach by index (not by position) so the values stay matched to their
    # rows even if x and df_y are ordered differently; Model 3 below also
    # aligns on the index.
    x['delta_first_death_period'] = pd.Series(delta_standardized[:, 0], index=df_y.index)

    _fit_without_each_variable(x, y, 'model_2', period)

    # Model 3: cluster membership probabilities + delta, all standardized together.
    print('\\*** Model 3')
    print('===>Full model:')
    _print_full_model('model_3', period)

    x = df_cluster_probabilities.copy()
    x['delta_first_death_period'] = df_y[column_delta_first_death]
    scaler = StandardScaler()
    x_standardized = scaler.fit_transform(x)
    x = pd.DataFrame(x_standardized, index=x.index, columns=x.columns)

    _fit_without_each_variable(x, y, 'model_3', period)


*** Period:  2020_1
\*** Model 2
===>Full model:
                           Generalized Linear Model Regression Results                            
Dep. Variable:     deaths_accumulated_first_semester_2020   No. Observations:                 5560
Model:                                                GLM   Df Residuals:                     5538
Model Family:                            NegativeBinomial   Df Model:                           21
Link Function:                                        Log   Scale:                          1.0000
Method:                                              IRLS   Log-Likelihood:                -9126.6
Date:                                    Thu, 06 Jun 2024   Deviance:                       4528.0
Time:                                            16:18:24   Pearson chi2:                 5.78e+03
No. Iterations:                                        13   Pseudo R-squ. (CS):             0.6433
Covariance Type:                                nonrobust  