# Analysis of the odds ratio for outlier municipalities
## Workflow: run this notebook after 06_risk_factor_measures

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import fisher_exact
from scipy.stats import stats  # NOTE(review): `scipy.stats.stats` is a deprecated namespace; kept for existing references

## Loading data

In [2]:
# Labeled risk-factor table; the first CSV column becomes the index.
# Presumably written by the 06_risk_factor_measures step - confirm.
df_base = pd.read_csv(
    filepath_or_buffer='data/df_labeled_risk_factors.csv',
    index_col=0,
)

In [3]:
# Distinct labels present in the accumulated death-rate level column.
df_base['NIVEL_LABEL_TAXA_OBITOS_ACUMULADO'].unique()

array(['High', 'Very high', 'Intermediate', 'Extremely high', 'Low',
       'Very low'], dtype=object)

## Util

In [4]:
def _calculate_odds_ratio(df_test, df_control):
    a = len(df_test[(df_test.NIVEL_LABEL_TAXA_OBITOS_ACUMULADO == 'High') | (df_test.NIVEL_LABEL_TAXA_OBITOS_ACUMULADO == 'Very high') | (df_test.NIVEL_LABEL_TAXA_OBITOS_ACUMULADO == 'Extremely high')])
    b = len(df_test) - a
    c = len(df_control[(df_control.NIVEL_LABEL_TAXA_OBITOS_ACUMULADO == 'High') | (df_control.NIVEL_LABEL_TAXA_OBITOS_ACUMULADO == 'Very high') | (df_control.NIVEL_LABEL_TAXA_OBITOS_ACUMULADO == 'Extremely high')])
    d = len(df_control) - c
    truth_table = [[a, b], [c, d]]
    oddsratio, pvalue = stats.fisher_exact(truth_table)
    return oddsratio, pvalue

def _calculate_odds_ratio_control_random_weighted(df_test, test_quantity, random_state=None):
    """Odds ratio against repeatedly resampled, ``k5_label``-matched controls.

    For each of ``test_quantity`` repetitions a control group is drawn from the
    rows of ``df_base`` whose index is not in ``df_test``: for every
    ``k5_label`` present in ``df_test``, three times as many rows as the test
    group has in that label (counted on non-null ``CODIGO_MUNICIPIO_6``) are
    sampled with replacement. Each repetition's odds ratio is rounded to 2
    decimals and its p-value to 3 decimals before aggregation.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Test group; needs ``k5_label``, ``CODIGO_MUNICIPIO_6`` and the column
        read by ``_calculate_odds_ratio``.
    test_quantity : int
        Number of resampling repetitions.
    random_state : int, numpy.random.RandomState or None, optional
        Seed (or generator) for reproducible sampling. ``None`` (default)
        keeps the previous behaviour of using pandas' default random source.

    Returns
    -------
    tuple
        ``((mean_odds_ratio, std_odds_ratio), (mean_pvalue, std_pvalue))``,
        each value rounded to 3 decimals.
    """
    if random_state is None or isinstance(random_state, np.random.RandomState):
        sampler = random_state
    else:
        # A single generator shared by all draws, so repetitions differ from
        # each other but the whole run is reproducible.
        sampler = np.random.RandomState(random_state)

    # Loop-invariant inputs: test-group size per label and the candidate pool.
    group_sizes = df_test.groupby('k5_label')['CODIGO_MUNICIPIO_6'].count()
    df_pool = df_base[~df_base.index.isin(df_test.index)]
    pool_by_group = {group: df_pool[df_pool.k5_label == group] for group in group_sizes.index}

    list_oddsratio = []
    list_pvalue = []
    for _ in range(test_quantity):
        samples = [
            pool_by_group[group].sample(int(size) * 3, replace=True, random_state=sampler)
            for group, size in group_sizes.items()
        ]
        # DataFrame.append was removed in pandas 2.0; concatenate once instead.
        # With no groups, fall back to an empty frame that keeps the columns.
        df_control_weighted = pd.concat(samples) if samples else df_pool.iloc[0:0]

        oddsratio, pvalue = _calculate_odds_ratio(df_test, df_control_weighted)
        list_oddsratio.append(round(oddsratio, 2))
        list_pvalue.append(round(pvalue, 3))

    result_odds_ratio = (round(np.mean(list_oddsratio), 3), round(np.std(list_oddsratio), 3))
    result_pvalue = (round(np.mean(list_pvalue), 3), round(np.std(list_pvalue), 3))

    return result_odds_ratio, result_pvalue

def _calculate_odds_ratio_control_subtracted(df_test):
    """Odds ratio of ``df_test`` against every other row of ``df_base``.

    The control group is the set of ``df_base`` rows whose index does not
    appear in ``df_test``. Returns ``(odds_ratio, p_value)`` rounded to 2 and
    3 decimals respectively.
    """
    belongs_to_test = df_base.index.isin(df_test.index)
    remaining_rows = df_base[~belongs_to_test]
    ratio, p_val = _calculate_odds_ratio(df_test, remaining_rows)
    return round(ratio, 2), round(p_val, 3)

def _calculate_odds_ratio_control_opsite_opposite_end(df_test, atributte, test_sample_proportion):
    """Odds ratio of ``df_test`` against the low end of ``atributte``.

    The control group holds the ``df_base`` rows whose ``atributte`` value is
    at or below the quantile at three times ``test_sample_proportion``.
    Returns ``(odds_ratio, p_value)`` rounded to 2 and 3 decimals respectively.
    """
    values = df_base[atributte]
    # The control tail is three times as wide as the test proportion.
    cutoff = values.quantile(test_sample_proportion * 3)
    lower_tail = df_base.loc[values <= cutoff]
    ratio, p_val = _calculate_odds_ratio(df_test, lower_tail)
    return round(ratio, 2), round(p_val, 3)

def calculate_odds_ratio(atributte, test_sample_proportion = 0.05, test_quantity=100):
    """Compare the top tail of ``atributte`` against three control groups.

    The test group is every ``df_base`` row whose ``atributte`` value is at or
    above the ``1 - test_sample_proportion`` quantile. It is evaluated, in
    this order, against:

    1. the opposite (low) end of the same attribute,
    2. all remaining rows of ``df_base``,
    3. ``test_quantity`` random ``k5_label``-weighted control samples.

    Returns
    -------
    tuple of list
        ``(odds_ratios, p_values)``; each list has one entry per control
        strategy in the order above. The third entry of each list is the
        ``(mean, std)`` tuple produced by the random-weighted strategy.
    """
    threshold = df_base[atributte].quantile(1 - test_sample_proportion)
    upper_tail = df_base.loc[df_base[atributte] >= threshold]

    # Evaluated left to right, i.e. in the same order as the strategies above.
    outcomes = (
        _calculate_odds_ratio_control_opsite_opposite_end(upper_tail, atributte, test_sample_proportion),
        _calculate_odds_ratio_control_subtracted(upper_tail),
        _calculate_odds_ratio_control_random_weighted(upper_tail, test_quantity),
    )

    odds_ratios = [ratio for ratio, _ in outcomes]
    p_values = [p_val for _, p_val in outcomes]
    return odds_ratios, p_values

## Tested attributes

In [5]:
# Display labels, positionally paired with `selected_attributes` below.
yticklabels_without_number = [
    '% population 0 - 5 years',
    '% population 6 - 14 years',
    '% population 15 - 24 years',
    '% population 25 - 39 years',
    '% population 40 - 59 years',
    '% population +60 years',
    '% urban population',
    'IDHM - income',
    'Gini coefficient',
    'Social transfer\nper capita',
    'Activity rate',
    '% informal workers',
    '% employed in agriculture',
    '% employed in commerce',
    '% employed in service',
    '% employed in industry']

# Column names of `df_base` that are tested, in the same order as the labels.
selected_attributes = [
    'PERCENTUAL_POPULACAO_0_A_5_ANOS',
    'PERCENTUAL_POPULACAO_6_A_14_ANOS',
    'PERCENTUAL_POPULACAO_15_A_24_ANOS',
    'PERCENTUAL_POPULACAO_25_A_39_ANOS',
    'PERCENTUAL_POPULACAO_40_A_59_ANOS',
    'PERCENTUAL_POPULACAO_60_ANOS_OU_MAIS',
    'PERCENTUAL_POPULACAO_URBANA',
    'IDHM_RENDA',
    'GINI',
    'TRANSFERENCIA_PERCAPTA_BOLSA_FAMILIA',
    'TAXA_ATIVIDADE',
    'PERCENTUAL_TRABALHADORES_INFORMAIS',
    'PERCENTUAL_OCUPADOS_AGROPECUARIA',
    'PERCENTUAL_OCUPADOS_COMERCIO',
    'PERCENTUAL_OCUPADOS_SERVICO',
    'PERCENTUAL_OCUPADOS_INDUSTRIA']

# The two lists are matched by position; fail fast if they drift apart.
assert len(yticklabels_without_number) == len(selected_attributes), \
    'labels and attributes must have the same length'

oddsratio_columns = [
    'attribute',
    'odds_ratio_control_opposite_end', 'p_value_control_opposite_end',
    'odds_ratio_control_subtracted', 'p_value_control_subtracted',
    'odds_ratio_control_random_weighted', 'p_value_control_random_weighted']

# Collect one record per attribute and build the frame once:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
oddsratio_rows = []
for attribute, label in zip(selected_attributes, yticklabels_without_number):
    print('\n'+label)

    oddsratio, pvalue = calculate_odds_ratio(attribute)
    print(oddsratio)
    print(pvalue)

    oddsratio_rows.append({
        'attribute': label,
        'odds_ratio_control_opposite_end': oddsratio[0],
        'p_value_control_opposite_end': pvalue[0],
        'odds_ratio_control_subtracted': oddsratio[1],
        'p_value_control_subtracted': pvalue[1],
        # (mean, std) tuples over the random control repetitions.
        'odds_ratio_control_random_weighted': oddsratio[2],
        'p_value_control_random_weighted': pvalue[2]})

df_oddsratio = pd.DataFrame(oddsratio_rows, columns=oddsratio_columns)

df_oddsratio


% population 0 - 5 years
[0.1, 0.17, (0.663, 0.07)]
[0.0, 0.0, (0.087, 0.104)]

% population 6 - 14 years
[0.07, 0.16, (0.679, 0.069)]
[0.0, 0.0, (0.118, 0.137)]

% population 15 - 24 years
[0.08, 0.15, (0.408, 0.03)]
[0.0, 0.0, (0.0, 0.0)]

% population 25 - 39 years
[2.47, 1.66, (0.574, 0.042)]
[0.0, 0.0, (0.0, 0.001)]

% population 40 - 59 years
[7.6, 1.15, (0.94, 0.061)]
[0.0, 0.289, (0.652, 0.244)]

% population +60 years
[2.87, 1.64, (1.277, 0.085)]
[0.0, 0.0, (0.123, 0.13)]

% urban population
[21.35, 6.77, (2.069, 0.133)]
[0.0, 0.0, (0.0, 0.0)]

IDHM - income
[25.06, 2.5, (0.928, 0.078)]
[0.0, 0.0, (0.563, 0.265)]

Gini coefficient
[0.22, 0.37, (0.759, 0.046)]
[0.0, 0.0, (0.089, 0.077)]

Social transfer
per capita
[0.05, 0.1, (0.59, 0.059)]
[0.0, 0.0, (0.058, 0.067)]

Activity rate
[2.35, 0.62, (0.416, 0.026)]
[0.0, 0.0, (0.0, 0.0)]

% informal workers
[0.02, 0.08, (0.33, 0.03)]
[0.0, 0.0, (0.0, 0.0)]

% employed in agriculture
[0.06, 0.22, (0.413, 0.028)]
[0.0, 0.0, (0.0, 0.0

Unnamed: 0,attribute,odds_ratio_control_opposite_end,p_value_control_opposite_end,odds_ratio_control_subtracted,p_value_control_subtracted,odds_ratio_control_random_weighted,p_value_control_random_weighted
0,% population 0 - 5 years,0.1,0.0,0.17,0.0,"(0.663, 0.07)","(0.087, 0.104)"
1,% population 6 - 14 years,0.07,0.0,0.16,0.0,"(0.679, 0.069)","(0.118, 0.137)"
2,% population 15 - 24 years,0.08,0.0,0.15,0.0,"(0.408, 0.03)","(0.0, 0.0)"
3,% population 25 - 39 years,2.47,0.0,1.66,0.0,"(0.574, 0.042)","(0.0, 0.001)"
4,% population 40 - 59 years,7.6,0.0,1.15,0.289,"(0.94, 0.061)","(0.652, 0.244)"
5,% population +60 years,2.87,0.0,1.64,0.0,"(1.277, 0.085)","(0.123, 0.13)"
6,% urban population,21.35,0.0,6.77,0.0,"(2.069, 0.133)","(0.0, 0.0)"
7,IDHM - income,25.06,0.0,2.5,0.0,"(0.928, 0.078)","(0.563, 0.265)"
8,Gini coefficient,0.22,0.0,0.37,0.0,"(0.759, 0.046)","(0.089, 0.077)"
9,Social transfer\nper capita,0.05,0.0,0.1,0.0,"(0.59, 0.059)","(0.058, 0.067)"


In [6]:
# Persist the odds-ratio table; the row index is written as the first CSV column.
filename = 'data/df_oddsratio.csv'
df_oddsratio.to_csv(path_or_buf=filename, index=True)

In [7]:
# Display the odds-ratio table that was just written to disk.
df_oddsratio

Unnamed: 0,attribute,odds_ratio_control_opposite_end,p_value_control_opposite_end,odds_ratio_control_subtracted,p_value_control_subtracted,odds_ratio_control_random_weighted,p_value_control_random_weighted
0,% population 0 - 5 years,0.1,0.0,0.17,0.0,"(0.663, 0.07)","(0.087, 0.104)"
1,% population 6 - 14 years,0.07,0.0,0.16,0.0,"(0.679, 0.069)","(0.118, 0.137)"
2,% population 15 - 24 years,0.08,0.0,0.15,0.0,"(0.408, 0.03)","(0.0, 0.0)"
3,% population 25 - 39 years,2.47,0.0,1.66,0.0,"(0.574, 0.042)","(0.0, 0.001)"
4,% population 40 - 59 years,7.6,0.0,1.15,0.289,"(0.94, 0.061)","(0.652, 0.244)"
5,% population +60 years,2.87,0.0,1.64,0.0,"(1.277, 0.085)","(0.123, 0.13)"
6,% urban population,21.35,0.0,6.77,0.0,"(2.069, 0.133)","(0.0, 0.0)"
7,IDHM - income,25.06,0.0,2.5,0.0,"(0.928, 0.078)","(0.563, 0.265)"
8,Gini coefficient,0.22,0.0,0.37,0.0,"(0.759, 0.046)","(0.089, 0.077)"
9,Social transfer\nper capita,0.05,0.0,0.1,0.0,"(0.59, 0.059)","(0.058, 0.067)"


In [8]:
# Summary statistics of the urban-population percentage within each k5_label group.
(
    df_base
    .loc[:, ['k5_label', 'PERCENTUAL_POPULACAO_URBANA']]
    .groupby('k5_label')
    .describe()
)

Unnamed: 0_level_0,PERCENTUAL_POPULACAO_URBANA,PERCENTUAL_POPULACAO_URBANA,PERCENTUAL_POPULACAO_URBANA,PERCENTUAL_POPULACAO_URBANA,PERCENTUAL_POPULACAO_URBANA,PERCENTUAL_POPULACAO_URBANA,PERCENTUAL_POPULACAO_URBANA,PERCENTUAL_POPULACAO_URBANA
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
k5_label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
High SV,1078.0,60.832646,17.324341,8.908116,48.308254,61.928538,74.302471,98.904179
Intermediate SV,948.0,78.37563,12.797314,29.62034,69.747152,79.68514,88.160414,100.0
Rural with low SV,825.0,45.433594,15.674658,5.521845,33.333333,46.51105,56.983161,92.593402
Very high SV,1538.0,48.939215,16.819225,4.178855,36.555097,48.736304,60.966647,100.0
Very low SV,1176.0,87.215144,9.844206,41.777151,81.136576,89.223446,95.022127,100.0
