# Summary statistics

In [51]:
import numpy as np
import pandas as pd

## Loading data

In [52]:
# Base table of variables; the first CSV column is used as the row index.
df_variables = pd.read_csv(
    "data/output/df_base_without_missing_points.csv",
    index_col=0,
)

In [53]:
# Cluster assignments (provides the 'cluster_label' column used later); the
# first CSV column is used as the row index.
df_labeled_cluster = pd.read_csv(
    'data/output/df_labeled_cluster.csv',
    index_col=0,
)

## Analysis

### Analysis for national data

In [54]:
# Keep only the columns from position 7 onward; the first seven columns are
# excluded from the summary (presumably identifiers/metadata - TODO confirm).
analysis_columns = df_variables.columns[7:]
df_variables_analysis = df_variables.loc[:, analysis_columns]

In [55]:
# One record per analysis variable, with the mean/SD and the median/quartiles
# pre-formatted to two decimals as display strings.
statistics = []
for column, values in df_variables_analysis.items():
    mean_sd = f"{values.mean():.2f} ({values.std():.2f})"
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)
    median_iqr = f"{values.median():.2f} ({lower_quartile:.2f}, {upper_quartile:.2f})"
    statistics.append({
        'Variable': column,
        'Mean (SD)': mean_sd,
        'Median (Q1, Q3)': median_iqr,
    })

# Assemble the records into the summary table (one row per variable).
df_statistics = pd.DataFrame(statistics)

In [56]:
# Use the variable names as the row index. The guard keeps this cell safe to
# re-run: once 'Variable' has become the index it is no longer a column, and an
# unconditional set_index('Variable') would raise KeyError.
if 'Variable' in df_statistics.columns:
    df_statistics = df_statistics.set_index('Variable')

In [57]:
# Number of summarized variables (one row per analysis column).
df_statistics.shape[0]

31

In [58]:
# Show the raw variable names, in the order the display labels must follow.
df_statistics.index

Index(['percentage_population_age_range_0_19',
       'percentage_population_age_range_20_39',
       'percentage_population_age_range_40_59',
       'percentage_population_age_range_60_more',
       'percentage_urban_population', 'average_residents_per_households',
       'percentage_population_in_households_more_2_residents_per_bedroom',
       'percentage_male_population', 'percentage_indigenous_population',
       'percentage_black_and_brown_population', 'life_expectancy_at_birth',
       'density_median_effectively_domiciled_area', 'per_capita_income',
       'gini', 'per_capita_cash_transfer_program',
       'percentage_estimated_households_in_informal_settlements',
       'percentage_population_in_informal_settlements',
       'demographic_density_in_informal_settlements',
       'percentage_population_in_households_without_bathroom',
       'percentage_hospitalizations_diseases_inadequate_sanitation',
       'activity_rate', 'percentage_self_employed_workers',
       'unemploym

In [59]:
# Display labels for the summary tables. They are assigned to the table index
# positionally, so the order here must match the order of the analysis columns.
variable_labels = [
    # Age structure
    '% population 0-19 years',
    '% population 20-39 years',
    '% population 40-59 years',
    '% population 60+ years',
    # Urbanization, households and population composition
    '% urban population',
    'Average household size',
    '% crowded households',
    '% male population',
    '% indigenous population',
    '% black and brown population',
    'Life expectancy (years)',
    'Median density of effectively domiciled areas (inhabitants/km²)',
    # Income and inequality
    'Per capita income (BRL)',
    'Gini coefficient',
    'Social transfer per capita (BRL)',
    # Informal settlements and sanitation
    '% informal settlement households',
    '% population in informal settlements',
    'Population density in informal settlement (inhabitants/ha)',
    '% households without bathroom',
    '% sanitation-related hospitalizations',
    # Labour market
    'Activity rate',
    '% self-employed workers',
    'Unemployment rate',
    '% informal workers',
    '% poor population spending 1+ hour to work',
    '% agriculture workers',
    '% commerce workers',
    '% service workers',
    '% industry workers',
    # Education
    'Illiteracy rate',
    'Expected years of schooling at age 18',
]

In [60]:
# Swap the raw variable names for the display labels. The mapping is purely
# positional, so it relies on variable_labels following the row order.
df_statistics.index = pd.Index(variable_labels)

In [61]:
# Display the labels for a visual check of their order.
variable_labels

['% population 0-19 years',
 '% population 20-39 years',
 '% population 40-59 years',
 '% population 60+ years',
 '% urban population',
 'Average household size',
 '% crowded households',
 '% male population',
 '% indigenous population',
 '% black and brown population',
 'Life expectancy (years)',
 'Median density of effectively domiciled areas (inhabitants/km²)',
 'Per capita income (BRL)',
 'Gini coefficient',
 'Social transfer per capita (BRL)',
 '% informal settlement households',
 '% population in informal settlements',
 'Population density in informal settlement (inhabitants/ha)',
 '% households without bathroom',
 '% sanitation-related hospitalizations',
 'Activity rate',
 '% self-employed workers',
 'Unemployment rate',
 '% informal workers',
 '% poor population spending 1+ hour to work',
 '% agriculture workers',
 '% commerce workers',
 '% service workers',
 '% industry workers',
 'Illiteracy rate',
 'Expected years of schooling at age 18']

In [62]:
# Reorder the rows so that the label at position 11 (median density) comes
# right after the four age-group labels; all other labels keep their order.
row_order = (
    variable_labels[:4]
    + variable_labels[11:12]
    + variable_labels[4:11]
    + variable_labels[12:]
)
df_statistics_sorted = df_statistics.loc[row_order]

In [63]:
# Display the reordered national summary table.
df_statistics_sorted

Unnamed: 0,Mean (SD),"Median (Q1, Q3)"
% population 0-19 years,27.55 (4.85),"26.88 (24.27, 30.12)"
% population 20-39 years,28.76 (2.96),"28.87 (26.96, 30.57)"
% population 40-59 years,26.28 (2.66),"26.72 (24.88, 28.07)"
% population 60+ years,17.43 (4.59),"17.26 (14.33, 20.27)"
Median density of effectively domiciled areas (inhabitants/km²),2880.61 (2043.02),"2447.15 (1585.06, 3610.04)"
% urban population,63.81 (22.04),"64.64 (47.07, 82.16)"
Average household size,2.84 (0.32),"2.78 (2.65, 2.95)"
% crowded households,25.13 (12.99),"23.07 (15.41, 32.58)"
% male population,50.00 (1.58),"49.89 (49.07, 50.77)"
% indigenous population,1.20 (6.14),"0.07 (0.03, 0.19)"


In [64]:
# Save the national summary table; the label index is written as the first
# CSV column.
df_statistics_sorted.to_csv(
    'data/output/df_sociodemographic_statistics.csv',
    index=True,
)

### Analysis by cluster

In [65]:
# Attach each row's cluster label. Values are aligned on the row index, so rows
# whose index is absent from df_labeled_cluster end up with a missing label
# (both tables presumably share the same index - TODO confirm).
df_variables = df_variables.assign(
    cluster_label=df_labeled_cluster['cluster_label']
)

In [66]:
# Re-select the columns from position 7 onward so that the selection now also
# carries the 'cluster_label' column added to df_variables.
analysis_columns = df_variables.columns[7:]
df_variables_analysis = df_variables.loc[:, analysis_columns]

In [67]:
# Cluster names in the order they appear as columns of the per-cluster table.
labels_sorted = [
    'Urbanized',
    'Urbanized with informal settlements',
    'Semi-urbanized',
    'Rural with high human development',
    'Rural with low human development',
]

In [68]:
# Median (Q1, Q3) of every analysis variable within each cluster, formatted to
# two decimals as display strings.
#
# The per-cluster subsets do not depend on the variable being summarized, so
# they are built once up front instead of being re-filtered for every
# (column, cluster) pair.
cluster_subsets = {
    cluster: df_variables_analysis[df_variables_analysis['cluster_label'] == cluster]
    for cluster in labels_sorted
}

statistics = []
# The last column is skipped: it is expected to be 'cluster_label' itself,
# which is the grouping key rather than a variable to summarize.
for column in df_variables_analysis.columns[:-1]:
    row = {'Variable': column}
    for cluster, df_variables_analysis_cluster in cluster_subsets.items():
        values = df_variables_analysis_cluster[column]
        median = values.median()
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        row[cluster] = f"{median:.2f} ({q1:.2f}, {q3:.2f})"
    statistics.append(row)

# Create a new DataFrame with the calculated statistics
df_statistics = pd.DataFrame(statistics)

In [69]:
# Use the variable names as the row index. The guard keeps this cell safe to
# re-run: once 'Variable' has become the index it is no longer a column, and an
# unconditional set_index('Variable') would raise KeyError.
if 'Variable' in df_statistics.columns:
    df_statistics = df_statistics.set_index('Variable')

In [70]:
# Display the per-cluster table, still indexed by the raw variable names.
df_statistics

Unnamed: 0_level_0,Urbanized,Urbanized with informal settlements,Semi-urbanized,Rural with high human development,Rural with low human development
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
percentage_population_age_range_0_19,"25.06 (23.44, 26.83)","27.79 (25.70, 30.45)","29.60 (27.60, 31.81)","23.05 (21.10, 24.95)","38.21 (35.66, 41.98)"
percentage_population_age_range_20_39,"28.63 (27.03, 30.30)","31.59 (30.13, 33.23)","29.29 (27.94, 30.68)","25.59 (23.88, 27.16)","30.77 (29.41, 31.96)"
percentage_population_age_range_40_59,"27.57 (26.69, 28.48)","26.81 (24.99, 27.77)","25.16 (23.92, 26.39)","28.38 (27.40, 29.44)","20.12 (18.37, 21.86)"
percentage_population_age_range_60_more,"18.56 (16.24, 20.80)","13.59 (11.14, 15.99)","15.74 (13.77, 17.88)","22.53 (20.23, 25.41)","10.26 (8.20, 12.53)"
percentage_urban_population,"82.69 (74.07, 90.72)","95.17 (88.42, 99.28)","54.48 (41.14, 65.88)","47.36 (34.06, 58.47)","43.27 (31.43, 53.02)"
average_residents_per_households,"2.69 (2.61, 2.78)","2.83 (2.71, 2.96)","2.92 (2.80, 3.05)","2.62 (2.54, 2.71)","3.55 (3.34, 4.02)"
percentage_population_in_households_more_2_residents_per_bedroom,"17.50 (13.50, 22.00)","31.27 (25.54, 38.49)","30.88 (25.62, 36.66)","11.63 (8.24, 15.72)","53.40 (45.70, 65.30)"
percentage_male_population,"49.53 (48.89, 50.20)","48.16 (47.53, 48.86)","50.01 (49.21, 50.86)","50.65 (49.99, 51.32)","51.18 (50.34, 51.91)"
percentage_indigenous_population,"0.07 (0.03, 0.13)","0.15 (0.09, 0.29)","0.09 (0.03, 0.31)","0.03 (0.00, 0.10)","0.16 (0.05, 8.18)"
percentage_black_and_brown_population,"43.96 (32.51, 56.62)","68.82 (55.23, 74.98)","74.63 (66.63, 80.69)","25.56 (15.04, 41.89)","81.48 (74.50, 85.97)"


In [71]:
# Swap the raw variable names for the display labels. The mapping is purely
# positional, so it relies on variable_labels following the row order.
df_statistics.index = pd.Index(variable_labels)

In [72]:
# Reorder the rows so that the label at position 11 (median density) comes
# right after the four age-group labels; all other labels keep their order.
row_order = (
    variable_labels[:4]
    + variable_labels[11:12]
    + variable_labels[4:11]
    + variable_labels[12:]
)
df_statistics_sorted = df_statistics.loc[row_order]

In [73]:
# Display the reordered per-cluster summary table.
df_statistics_sorted

Unnamed: 0,Urbanized,Urbanized with informal settlements,Semi-urbanized,Rural with high human development,Rural with low human development
% population 0-19 years,"25.06 (23.44, 26.83)","27.79 (25.70, 30.45)","29.60 (27.60, 31.81)","23.05 (21.10, 24.95)","38.21 (35.66, 41.98)"
% population 20-39 years,"28.63 (27.03, 30.30)","31.59 (30.13, 33.23)","29.29 (27.94, 30.68)","25.59 (23.88, 27.16)","30.77 (29.41, 31.96)"
% population 40-59 years,"27.57 (26.69, 28.48)","26.81 (24.99, 27.77)","25.16 (23.92, 26.39)","28.38 (27.40, 29.44)","20.12 (18.37, 21.86)"
% population 60+ years,"18.56 (16.24, 20.80)","13.59 (11.14, 15.99)","15.74 (13.77, 17.88)","22.53 (20.23, 25.41)","10.26 (8.20, 12.53)"
Median density of effectively domiciled areas (inhabitants/km²),"2737.49 (1979.37, 3600.55)","6650.86 (4888.58, 9000.86)","2477.85 (1713.30, 3641.04)","1174.42 (732.94, 1706.26)","2520.89 (1782.06, 3771.44)"
% urban population,"82.69 (74.07, 90.72)","95.17 (88.42, 99.28)","54.48 (41.14, 65.88)","47.36 (34.06, 58.47)","43.27 (31.43, 53.02)"
Average household size,"2.69 (2.61, 2.78)","2.83 (2.71, 2.96)","2.92 (2.80, 3.05)","2.62 (2.54, 2.71)","3.55 (3.34, 4.02)"
% crowded households,"17.50 (13.50, 22.00)","31.27 (25.54, 38.49)","30.88 (25.62, 36.66)","11.63 (8.24, 15.72)","53.40 (45.70, 65.30)"
% male population,"49.53 (48.89, 50.20)","48.16 (47.53, 48.86)","50.01 (49.21, 50.86)","50.65 (49.99, 51.32)","51.18 (50.34, 51.91)"
% indigenous population,"0.07 (0.03, 0.13)","0.15 (0.09, 0.29)","0.09 (0.03, 0.31)","0.03 (0.00, 0.10)","0.16 (0.05, 8.18)"


In [74]:
# Save the per-cluster summary table; the label index is written as the first
# CSV column.
df_statistics_sorted.to_csv(
    'data/output/df_sociodemographic_cluster_statistics.csv',
    index=True,
)