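# Summary statistics

Descriptive statistics (mean, standard deviation, median and quartiles) of the sociodemographic variables, computed for the national data and for each cluster, and exported as CSV tables.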

In [32]:
import numpy as np
import pandas as pd

## Loading data

In [33]:
# Load the variables table; the first CSV column becomes the row index.
df_variables = pd.read_csv("data/output/df_base_without_missing_points.csv", index_col=0)

In [34]:
# Load the table holding the 'cluster_label' column; the first CSV column becomes
# the row index, which is what the later assignment into df_variables aligns on.
df_labeled_cluster = pd.read_csv('data/output/df_labeled_cluster.csv', index_col=0)

## Analysis

### Analysis for national data

In [35]:
# Keep every column from position 5 onward as the variables to summarize.
# NOTE(review): the first five columns are presumably identifiers/metadata — confirm.
df_variables_analysis = df_variables.iloc[:, 5:]

In [36]:
# Summarize each variable as "mean (SD)" and "median (Q1, Q3)",
# with every number rendered to two decimals.
statistics = []
for column in df_variables_analysis.columns:
    values = df_variables_analysis[column]
    # Both quartiles in a single call; unpacking yields them in the requested order.
    q1, q3 = values.quantile([0.25, 0.75])
    mean_sd = f"{values.mean():.2f} ({values.std():.2f})"
    median_iqr = f"{values.median():.2f} ({q1:.2f}, {q3:.2f})"
    statistics.append({
        'Variable': column,
        'Mean (SD)': mean_sd,
        'Median (Q1, Q3)': median_iqr,
    })

# One row per variable; columns follow the key order of the dicts above.
df_statistics = pd.DataFrame(statistics)

In [37]:
# Use the variable name as the row index. This rebinds df_statistics, so the
# 'Variable' column no longer exists afterwards and the cell cannot be re-run as is.
df_statistics = df_statistics.set_index('Variable')

In [38]:
# Number of summarized variables (one row per variable).
len(df_statistics)

31

In [39]:
# Raw variable names, in the order the display labels are later assigned in.
df_statistics.index

Index(['percentage_population_age_range_0_19',
       'percentage_population_age_range_20_39',
       'percentage_population_age_range_40_59',
       'percentage_population_age_range_60_more',
       'percentage_urban_population', 'demographic_density',
       'average_residents_per_households',
       'percentage_population_in_households_more_2_residents_per_bedroom',
       'percentage_male_population', 'percentage_indigenous_population',
       'percentage_black_and_brown_population', 'life_expectancy_at_birth',
       'per_capita_income', 'gini', 'per_capita_cash_transfer_program',
       'percentage_estimated_households_in_informal_settlements',
       'percentage_population_in_informal_settlements',
       'demographic_density_in_informal_settlements',
       'percentage_population_in_households_without_bathroom',
       'percentage_hospitalizations_diseases_inadequate_sanitation',
       'activity_rate', 'percentage_self_employed_workers',
       'unemployment_rate', 'percentage

In [40]:
# Display labels for the summarized variables.
# Order is significant: the labels are assigned positionally to df_statistics.index,
# and later cells slice this list by position (e.g. position 11 is life expectancy).
variable_labels = [
    # Age structure
    '% population 0-19 years',
    '% population 20-39 years',
    '% population 40-59 years',
    '% population 60+ years',
    # Urbanization, density and household composition
    '% urban population',
    'Population density (inhabitants/km²)',
    'Average household size',
    '% crowded households',
    # Sex and race/ethnicity
    '% male population',
    '% indigenous population',
    '% black and brown population',
    # Life expectancy, income and inequality
    'Life expectancy (years)',
    'Per capita income (BRL)',
    'Gini coefficient',
    'Social transfer per capita (BRL)',
    # Informal settlements and sanitation
    '% informal settlement households',
    '% population in informal settlements',
    'Population density in informal settlement (inhabitants/ha)',
    '% households without bathroom',
    '% sanitation-related hospitalizations',
    # Labour market and commuting
    'Activity rate',
    '% self-employed workers',
    'Unemployment rate',
    '% informal workers',
    '% poor population spending 1+ hour to work',
    '% agriculture workers',
    '% commerce workers',
    '% service workers',
    '% industry workers',
    # Education
    'Illiteracy rate',
    'Expected years of schooling at age 18',
]

In [41]:
# Replace the raw variable names with the display labels. The assignment is
# positional, so variable_labels must list the labels in the same order as the rows.
df_statistics.index = variable_labels

In [42]:
# Show the labels now used as the index of df_statistics.
variable_labels

['% population 0-19 years',
 '% population 20-39 years',
 '% population 40-59 years',
 '% population 60+ years',
 '% urban population',
 'Population density (inhabitants/km²)',
 'Average household size',
 '% crowded households',
 '% male population',
 '% indigenous population',
 '% black and brown population',
 'Life expectancy (years)',
 'Per capita income (BRL)',
 'Gini coefficient',
 'Social transfer per capita (BRL)',
 '% informal settlement households',
 '% population in informal settlements',
 'Population density in informal settlement (inhabitants/ha)',
 '% households without bathroom',
 '% sanitation-related hospitalizations',
 'Activity rate',
 '% self-employed workers',
 'Unemployment rate',
 '% informal workers',
 '% poor population spending 1+ hour to work',
 '% agriculture workers',
 '% commerce workers',
 '% service workers',
 '% industry workers',
 'Illiteracy rate',
 'Expected years of schooling at age 18']

In [43]:
# Row order for the table: the four age groups, then life expectancy
# (position 11 in variable_labels), then the remaining labels in their original order.
row_order = (
    variable_labels[:4]
    + variable_labels[11:12]
    + variable_labels[4:11]
    + variable_labels[12:]
)
df_statistics_sorted = df_statistics.loc[row_order]

In [44]:
# Display the reordered national summary table.
df_statistics_sorted

Unnamed: 0,Mean (SD),"Median (Q1, Q3)"
% population 0-19 years,27.55 (4.85),"26.88 (24.27, 30.12)"
% population 20-39 years,28.76 (2.96),"28.87 (26.96, 30.57)"
% population 40-59 years,26.28 (2.66),"26.72 (24.88, 28.07)"
% population 60+ years,17.43 (4.59),"17.26 (14.33, 20.27)"
Life expectancy (years),73.09 (2.68),"73.47 (71.15, 75.16)"
% urban population,63.81 (22.04),"64.64 (47.07, 82.16)"
Population density (inhabitants/km²),116.04 (596.27),"24.27 (11.34, 53.50)"
Average household size,2.84 (0.32),"2.78 (2.65, 2.95)"
% crowded households,25.13 (12.99),"23.07 (15.41, 32.58)"
% male population,50.00 (1.58),"49.89 (49.07, 50.77)"


In [45]:
# Export the national summary table to CSV, writing the label index as the first column.
df_statistics_sorted.to_csv('data/output/df_sociodemographic_statistics.csv', index=True)

### Analysis by cluster

In [46]:
# Attach each row's cluster label. pandas aligns the assigned Series on the row
# index, so rows of df_variables with no matching index in df_labeled_cluster get NaN.
# Note: this adds a column to df_variables in place, so the national-analysis slice
# `df_variables.columns[5:]` would include 'cluster_label' if re-run after this cell.
df_variables['cluster_label'] = df_labeled_cluster['cluster_label']

In [47]:
# Same positional slice as the national analysis (columns from position 5 onward);
# the result now also carries the 'cluster_label' column added to df_variables.
df_variables_analysis = df_variables.iloc[:, 5:]

In [48]:
# Cluster labels in the column order used for the per-cluster summary table.
labels_sorted = [
    'Urbanized',
    'Urbanized with informal settlements',
    'Semi-urbanized',
    'Rural with high human development',
    'Rural with low human development',
]

In [49]:
# Per-cluster summary: for every variable, "median (Q1, Q3)" within each cluster,
# with every number rendered to two decimals.

# Split the frame once per cluster instead of re-filtering the whole frame for
# every (variable, cluster) pair; dict order follows labels_sorted, which fixes
# the column order of the resulting table.
cluster_frames = {
    cluster: df_variables_analysis[df_variables_analysis['cluster_label'] == cluster]
    for cluster in labels_sorted
}

statistics = []
# Exclude the label column by name rather than by position, so a real variable is
# never skipped (nor the label summarized) if 'cluster_label' is not the last column.
for column in df_variables_analysis.columns.drop('cluster_label'):
    row = {'Variable': column}
    for cluster, cluster_frame in cluster_frames.items():
        values = cluster_frame[column]
        median = values.median()
        # Both quartiles in a single call; unpacking yields them in the requested order.
        q1, q3 = values.quantile([0.25, 0.75])
        row[cluster] = f"{median:.2f} ({q1:.2f}, {q3:.2f})"
    statistics.append(row)

# Create a new DataFrame with the calculated statistics
# (one row per variable, one column per cluster).
df_statistics = pd.DataFrame(statistics)

In [50]:
# Use the variable name as the row index. This rebinds df_statistics, so the
# 'Variable' column no longer exists afterwards and the cell cannot be re-run as is.
df_statistics = df_statistics.set_index('Variable')

In [51]:
# Replace the raw variable names with the display labels. The assignment is
# positional, so variable_labels must list the labels in the same order as the rows.
df_statistics.index = variable_labels

In [52]:
# Row order for the table: the four age groups, then life expectancy
# (position 11 in variable_labels), then the remaining labels in their original order.
row_order = (
    variable_labels[:4]
    + variable_labels[11:12]
    + variable_labels[4:11]
    + variable_labels[12:]
)
df_statistics_sorted = df_statistics.loc[row_order]

In [53]:
# Display the reordered per-cluster summary table.
df_statistics_sorted

Unnamed: 0,Urbanized,Urbanized with informal settlements,Semi-urbanized,Rural with high human development,Rural with low human development
% population 0-19 years,"25.12 (23.46, 26.87)","27.80 (25.82, 30.43)","29.63 (27.61, 31.83)","23.10 (21.13, 24.97)","38.23 (35.62, 41.96)"
% population 20-39 years,"28.66 (27.06, 30.35)","31.74 (30.17, 33.37)","29.31 (27.95, 30.70)","25.55 (23.86, 27.18)","30.78 (29.43, 31.95)"
% population 40-59 years,"27.56 (26.68, 28.48)","26.82 (25.09, 27.79)","25.15 (23.91, 26.36)","28.37 (27.38, 29.44)","20.15 (18.38, 21.85)"
% population 60+ years,"18.46 (16.19, 20.76)","13.29 (11.09, 15.90)","15.72 (13.74, 17.83)","22.51 (20.26, 25.38)","10.25 (8.21, 12.52)"
Life expectancy (years),"75.06 (74.01, 76.00)","74.39 (73.23, 75.70)","70.98 (69.68, 72.29)","74.66 (73.57, 75.80)","70.09 (68.47, 71.41)"
% urban population,"82.90 (74.07, 90.82)","96.07 (89.29, 99.72)","54.68 (41.15, 66.28)","47.14 (34.06, 58.35)","43.33 (31.62, 53.58)"
Population density (inhabitants/km²),"32.92 (16.82, 74.94)","463.72 (158.88, 1819.34)","20.59 (8.81, 46.51)","18.77 (11.27, 28.25)","10.62 (2.39, 20.67)"
Average household size,"2.70 (2.61, 2.78)","2.83 (2.71, 2.97)","2.92 (2.80, 3.05)","2.62 (2.54, 2.71)","3.55 (3.34, 4.02)"
% crowded households,"17.59 (13.56, 22.22)","31.84 (26.42, 38.90)","30.91 (25.67, 36.69)","11.81 (8.39, 16.03)","53.43 (45.55, 65.29)"
% male population,"49.52 (48.86, 50.19)","48.22 (47.57, 48.89)","50.00 (49.20, 50.85)","50.65 (50.00, 51.32)","51.18 (50.38, 51.90)"


In [54]:
# Export the per-cluster summary table to CSV, writing the label index as the first column.
df_statistics_sorted.to_csv('data/output/df_sociodemographic_cluster_statistics.csv', index=True)