# Aggregate demographics

## Set-up

### Packages and file paths

In [1]:
# Import required packages
from dataclasses import dataclass
from kailo_beewell_dashboard.response_labels import (
    create_symbol_response_label_dict)
from kailo_beewell_dashboard.synthesise_aggregate import (
    aggregate_proportions)
import numpy as np
import os
import pandas as pd

In [2]:
# File paths
@dataclass(frozen=True)
class Paths:
    '''Stores paths to data and files

    Each attribute carries a type annotation so that it is registered as a
    real dataclass field. Without annotations the decorator sees no fields,
    which leaves repr(paths) empty and stops values being overridden through
    the constructor, e.g. Paths(data='some/other/folder').
    '''
    # Relative path to the folder holding the survey data files
    data: str = '../../data/survey_data'
    # Filename of the raw synthetic data (joined onto `data` when loading)
    synthetic_data: str = 'symbol_synthetic_data_raw.csv'
    # Filename of the aggregated demographic results (joined onto `data`
    # when saving)
    aggregate: str = 'symbol_nd_aggregate_demographic.csv'


paths = Paths()

### Import raw data

In [3]:
# Build the full path to the raw synthetic data, read it in, then preview
# the first few rows
raw_data_path = os.path.join(paths.data, paths.synthetic_data)
data = pd.read_csv(raw_data_path)
data.head()

Unnamed: 0,symbol_family,symbol_home,symbol_friends,symbol_choice,symbol_things,symbol_health,symbol_future,symbol_school,symbol_free,symbol_life,...,symbol_school_lab,symbol_free_lab,symbol_life_lab,gender_lab,year_group_lab,fsm_lab,sen_lab,ethnicity_lab,english_additional_lab,school_lab
0,2.0,2,1.0,1.0,3.0,1,1.0,2.0,2.0,3.0,...,Ok,Ok,Sad,Male,Year 11,FSM,SEN,Ethnic minority,Yes,School B
1,,1,1.0,2.0,2.0,3,2.0,,2.0,1.0,...,,Ok,Happy,Female,Year 10,Non-FSM,Non-SEN,Ethnic minority,Yes,School A
2,,1,2.0,2.0,2.0,1,2.0,2.0,1.0,1.0,...,Ok,Happy,Happy,Female,Year 11,FSM,Non-SEN,White British,Yes,School B
3,,2,3.0,,2.0,1,2.0,1.0,2.0,3.0,...,Happy,Ok,Sad,Female,Year 11,Non-FSM,SEN,White British,Yes,School B
4,3.0,1,2.0,2.0,2.0,2,1.0,2.0,2.0,1.0,...,Ok,Ok,Happy,Female,Year 10,FSM,Non-SEN,White British,No,School B


## Aggregate data

In [4]:
# Demographic columns that will be aggregated
response_col = [
    'gender',
    'year_group',
    'fsm',
    'sen',
    'ethnicity',
    'english_additional',
]

# Lookup of response value -> display label for each variable
labels = create_symbol_response_label_dict()

# Missing responses (NaN) get their own 'No data' label for every
# demographic column
for col in response_col:
    labels[col][np.nan] = 'No data'

# Preview two examples
for example in ('fsm', 'year_group'):
    print(labels[example])

{0: 'Non-FSM', 1: 'FSM', nan: 'No data'}
{7: 'Year 7', 8: 'Year 8', 9: 'Year 9', 10: 'Year 10', 11: 'Year 11', nan: 'No data'}


We don't need to use `aggregate_demographic()` here, as we are not comparing one school vs. all other schools - we just want to aggregate the entire dataframe, so we call `aggregate_proportions()` directly.

In [5]:
# Aggregate the whole dataframe (no per-school split) for every demographic
# column listed in response_col.
# NOTE(review): hide_low_response=True presumably suppresses small counts
# (year_group shows NaN counts in the output) - confirm against the package.
result = aggregate_proportions(
    data=data,
    response_col=response_col,
    labels=labels,
    hide_low_response=True,
)

Add labels for each measure

In [6]:
# Human-readable name for each demographic measure
measure_labels = dict(
    gender='Gender',
    year_group='Year group',
    fsm='Free school meals',
    sen='Special educational needs',
    ethnicity='Ethnicity',
    english_additional='English as an additional language',
)

# Look up the label for each row's measure and store it in a new column
measure_lab = result['measure'].map(measure_labels)
result['measure_lab'] = measure_lab

# View full dataframe
result

Unnamed: 0,cat,cat_lab,count,percentage,measure,n_responses,measure_lab
0,"[0, 1, nan]","[Male, Female, No data]","[18, 32, 10]","[30.0, 53.333333333333336, 16.666666666666664]",gender,60.0,Gender
0,"[7, 8, 9, 10, 11, nan]","[Year 7, Year 8, Year 9, Year 10, Year 11, No ...","[nan, 5, 17, 13, 11, nan]","[nan, 8.333333333333332, 28.333333333333332, 2...",year_group,46.0,Year group
0,"[0, 1, nan]","[Non-FSM, FSM, No data]","[27, 33, 0]","[45.0, 55.00000000000001, 0.0]",fsm,60.0,Free school meals
0,"[0, 1, nan]","[Non-SEN, SEN, No data]","[23, 29, 8]","[38.333333333333336, 48.333333333333336, 13.33...",sen,60.0,Special educational needs
0,"[1, 2, nan]","[Ethnic minority, White British, No data]","[30, 23, 7]","[50.0, 38.333333333333336, 11.666666666666666]",ethnicity,60.0,Ethnicity
0,"[0, 1, nan]","[No, Yes, No data]","[23, 32, 5]","[38.333333333333336, 53.333333333333336, 8.333...",english_additional,60.0,English as an additional language


## Save results to CSV

In [7]:
# Write the aggregated results alongside the input data, dropping the index
# and writing missing values as the string 'NULL'
aggregate_path = os.path.join(paths.data, paths.aggregate)
result.to_csv(aggregate_path, index=False, na_rep='NULL')