# Aggregate demographics

## Set-up

### Packages and file paths

In [1]:
# Import required packages
from dataclasses import dataclass
from kailo_beewell_dashboard.response_labels import create_response_label_dict
from kailo_beewell_dashboard.synthesise_aggregate import aggregate_proportions
from kailo_beewell_dashboard.synthesise_demographic import (
    add_standard_demographic_groups,
    add_standard_demographic_response_labels)
import numpy as np
import os
import pandas as pd

In [2]:
# File paths
@dataclass(frozen=True)
class Paths:
    '''
    Stores paths to data and files

    Each attribute carries a type annotation so that the dataclass decorator
    registers it as a field (un-annotated class attributes are ignored by
    dataclass and would not appear in the generated __init__ or __repr__).
    '''
    # Folder that the raw data is read from and the aggregate is saved to
    survey: str = '../../data/survey_data'
    # File name of the raw synthetic data, within the survey folder
    synthetic_data: str = 'standard_synthetic_data_raw.csv'
    # File name for the aggregated demographic results, within the survey
    # folder
    aggregate: str = 'standard_nd_aggregate_demographic.csv'


paths = Paths()

### Import raw data

In [3]:
# Read the raw synthetic data from the survey folder and preview it
raw_data_path = os.path.join(paths.survey, paths.synthetic_data)
data = pd.read_csv(raw_data_path)
data.head()

Unnamed: 0,gender,transgender,sexual_orientation,neurodivergent,birth_parent1,birth_parent2,birth_you,birth_you_age,autonomy_pressure,autonomy_express,...,peer_talk_listen_lab,peer_talk_helpful_lab,peer_talk_if_lab,accept_peer_lab,year_group_lab,fsm_lab,sen_lab,ethnicity_lab,english_additional_lab,school_lab
0,4.0,2.0,6.0,3.0,2.0,,2.0,1.0,2.0,5.0,...,Fully,Somewhat helpful,,Not at all,Year 10,Non-FSM,Non-SEN,Ethnic minority,,School E
1,,2.0,1.0,3.0,3.0,2.0,3.0,8.0,4.0,2.0,...,,,Very uncomfortable,Slightly,Year 10,Non-FSM,Non-SEN,Ethnic minority,No,School D
2,2.0,3.0,4.0,1.0,1.0,1.0,1.0,1.0,5.0,4.0,...,,,Very comfortable,Not at all,Year 10,Non-FSM,Non-SEN,White British,No,School E
3,2.0,5.0,5.0,,2.0,2.0,,3.0,1.0,2.0,...,,,Uncomfortable,,Year 10,Non-FSM,Non-SEN,White British,No,School G
4,5.0,3.0,4.0,1.0,1.0,3.0,3.0,2.0,5.0,2.0,...,Slightly,Somewhat helpful,,Not at all,,Non-FSM,Non-SEN,White British,Yes,School B


## Modify some responses to be n<10

(This code has not been added to the package, as it is only temporary code used for the synthetic dashboards. It therefore duplicates code from the standard school dashboard.)

In [4]:
def drop_n_true(boolean_list, n):
    '''
    Replace the last n 'True' values in a sequence of booleans with 'False'.

    The input is left unmodified; a modified copy is returned. When the
    result is used as a mask to blank out responses, this means that n of the
    originally selected responses are left untouched.

    Parameters
    ----------
    boolean_list : list, numpy array or pandas Series
        True and False values. Must provide a .copy() method.
    n : integer
        Number of True to be replaced with False. If n is greater than or
        equal to the number of True values, every True is replaced. If n is
        zero or negative, nothing is replaced.

    Returns
    -------
    Same type as boolean_list
        Copy of the input in which the final n True values are False.
    '''
    # Work on a copy so the caller's object is not changed as a side effect
    result = boolean_list.copy()

    # Positions (not index labels) of every True value, in order
    true_positions = [
        position for position, value in enumerate(boolean_list) if value]

    # Number of True values that stay True (never below zero)
    n_stay_true = max(len(true_positions) - n, 0)

    # pandas objects are indexed by position through .iloc, so that a mask
    # with a non-default index is handled correctly
    positional = hasattr(result, 'iloc')

    # Switch every True beyond the first n_stay_true to False
    for position in true_positions[n_stay_true:]:
        if positional:
            result.iloc[position] = False
        else:
            result[position] = False
    return result

In [5]:
# Count each 'transgender' response (including missing) within school 1
in_school_1 = data['school'] == 1
transgender_counts = data.loc[in_school_1, 'transgender'].value_counts(
    dropna=False)
transgender_counts.sort_index()

transgender
1.0    24
2.0    24
3.0    20
4.0    29
5.0    29
NaN     2
Name: count, dtype: int64

In [6]:
# Number of school 1 responses to keep for each 'transgender' category:
# keep 5 for category 3, keep 1 for category 4, and none for category 5
n_to_keep = {3: 5, 4: 1, 5: 0}

for category, n_keep in n_to_keep.items():
    # Select every school 1 response in this category...
    mask = (data['school'] == 1) & (data['transgender'] == category)
    # ...then de-select n_keep of them so they are not blanked out below
    mask = drop_n_true(mask, n_keep)
    # Set the responses that are still selected to missing
    data.loc[mask, 'transgender'] = np.nan

# Show the counts for school 1 after the changes
data.loc[data['school'] == 1, 'transgender'].value_counts(
    dropna=False).sort_index()

transgender
1.0    24
2.0    24
3.0     5
4.0     1
NaN    74
Name: count, dtype: int64

## Aggregate data

In [7]:
# Columns to gather responses from, kept as two lists because missing values
# are labelled differently for each list (see below)
survey_col = [
    'gender', 'transgender', 'sexual_orientation', 'neurodivergent',
    'birth_parent1', 'birth_parent2', 'birth_you', 'young_carer',
    'care_experience']
council_col = [
    'year_group', 'fsm', 'sen', 'ethnicity', 'english_additional']
response_col = survey_col + council_col

# Dictionary with the response options for each variable
labels = create_response_label_dict()

# Label missing values (NaN) as 'No response' for the survey columns and as
# 'No data' for the council columns
for columns, missing_label in [(survey_col, 'No response'),
                               (council_col, 'No data')]:
    for col in columns:
        labels[col][np.nan] = missing_label

# Preview one example from each list
print(labels['birth_parent1'])
print(labels['year_group'])

{1: 'Yes', 2: 'No', 3: "I don't know", nan: 'No response'}
{8: 'Year 8', 10: 'Year 10', nan: 'No data'}


We don't need to use aggregate_demographic() as we are not doing school vs. all other schools comparisons - we just want to aggregate the entire dataframe, so we use aggregate_proportions() directly.

In [8]:
# Aggregate responses across the whole dataframe for every column listed in
# response_col
# NOTE(review): hide_low_response=True presumably suppresses results based on
# small counts - confirm against the package documentation
result = aggregate_proportions(
    data=data,
    response_col=response_col,
    labels=labels,
    hide_low_response=True)
result

Unnamed: 0,cat,cat_lab,count,percentage,measure,n_responses
0,"[1, 2, 3, 4, 5, 6, nan]","[Girl, Boy, Non-binary, I describe myself in a...","[111, 142, 101, 125, 122, 118, 81]","[13.875000000000002, 17.75, 12.625, 15.625, 15...",gender,800
0,"[1, 2, 3, 4, 5, nan]","[Yes, No, Prefer not to say, I describe myself...","[152, 149, 146, 138, 129, 86]","[19.0, 18.625, 18.25, 17.25, 16.125, 10.75]",transgender,800
0,"[1, 2, 3, 4, 5, 6, nan]","[Bi/pansexual, Gay/lesbian, Heterosexual/strai...","[121, 139, 130, 143, 133, 131, 3]","[15.125, 17.375, 16.25, 17.875, 16.625, 16.375...",sexual_orientation,800
0,"[1, 2, 3, nan]","[Yes, No, Unsure, No response]","[248, 197, 261, 94]","[31.0, 24.625, 32.625, 11.75]",neurodivergent,800
0,"[1, 2, 3, nan]","[Yes, No, I don't know, No response]","[249, 267, 249, 35]","[31.125000000000004, 33.375, 31.12500000000000...",birth_parent1,800
0,"[1, 2, 3, nan]","[Yes, No, I don't know, No response]","[276, 259, 234, 31]","[34.5, 32.375, 29.25, 3.875]",birth_parent2,800
0,"[1, 2, 3, nan]","[Yes, No, I don't know, No response]","[234, 278, 260, 28]","[29.25, 34.75, 32.5, 3.5000000000000004]",birth_you,800
0,"[0, 1, nan]","[No, Yes, No response]","[377, 389, 34]","[47.125, 48.625, 4.25]",young_carer,800
0,"[1, 0, 2, nan]","[Yes, No, Unsure, No response]","[249, 217, 245, 89]","[31.125000000000004, 27.125, 30.62500000000000...",care_experience,800
0,"[8, 10, nan]","[Year 8, Year 10, No data]","[416, 312, 72]","[52.0, 39.0, 9.0]",year_group,800


## Add labels and groups

In [9]:
# Pass the aggregated results through the package function that adds the
# groups (the preview shows a new 'plot_group' column)
result = result.pipe(add_standard_demographic_groups)
result.head()

Unnamed: 0,cat,cat_lab,count,percentage,measure,n_responses,plot_group
0,"[1, 2, 3, 4, 5, 6, nan]","[Girl, Boy, Non-binary, I describe myself in a...","[111, 142, 101, 125, 122, 118, 81]","[13.875000000000002, 17.75, 12.625, 15.625, 15...",gender,800,gender
0,"[1, 2, 3, 4, 5, nan]","[Yes, No, Prefer not to say, I describe myself...","[152, 149, 146, 138, 129, 86]","[19.0, 18.625, 18.25, 17.25, 16.125, 10.75]",transgender,800,gender
0,"[1, 2, 3, 4, 5, 6, nan]","[Bi/pansexual, Gay/lesbian, Heterosexual/strai...","[121, 139, 130, 143, 133, 131, 3]","[15.125, 17.375, 16.25, 17.875, 16.625, 16.375...",sexual_orientation,800,sexual_orientation
0,"[1, 2, 3, nan]","[Yes, No, Unsure, No response]","[248, 197, 261, 94]","[31.0, 24.625, 32.625, 11.75]",neurodivergent,800,neuro
0,"[1, 2, 3, nan]","[Yes, No, I don't know, No response]","[249, 267, 249, 35]","[31.125000000000004, 33.375, 31.12500000000000...",birth_parent1,800,birth


In [10]:
# Pass the results through the package function that adds the labels (the
# preview shows a new 'measure_lab' column)
result = result.pipe(add_standard_demographic_response_labels)
result.head()

Unnamed: 0,cat,cat_lab,count,percentage,measure,n_responses,plot_group,measure_lab
0,"[1, 2, 3, 4, 5, 6, nan]","[Girl, Boy, Non-binary, I describe myself in a...","[111, 142, 101, 125, 122, 118, 81]","[13.875000000000002, 17.75, 12.625, 15.625, 15...",gender,800,gender,Gender
0,"[1, 2, 3, 4, 5, nan]","[Yes, No, Prefer not to say, I describe myself...","[152, 149, 146, 138, 129, 86]","[19.0, 18.625, 18.25, 17.25, 16.125, 10.75]",transgender,800,gender,Do you consider yourself to be transgender?
0,"[1, 2, 3, 4, 5, 6, nan]","[Bi/pansexual, Gay/lesbian, Heterosexual/strai...","[121, 139, 130, 143, 133, 131, 3]","[15.125, 17.375, 16.25, 17.875, 16.625, 16.375...",sexual_orientation,800,sexual_orientation,Sexual orientation
0,"[1, 2, 3, nan]","[Yes, No, Unsure, No response]","[248, 197, 261, 94]","[31.0, 24.625, 32.625, 11.75]",neurodivergent,800,neuro,Do you identify as neurodivergent?
0,"[1, 2, 3, nan]","[Yes, No, I don't know, No response]","[249, 267, 249, 35]","[31.125000000000004, 33.375, 31.12500000000000...",birth_parent1,800,birth,Was birth parent 1 born outside the UK?


## Save results to csv

In [11]:
# Write the aggregated results to the survey folder, without the index and
# with missing values written as 'NULL'
aggregate_path = os.path.join(paths.survey, paths.aggregate)
result.to_csv(aggregate_path, index=False, na_rep='NULL')