# Aggregate responses

Aggregate the synthetic person-level data to find the proportion of pupils who gave each response to each question, within each school and pupil group.

## Set-up

### Packages and file paths

In [1]:
# Import required packages
from collections import defaultdict
from dataclasses import dataclass
from IPython.display import display
from kailo_beewell_dashboard.synthesise_responses import (
    add_response_labels, add_topic_groups, aggregate_standard_responses)
import os
import pandas as pd

In [2]:
# File paths
@dataclass(frozen=True)
class Paths:
    '''Stores paths to data and files.

    Every attribute carries a type annotation so that it is registered as a
    real dataclass field (attributes without annotations are ignored by
    @dataclass). As a result the paths show up in the repr, take part in
    equality comparisons, and can be overridden when creating an instance,
    e.g. Paths(survey='/some/other/folder'). Instances are frozen, so the
    attributes cannot be reassigned after creation.
    '''
    # Folder that holds the survey data files
    survey: str = '../data/survey_data'
    # File name of the raw synthetic person-level data (inside `survey`)
    synthetic_data: str = 'standard_synthetic_data_raw.csv'
    # File name of the aggregated responses (inside `survey`)
    aggregate: str = 'standard_school_aggregate_responses.csv'


paths = Paths()

### Import raw data

In [3]:
# Read the raw synthetic person-level data from the survey data folder
data = pd.read_csv(
    filepath_or_buffer=os.path.join(paths.survey, paths.synthetic_data))
# Preview the first five rows
data.head(5)

Unnamed: 0,gender,transgender,sexual_orientation,neurodivergent,birth_parent1,birth_parent2,birth_you,birth_you_age,autonomy_pressure,autonomy_express,...,peer_talk_listen_lab,peer_talk_helpful_lab,peer_talk_if_lab,accept_peer_lab,year_group_lab,fsm_lab,sen_lab,ethnicity_lab,english_additional_lab,school_lab
0,4.0,2.0,6.0,3.0,2.0,,2.0,1.0,2.0,5.0,...,Fully,Somewhat helpful,,Not at all,Year 10,Non-FSM,Non-SEN,Ethnic minority,,School E
1,,2.0,1.0,3.0,3.0,2.0,3.0,8.0,4.0,2.0,...,,,Very uncomfortable,Slightly,Year 10,Non-FSM,Non-SEN,Ethnic minority,No,School D
2,2.0,3.0,4.0,1.0,1.0,1.0,1.0,1.0,5.0,4.0,...,,,Very comfortable,Not at all,Year 10,Non-FSM,Non-SEN,White British,No,School E
3,2.0,5.0,5.0,,2.0,2.0,,3.0,1.0,2.0,...,,,Uncomfortable,,Year 10,Non-FSM,Non-SEN,White British,No,School G
4,5.0,3.0,4.0,1.0,1.0,3.0,3.0,2.0,5.0,2.0,...,Slightly,Somewhat helpful,,Not at all,,Non-FSM,Non-SEN,White British,Yes,School B


## Find the proportion giving each response to each measure, within a given group

In [4]:
# Aggregate the person-level responses, passing 'school_lab' as the column
# that identifies each site
result = aggregate_standard_responses(
    data,
    site_col='school_lab')

# Show the first five rows of the aggregated dataframe
result.head(5)

Unnamed: 0,cat,cat_lab,count,percentage,measure,n_responses,school_lab,year_group_lab,gender_lab,fsm_lab,sen_lab
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...","[20, 27, 22, 21, 22, 16]","[15.625, 21.09375, 17.1875, 16.40625, 17.1875,...",autonomy_pressure,128.0,School A,All,All,All,All
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...","[26, 29, 21, 23, 26, 3]","[20.3125, 22.65625, 16.40625, 17.96875, 20.312...",autonomy_express,128.0,School A,All,All,All,All
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...","[23, 23, 17, 32, 20, 13]","[17.96875, 17.96875, 13.28125, 25.0, 15.625, 1...",autonomy_decide,128.0,School A,All,All,All,All
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...","[26, 24, 15, 30, 24, 9]","[20.3125, 18.75, 11.71875, 23.4375, 18.75, 7.0...",autonomy_told,128.0,School A,All,All,All,All
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...","[31, 22, 22, 19, 21, 13]","[24.21875, 17.1875, 17.1875, 14.84375, 16.4062...",autonomy_myself,128.0,School A,All,All,All,All


In [5]:
# Check the school that was set up to have no SEN pupils (School B).
# Its SEN rows should contain NaN, because the count of 0 is below 10.
result.loc[
    (result['sen_lab'] == 'SEN') & (result['school_lab'] == 'School B')]

Unnamed: 0,cat,cat_lab,count,percentage,measure,n_responses,school_lab,year_group_lab,gender_lab,fsm_lab,sen_lab
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...",,,autonomy_pressure,,School B,All,All,All,SEN
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...",,,autonomy_express,,School B,All,All,All,SEN
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...",,,autonomy_decide,,School B,All,All,All,SEN
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...",,,autonomy_told,,School B,All,All,All,SEN
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...",,,autonomy_myself,,School B,All,All,All,SEN
...,...,...,...,...,...,...,...,...,...,...,...
0,"[0, 1, nan]","[No, Yes, No response]",,,peer_talk,,School B,All,All,All,SEN
0,"[1, 2, 3, 4, nan]","[Not at all, Slightly, Mostly, Fully, No respo...",,,peer_talk_listen,,School B,All,All,All,SEN
0,"[1, 2, 3, nan]","[Not helpful, Somewhat helpful, Very helpful, ...",,,peer_talk_helpful,,School B,All,All,All,SEN
0,"[1, 2, 3, 4, nan]","[Very uncomfortable, Uncomfortable, Comfortabl...",,,peer_talk_if,,School B,All,All,All,SEN


In [6]:
# Illustrate how the branching question is handled by comparing counts:
# first the raw School A answers to 'home_talk' and 'home_talk_helpful',
# then the first aggregated row for 'home_talk_helpful'.
display(
    (data
     .loc[data['school_lab'] == 'School A',
          ['home_talk', 'home_talk_helpful']]
     .value_counts(dropna=False)
     .sort_index()
     .reset_index()),
    result[result['measure'] == 'home_talk_helpful'].head(1))

Unnamed: 0,home_talk,home_talk_helpful,count
0,0.0,,64
1,1.0,1.0,25
2,1.0,2.0,14
3,1.0,3.0,16
4,1.0,,3
5,,,6


Unnamed: 0,cat,cat_lab,count,percentage,measure,n_responses,school_lab,year_group_lab,gender_lab,fsm_lab,sen_lab
0,"[1, 2, 3, nan]","[Not helpful, Somewhat helpful, Very helpful, ...","[25, 14, 16, 3]","[43.103448275862064, 24.137931034482758, 27.58...",home_talk_helpful,58.0,School A,All,All,All,All


## Add groups for each measure

In [7]:
# Attach the topic group for each measure, then show the first five rows
result = add_topic_groups(result)
result.head(5)

Unnamed: 0,cat,cat_lab,count,percentage,measure,n_responses,school_lab,year_group_lab,gender_lab,fsm_lab,sen_lab,group
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...","[20, 27, 22, 21, 22, 16]","[15.625, 21.09375, 17.1875, 16.40625, 17.1875,...",autonomy_pressure,128.0,School A,All,All,All,All,autonomy
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...","[26, 29, 21, 23, 26, 3]","[20.3125, 22.65625, 16.40625, 17.96875, 20.312...",autonomy_express,128.0,School A,All,All,All,All,autonomy
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...","[23, 23, 17, 32, 20, 13]","[17.96875, 17.96875, 13.28125, 25.0, 15.625, 1...",autonomy_decide,128.0,School A,All,All,All,All,autonomy
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...","[26, 24, 15, 30, 24, 9]","[20.3125, 18.75, 11.71875, 23.4375, 18.75, 7.0...",autonomy_told,128.0,School A,All,All,All,All,autonomy
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...","[31, 22, 22, 19, 21, 13]","[24.21875, 17.1875, 17.1875, 14.84375, 16.4062...",autonomy_myself,128.0,School A,All,All,All,All,autonomy


## Add labels for each measure

In [8]:
# Attach the label for each measure, then show the first five rows
result = add_response_labels(result)
result.head(5)

Unnamed: 0,cat,cat_lab,count,percentage,measure,n_responses,school_lab,year_group_lab,gender_lab,fsm_lab,sen_lab,group,measure_lab
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...","[20, 27, 22, 21, 22, 16]","[15.625, 21.09375, 17.1875, 16.40625, 17.1875,...",autonomy_pressure,128.0,School A,All,All,All,All,autonomy,\nI feel pressured in my life
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...","[26, 29, 21, 23, 26, 3]","[20.3125, 22.65625, 16.40625, 17.96875, 20.312...",autonomy_express,128.0,School A,All,All,All,All,autonomy,\nI generally feel free to express my ideas an...
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...","[23, 23, 17, 32, 20, 13]","[17.96875, 17.96875, 13.28125, 25.0, 15.625, 1...",autonomy_decide,128.0,School A,All,All,All,All,autonomy,\nI feel like I am free to decide for myself h...
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...","[26, 24, 15, 30, 24, 9]","[20.3125, 18.75, 11.71875, 23.4375, 18.75, 7.0...",autonomy_told,128.0,School A,All,All,All,All,autonomy,\nIn my daily life I often have to do what I a...
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...","[31, 22, 22, 19, 21, 13]","[24.21875, 17.1875, 17.1875, 14.84375, 16.4062...",autonomy_myself,128.0,School A,All,All,All,All,autonomy,\nI feel I can pretty much be myself in daily ...


In [9]:
# Tabulate, for every column, its data type and whether it has any nulls
types, null = result.dtypes, result.isnull().any()
pd.DataFrame([types, null]).T

Unnamed: 0,0,1
cat,object,False
cat_lab,object,False
count,object,True
percentage,object,True
measure,object,False
n_responses,float64,True
school_lab,object,False
year_group_lab,object,False
gender_lab,object,False
fsm_lab,object,False


## Save to csv

In [10]:
# Write the aggregated responses to the survey data folder, without the
# index and with missing values written as 'NULL'
result.to_csv(
    path_or_buf=os.path.join(paths.survey, paths.aggregate),
    na_rep='NULL',
    index=False)