# Aggregate responses

Aggregate the synthetic person-level data to find the proportion who gave each response to each question.

<mark>Note: this duplicates much of the processing used for the standard survey.</mark>

## Set-up

### Packages and file paths

In [1]:
# Import required packages
from dataclasses import dataclass
from kailo_beewell_dashboard.synthesise_responses import (
    add_response_labels, add_topic_groups, aggregate_standard_responses)
import os
import pandas as pd

In [2]:
# File paths
@dataclass(frozen=True)
class Paths:
    '''Stores paths to data and files.

    Each attribute is type-annotated so that it is registered as a real
    dataclass field. Without annotations, @dataclass treats them as plain
    class attributes: the generated __init__, __repr__ and __eq__ ignore
    them. With annotations, the defaults are unchanged (Paths() behaves as
    before), the repr shows the paths in use, and a path can be overridden
    at construction time, e.g. Paths(survey='other/dir').
    '''
    # Directory containing the survey data files (relative path)
    survey: str = '../../data/survey_data'
    # Input file name (inside `survey`): raw synthetic person-level responses
    synthetic_data: str = 'standard_synthetic_data_raw_msoa.csv'
    # Output file name (inside `survey`): aggregated responses
    aggregate: str = 'standard_nd_aggregate_responses.csv'


paths = Paths()

### Import raw data

In [3]:
# Build the full path to the raw synthetic data, load it, and preview it
synthetic_csv = os.path.join(paths.survey, paths.synthetic_data)
data = pd.read_csv(synthetic_csv)
data.head()

Unnamed: 0,gender,transgender,sexual_orientation,neurodivergent,birth_parent1,birth_parent2,birth_you,birth_you_age,autonomy_pressure,autonomy_express,...,peer_talk_helpful_lab,peer_talk_if_lab,accept_peer_lab,year_group_lab,fsm_lab,sen_lab,ethnicity_lab,english_additional_lab,school_lab,msoa
0,4.0,2.0,6.0,3.0,2.0,,2.0,1.0,2.0,5.0,...,Somewhat helpful,,Not at all,Year 10,Non-FSM,Non-SEN,Ethnic minority,,School E,Torridge 001
1,,2.0,1.0,3.0,3.0,2.0,3.0,8.0,4.0,2.0,...,,Very uncomfortable,Slightly,Year 10,Non-FSM,Non-SEN,Ethnic minority,No,School D,North Devon 001
2,2.0,3.0,4.0,1.0,1.0,1.0,1.0,1.0,5.0,4.0,...,,Very comfortable,Not at all,Year 10,Non-FSM,Non-SEN,White British,No,School E,North Devon 007
3,2.0,5.0,5.0,,2.0,2.0,,3.0,1.0,2.0,...,,Uncomfortable,,Year 10,Non-FSM,Non-SEN,White British,No,School G,North Devon 006
4,5.0,3.0,4.0,1.0,1.0,3.0,3.0,2.0,5.0,2.0,...,Somewhat helpful,,Not at all,,Non-FSM,Non-SEN,White British,Yes,School B,Torridge 003


## Find the proportion giving each response to each measure, within a given group

In [4]:
# Give every row the same site label, so that the aggregation below treats
# the whole of Northern Devon as a single group
data = data.assign(site='Northern Devon')

In [5]:
# Aggregate the person-level responses, using the constant 'site' column
# as the site identifier, then preview the first rows
result = aggregate_standard_responses(
    site_col='site',
    df=data,
)
result.head()

Unnamed: 0,cat,cat_lab,count,percentage,measure,n_responses,site,year_group_lab,gender_lab,fsm_lab,sen_lab
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...","[140, 124, 144, 152, 146, 94]","[17.5, 15.5, 18.0, 19.0, 18.25, 11.75]",autonomy_pressure,800.0,Northern Devon,All,All,All,All
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...","[159, 173, 145, 171, 139, 13]","[19.875, 21.625, 18.125, 21.375, 17.375, 1.625]",autonomy_express,800.0,Northern Devon,All,All,All,All
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...","[151, 130, 131, 155, 147, 86]","[18.875, 16.25, 16.375, 19.375, 18.375, 10.75]",autonomy_decide,800.0,Northern Devon,All,All,All,All
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...","[155, 142, 131, 131, 147, 94]","[19.375, 17.75, 16.375, 16.375, 18.375, 11.75]",autonomy_told,800.0,Northern Devon,All,All,All,All
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...","[144, 152, 148, 142, 145, 69]","[18.0, 19.0, 18.5, 17.75, 18.125, 8.625]",autonomy_myself,800.0,Northern Devon,All,All,All,All


## Add groups for each measure

In [6]:
# Attach the topic group for each measure (shown as the 'group' column in
# the preview), overwriting `result` with the returned dataframe
result = result.pipe(add_topic_groups)
result.head()

Unnamed: 0,cat,cat_lab,count,percentage,measure,n_responses,site,year_group_lab,gender_lab,fsm_lab,sen_lab,group
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...","[140, 124, 144, 152, 146, 94]","[17.5, 15.5, 18.0, 19.0, 18.25, 11.75]",autonomy_pressure,800.0,Northern Devon,All,All,All,All,autonomy
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...","[159, 173, 145, 171, 139, 13]","[19.875, 21.625, 18.125, 21.375, 17.375, 1.625]",autonomy_express,800.0,Northern Devon,All,All,All,All,autonomy
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...","[151, 130, 131, 155, 147, 86]","[18.875, 16.25, 16.375, 19.375, 18.375, 10.75]",autonomy_decide,800.0,Northern Devon,All,All,All,All,autonomy
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...","[155, 142, 131, 131, 147, 94]","[19.375, 17.75, 16.375, 16.375, 18.375, 11.75]",autonomy_told,800.0,Northern Devon,All,All,All,All,autonomy
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...","[144, 152, 148, 142, 145, 69]","[18.0, 19.0, 18.5, 17.75, 18.125, 8.625]",autonomy_myself,800.0,Northern Devon,All,All,All,All,autonomy


## Add labels for each measure

In [7]:
# Attach the label for each measure (shown as the 'measure_lab' column in
# the preview), overwriting `result` with the returned dataframe
result = result.pipe(add_response_labels)
result.head()

Unnamed: 0,cat,cat_lab,count,percentage,measure,n_responses,site,year_group_lab,gender_lab,fsm_lab,sen_lab,group,measure_lab
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...","[140, 124, 144, 152, 146, 94]","[17.5, 15.5, 18.0, 19.0, 18.25, 11.75]",autonomy_pressure,800.0,Northern Devon,All,All,All,All,autonomy,\nI feel pressured in my life
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...","[159, 173, 145, 171, 139, 13]","[19.875, 21.625, 18.125, 21.375, 17.375, 1.625]",autonomy_express,800.0,Northern Devon,All,All,All,All,autonomy,\nI generally feel free to express my ideas an...
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...","[151, 130, 131, 155, 147, 86]","[18.875, 16.25, 16.375, 19.375, 18.375, 10.75]",autonomy_decide,800.0,Northern Devon,All,All,All,All,autonomy,\nI feel like I am free to decide for myself h...
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...","[155, 142, 131, 131, 147, 94]","[19.375, 17.75, 16.375, 16.375, 18.375, 11.75]",autonomy_told,800.0,Northern Devon,All,All,All,All,autonomy,\nIn my daily life I often have to do what I a...
0,"[1, 2, 3, 4, 5, nan]","[1 - Completely not true, 2, 3, 4, 5 - Complet...","[144, 152, 148, 142, 145, 69]","[18.0, 19.0, 18.5, 17.75, 18.125, 8.625]",autonomy_myself,800.0,Northern Devon,All,All,All,All,autonomy,\nI feel I can pretty much be myself in daily ...


In [8]:
# For each column of the result, show its dtype (column 0) and whether it
# contains any missing values (column 1)
summary_rows = [result.dtypes, result.isna().any()]
pd.DataFrame(summary_rows).T

Unnamed: 0,0,1
cat,object,False
cat_lab,object,False
count,object,False
percentage,object,False
measure,object,False
n_responses,float64,False
site,object,False
year_group_lab,object,False
gender_lab,object,False
fsm_lab,object,False


## Save to csv

In [9]:
# Write the aggregated responses next to the input data, without the index
# column and with missing values written as the literal text NULL
aggregate_csv = os.path.join(paths.survey, paths.aggregate)
result.to_csv(aggregate_csv, na_rep='NULL', index=False)