# Aggregate responses

Aggregate the synthetic person-level data to find the proportion who gave each response to each question.

## Set-up

### Packages and file paths

In [1]:
# Import required packages
from dataclasses import dataclass
import numpy as np
import os
import pandas as pd

In [2]:
# File paths
@dataclass(frozen=True)
class Paths:
    '''
    Stores paths to data and files.

    Each attribute is annotated so that it is registered as a genuine
    dataclass field (un-annotated class attributes are ignored by
    @dataclass). The defaults can therefore be overridden when constructing
    an instance, e.g. Paths(survey='other_folder'), while Paths() keeps the
    values below.
    '''
    # Folder containing the survey data files
    survey: str = '../data/survey_data'
    # File name of the raw synthetic data, within the survey folder
    synthetic_data: str = 'synthetic_data_raw.csv'
    # File name of the aggregated responses, within the survey folder
    aggregate: str = 'aggregate_responses.csv'


paths = Paths()

### Import raw data

In [3]:
# Load the raw synthetic data from the survey folder and preview it
raw_data_path = os.path.join(paths.survey, paths.synthetic_data)
data = pd.read_csv(raw_data_path)
data.head()

Unnamed: 0,gender,transgender,sexual_orientation,neurodivergent,birth_parent1,birth_parent2,birth_you,birth_you_age,autonomy_pressure,autonomy_express,...,peer_talk_listen_lab,peer_talk_helpful_lab,peer_talk_if_lab,accept_peer_lab,year_group_lab,fsm_lab,sen_lab,ethnicity_lab,english_additional_lab,school_lab
0,4.0,2.0,6.0,3.0,2.0,1.0,2.0,1.0,,5.0,...,Fully,Somewhat helpful,Very uncomfortable,Not at all,Year 10,Non-FSM,Non-SEN,Ethnic minority,No,School E
1,1.0,2.0,1.0,3.0,3.0,2.0,3.0,8.0,4.0,2.0,...,Mostly,Very helpful,Very uncomfortable,Slightly,Year 10,Non-FSM,Non-SEN,Ethnic minority,No,
2,2.0,,4.0,,,1.0,1.0,1.0,5.0,4.0,...,,Very helpful,Very comfortable,Not at all,Year 10,Non-FSM,Non-SEN,White British,No,School E
3,2.0,5.0,5.0,2.0,2.0,2.0,1.0,3.0,,2.0,...,Fully,Somewhat helpful,Uncomfortable,Mostly,Year 10,Non-FSM,Non-SEN,White British,No,School G
4,5.0,3.0,4.0,1.0,1.0,3.0,3.0,2.0,5.0,2.0,...,Slightly,Somewhat helpful,Uncomfortable,Not at all,Year 8,Non-FSM,Non-SEN,,Yes,School B


In [4]:
# List the column names of the raw data
data.columns

Index(['gender', 'transgender', 'sexual_orientation', 'neurodivergent',
       'birth_parent1', 'birth_parent2', 'birth_you', 'birth_you_age',
       'autonomy_pressure', 'autonomy_express',
       ...
       'peer_talk_listen_lab', 'peer_talk_helpful_lab', 'peer_talk_if_lab',
       'accept_peer_lab', 'year_group_lab', 'fsm_lab', 'sen_lab',
       'ethnicity_lab', 'english_additional_lab', 'school_lab'],
      dtype='object', length=281)

## Find the proportion giving each response to each measure, within a given group

In [5]:
# Identify the label columns to count responses for: every column with the
# '_lab' suffix, apart from the school label
response_col = list(filter(lambda name: name.endswith('_lab'), data.columns))
response_col.remove('school_lab')


def aggregate(dataset, columns=None):
    '''
    Finds the proportion of pupils giving each response to each measure.

    For every label column, counts each combination of numeric code and
    label (including missing values), converts the counts to percentages of
    all rows in the dataset, and stores the results as lists in a single row
    per measure.

    Inputs:
    - dataset - dataframe, containing each label column (ending '_lab') and
      its numeric version (same name without the '_lab' suffix)
    - columns - optional list of label columns to aggregate. If not provided,
      uses the notebook-level response_col list

    Returns:
    - dataframe with one row per measure and the columns 'cat', 'cat_lab',
      'count' and 'percentage' (each holding a list, ordered by numeric code
      with missing last), 'measure' (name of the numeric column) and
      'n_responses' (number of non-missing numeric responses)

    Raises:
    - ValueError - if a requested column does not end with '_lab'
    '''
    suffix = '_lab'

    # Fall back on the notebook-level list of label columns
    if columns is None:
        columns = response_col

    # Initialise list to store the counts for each measure
    responses = []

    # Look through each of the columns of interest
    for col in columns:

        # Find the name of the numeric version of the column by removing the
        # suffix only (str.replace would also remove any '_lab' found
        # elsewhere in the name)
        if not col.endswith(suffix):
            raise ValueError(
                f"Expected a label column ending '{suffix}', got '{col}'")
        numeric = col[:-len(suffix)]

        # Count the number of pupils with each response for that column,
        # keeping missing responses as their own category
        df = (dataset[[numeric, col]]
              .value_counts(dropna=False)
              .reset_index(name='count'))
        df[col] = df[col].fillna('Missing')

        # Convert to percentages (the denominator includes missing)
        df['percentage'] = (df['count'] / df['count'].sum() * 100).round(1)

        # Reformat dataframe, sorting by the numeric version of the column,
        # and renaming the categories with generic names
        df = df.sort_values(by=numeric)
        df = df.rename(columns={
            numeric: 'cat',
            col: 'cat_lab'})

        # Convert to series with each column as a list and add name of measure
        series = pd.Series(df.to_dict(orient='list'))
        series['measure'] = numeric

        # Count number of non-NaN responses for that column
        series['n_responses'] = dataset[numeric].count()

        # Append to list as a single-row dataframe
        responses.append(series.to_frame().T)

    # Combine into a single dataframe and return
    return pd.concat(responses)

In [6]:
# Build the list of groupings: every grouping includes the school, either
# on its own or paired with exactly one of the filters
filters = ['year_group_lab', 'gender_lab', 'fsm_lab', 'sen_lab']
groups = [['school_lab']] + [['school_lab', extra] for extra in filters]
groups

[['school_lab'],
 ['school_lab', 'year_group_lab'],
 ['school_lab', 'gender_lab'],
 ['school_lab', 'fsm_lab'],
 ['school_lab', 'sen_lab']]

In [7]:
res_list = []

# For each of the grouping methods
for grouping in groups:
    # Group the dataframe and loop through those subsets of the dataframe
    for group_name, df_group in data.groupby(grouping):
        # Older pandas versions yield a scalar key (rather than a 1-tuple)
        # when grouping by a list of length one. Normalise to a tuple so a
        # string key is never indexed character-by-character below
        if not isinstance(group_name, tuple):
            group_name = (group_name,)
        # Perform aggregation
        res = aggregate(df_group)
        # Save name of group for that filter (e.g. school = school A)
        for column, value in zip(grouping, group_name):
            res[column] = value
        # Append result to list
        res_list.append(res)

# Combine results from list into a single dataframe
result = pd.concat(res_list)

In [8]:
# Rows with no value in a filter column are labelled 'All' for that filter
filter_labels = result[filters].fillna('All')
result[filters] = filter_labels

In [9]:
# Suppress the counts, percentages and number of responses for any row
# with fewer than 10 responses
too_few_responses = result['n_responses'] < 10
suppressed_columns = ['count', 'percentage', 'n_responses']
result.loc[too_few_responses, suppressed_columns] = np.nan

In [10]:
# Preview the combined dataframe of aggregated results
result

Unnamed: 0,cat,cat_lab,count,percentage,measure,n_responses,school_lab,year_group_lab,gender_lab,fsm_lab,sen_lab
0,"[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, nan]","[Girl, Boy, Non-binary, I describe myself in a...","[14, 24, 18, 17, 12, 16, 11]","[12.5, 21.4, 16.1, 15.2, 10.7, 14.3, 9.8]",gender,101,School A,All,All,All,All
0,"[1.0, 2.0, 3.0, 4.0, 5.0, nan]","[Yes, No, Prefer not to say, I describe myself...","[23, 16, 14, 25, 27, 7]","[20.5, 14.3, 12.5, 22.3, 24.1, 6.2]",transgender,105,School A,All,All,All,All
0,"[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, nan]","[Bi/pansexual, Gay/lesbian, Heterosexual/strai...","[14, 23, 12, 18, 27, 15, 3]","[12.5, 20.5, 10.7, 16.1, 24.1, 13.4, 2.7]",sexual_orientation,109,School A,All,All,All,All
0,"[1.0, 2.0, 3.0, nan]","[Yes, No, Unsure, Missing]","[42, 29, 30, 11]","[37.5, 25.9, 26.8, 9.8]",neurodivergent,101,School A,All,All,All,All
0,"[1.0, 2.0, 3.0, nan]","[Yes, No, I don't know, Missing]","[35, 40, 34, 3]","[31.2, 35.7, 30.4, 2.7]",birth_parent1,109,School A,All,All,All,All
...,...,...,...,...,...,...,...,...,...,...,...
0,"[8.0, 10.0]","[Year 8, Year 10]","[22, 19]","[53.7, 46.3]",year_group,41,School G,All,All,All,SEN
0,"[0.0, 1.0, nan]","[Non-FSM, FSM, Missing]","[14, 25, 2]","[34.1, 61.0, 4.9]",fsm,39,School G,All,All,All,SEN
0,[1.0],[SEN],[41],[100.0],sen,41,School G,All,All,All,SEN
0,"[1.0, 2.0, nan]","[Ethnic minority, White British, Missing]","[12, 24, 5]","[29.3, 58.5, 12.2]",ethnicity,36,School G,All,All,All,SEN


## Save to csv

In [11]:
# Write the aggregated results to the survey folder, without the index
aggregate_path = os.path.join(paths.survey, paths.aggregate)
result.to_csv(aggregate_path, index=False)

## Try plotting from that data

This section will not be kept here permanently - it is included in the first instance as a test that the format produced by the aggregation is easy to work with.

We want to produce a dataframe where each row has a cat_lab, a percentage, a measure and a count.

<mark>Look at whether the way GM stored their data would allow more efficient extraction</mark>

In [12]:
from ast import literal_eval
import plotly.express as px

In [13]:
# Read the saved aggregate results back in from csv and preview them
saved_aggregate_path = os.path.join(paths.survey, paths.aggregate)
res_csv = pd.read_csv(saved_aggregate_path)
res_csv.head()

Unnamed: 0,cat,cat_lab,count,percentage,measure,n_responses,school_lab,year_group_lab,gender_lab,fsm_lab,sen_lab
0,"[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, nan]","['Girl', 'Boy', 'Non-binary', 'I describe myse...","[14, 24, 18, 17, 12, 16, 11]","[12.5, 21.4, 16.1, 15.2, 10.7, 14.3, 9.8]",gender,101.0,School A,All,All,All,All
1,"[1.0, 2.0, 3.0, 4.0, 5.0, nan]","['Yes', 'No', 'Prefer not to say', 'I describe...","[23, 16, 14, 25, 27, 7]","[20.5, 14.3, 12.5, 22.3, 24.1, 6.2]",transgender,105.0,School A,All,All,All,All
2,"[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, nan]","['Bi/pansexual', 'Gay/lesbian', 'Heterosexual/...","[14, 23, 12, 18, 27, 15, 3]","[12.5, 20.5, 10.7, 16.1, 24.1, 13.4, 2.7]",sexual_orientation,109.0,School A,All,All,All,All
3,"[1.0, 2.0, 3.0, nan]","['Yes', 'No', 'Unsure', 'Missing']","[42, 29, 30, 11]","[37.5, 25.9, 26.8, 9.8]",neurodivergent,101.0,School A,All,All,All,All
4,"[1.0, 2.0, 3.0, nan]","['Yes', 'No', ""I don't know"", 'Missing']","[35, 40, 34, 3]","[31.2, 35.7, 30.4, 2.7]",birth_parent1,109.0,School A,All,All,All,All


In [14]:
# Select the four stress measures for School A, keeping only the rows
# where every filter is set to 'All'
stress_measures = ['stress_control', 'stress_overcome',
                   'stress_confident', 'stress_way']
is_stress_measure = res_csv['measure'].isin(stress_measures)
is_school_a = res_csv['school_lab'] == 'School A'
is_unfiltered = (
    (res_csv['year_group_lab'] == 'All') &
    (res_csv['gender_lab'] == 'All') &
    (res_csv['fsm_lab'] == 'All') &
    (res_csv['sen_lab'] == 'All'))
chosen = res_csv[is_stress_measure & is_school_a & is_unfiltered]
chosen

Unnamed: 0,cat,cat_lab,count,percentage,measure,n_responses,school_lab,year_group_lab,gender_lab,fsm_lab,sen_lab
31,"[1.0, 2.0, 3.0, 4.0, 5.0, nan]","['Never', 'Almost Never', 'Sometimes', 'Fairly...","[22, 22, 21, 18, 20, 9]","[19.6, 19.6, 18.8, 16.1, 17.9, 8.0]",stress_control,103.0,School A,All,All,All,All
32,"[1.0, 2.0, 3.0, 4.0, 5.0, nan]","['Never', 'Almost Never', 'Sometimes', 'Fairly...","[18, 22, 23, 25, 19, 5]","[16.1, 19.6, 20.5, 22.3, 17.0, 4.5]",stress_overcome,107.0,School A,All,All,All,All
33,"[1.0, 2.0, 3.0, 4.0, 5.0, nan]","['Never', 'Almost Never', 'Sometimes', 'Fairly...","[25, 15, 24, 22, 20, 6]","[22.3, 13.4, 21.4, 19.6, 17.9, 5.4]",stress_confident,106.0,School A,All,All,All,All
34,"[1.0, 2.0, 3.0, 4.0, 5.0, nan]","['Never', 'Almost Never', 'Sometimes', 'Fairly...","[24, 28, 21, 21, 17, 1]","[21.4, 25.0, 18.8, 18.8, 15.2, 0.9]",stress_way,111.0,School A,All,All,All,All


In [15]:
# The list-valued columns are stored as strings in the csv, so parse each
# back into a list and expand it so that there is one row per response
# category, then label those rows with the measure they belong to
list_columns = ['cat_lab', 'percentage', 'count']
df_list = []
for _, row in chosen.iterrows():
    parsed_lists = [literal_eval(row[name]) for name in list_columns]
    long_rows = pd.DataFrame(list(zip(*parsed_lists)), columns=list_columns)
    long_rows['measure'] = row['measure']
    df_list.append(long_rows)

# Stack the per-measure dataframes into one
chosen_result = pd.concat(df_list)
chosen_result

Unnamed: 0,cat_lab,percentage,count,measure
0,Never,19.6,22,stress_control
1,Almost Never,19.6,22,stress_control
2,Sometimes,18.8,21,stress_control
3,Fairly Often,16.1,18,stress_control
4,Very Often,17.9,20,stress_control
5,Missing,8.0,9,stress_control
0,Never,16.1,18,stress_overcome
1,Almost Never,19.6,22,stress_overcome
2,Sometimes,20.5,23,stress_overcome
3,Fairly Often,22.3,25,stress_overcome


In [16]:
# Horizontal bar chart with one bar per measure, coloured by response
# category, showing the percentage as text and the count on hover
px.bar(
    chosen_result,
    y='measure',
    x='percentage',
    orientation='h',
    color='cat_lab',
    hover_data=['count'],
    text_auto=True,
    title='Stress')