# Aggregate responses

Aggregate the synthetic person-level data to find the proportion who gave each response to each question.

## Set-up

### Packages and file paths

In [1]:
# Import required packages
from dataclasses import dataclass
import numpy as np
import os
import pandas as pd

In [2]:
# File paths
@dataclass(frozen=True)
class Paths:
    '''
    Stores paths to data and files.

    The attributes are annotated so that they are registered as dataclass
    fields (un-annotated class attributes are ignored by @dataclass). This
    means they appear in the repr and can be overridden when creating an
    instance, e.g. Paths(survey='another/folder').
    '''
    # Folder containing the survey data files
    survey: str = '../data/survey_data'
    # File name of the raw synthetic data (read from the survey folder)
    synthetic_data: str = 'synthetic_data_raw.csv'
    # File name of the aggregated responses (saved to the survey folder)
    aggregate: str = 'aggregate_responses.csv'


paths = Paths()

### Import raw data

In [3]:
# Load the raw synthetic data from the survey data folder and preview it
raw_data_path = os.path.join(paths.survey, paths.synthetic_data)
data = pd.read_csv(raw_data_path)
data.head()

Unnamed: 0,gender,transgender,sexual_orientation,neurodivergent,birth_parent1,birth_parent2,birth_you,birth_you_age,autonomy_pressure,autonomy_express,...,peer_talk_listen_lab,peer_talk_helpful_lab,peer_talk_if_lab,accept_peer_lab,year_group_lab,fsm_lab,sen_lab,ethnicity_lab,english_additional_lab,school_lab
0,,2.0,6.0,3.0,2.0,1.0,2.0,,2.0,5.0,...,Fully,Somewhat helpful,Very uncomfortable,Not at all,10.0,No,,ethnic_minority,No,School E
1,1.0,2.0,1.0,3.0,3.0,2.0,3.0,8.0,4.0,2.0,...,Mostly,Very helpful,Very uncomfortable,Slightly,10.0,No,No,ethnic_minority,No,School D
2,,3.0,4.0,,1.0,1.0,1.0,1.0,5.0,4.0,...,Mostly,Very helpful,Very comfortable,Not at all,10.0,No,No,white_british,No,School E
3,,5.0,5.0,2.0,,2.0,1.0,3.0,1.0,2.0,...,Fully,Somewhat helpful,Uncomfortable,Mostly,10.0,No,No,white_british,No,School G
4,5.0,3.0,4.0,,1.0,3.0,3.0,2.0,5.0,2.0,...,Slightly,Somewhat helpful,Uncomfortable,Not at all,8.0,No,No,white_british,Yes,School B


## Find the proportion giving each response to each measure, within a given group

In [4]:
# Columns to count responses for: every labelled ('_lab') column, with the
# school label then taken out of the list
response_col = list(filter(lambda name: name.endswith('_lab'), data.columns))
response_col.remove('school_lab')


def aggregate(dataset, columns=None):
    '''
    Finds the proportion giving each answer to each measure.

    For each labelled column, counts the number of rows in the dataset with
    each response (missing responses are counted too) and converts those
    counts to percentages of all the rows.

    Inputs:
    - dataset - dataframe, to use for operation. For every labelled column
      (e.g. 'gender_lab') it must also contain the numeric version of that
      column (e.g. 'gender')
    - columns - optional list of labelled column names (ending '_lab') to
      aggregate. If None, the global response_col is used

    Returns:
    - dataframe with one row per measure. The 'cat', 'cat_lab', 'count' and
      'percentage' columns each hold a list with one item per response
      (sorted by the numeric category), 'measure' is the name of the numeric
      column, and 'n_responses' is the number of non-missing responses
    '''
    suffix = '_lab'

    # Default to the columns of interest defined for the notebook
    if columns is None:
        columns = response_col

    # Initialise list to store the counts for each measure
    responses = []

    # Look through each of the columns of interest
    for col in columns:

        # Find the name of the numeric version of the column. Only the
        # trailing suffix is removed, so names that contain '_lab' elsewhere
        # (e.g. 'in_lab_time_lab') are not mangled
        numeric = col[:-len(suffix)] if col.endswith(suffix) else col

        # Filter to just the column and its numeric version, and count the
        # number of pupils with each response (keeping missing responses)
        df = (dataset[[numeric, col]]
              .value_counts(dropna=False)
              .reset_index(name='count'))
        df[col] = df[col].fillna('Missing')

        # Convert counts to percentages (denominator includes missing)
        df['percentage'] = ((df['count'] / df['count'].sum()) * 100).round(1)

        # Reformat dataframe, sorting by the numeric version of the column,
        # and renaming the categories with generic name
        df = df.sort_values(by=numeric)
        df = df.rename(columns={
            numeric: 'cat',
            col: 'cat_lab'})

        # Convert to series with each column as array and add name of measure
        series = pd.Series(df.to_dict(orient='list'))
        series['measure'] = numeric

        # Count number of non-NaN responses for that column
        series['n_responses'] = dataset[numeric].count()

        # Append to list (as a single-row dataframe)
        responses.append(series.to_frame().T)

    # Combine into a single dataframe and return
    return pd.concat(responses)

In [5]:
# Build the groupings: school by itself, then school paired with each filter
filters = ['year_group_lab', 'gender_lab', 'fsm_lab', 'sen_lab']
groups = [['school_lab'],
          *(['school_lab', filter_col] for filter_col in filters)]
groups

[['school_lab'],
 ['school_lab', 'year_group_lab'],
 ['school_lab', 'gender_lab'],
 ['school_lab', 'fsm_lab'],
 ['school_lab', 'sen_lab']]

In [6]:
res_list = []

# For each of the grouping methods
for grouping in groups:
    # Group the dataframe and loop through those subsets of the dataframe
    for group_name, df_group in data.groupby(grouping):
        # Perform aggregation
        res = aggregate(df_group)
        # The group key is expected to be a tuple with one value per grouping
        # column, but some pandas versions give a bare value when grouping by
        # a single column. Wrap it so that a string key (e.g. 'School A') is
        # never indexed character-by-character
        if not isinstance(group_name, tuple):
            group_name = (group_name,)
        # Save name of group for that filter (e.g. school = school A)
        for group_col, group_value in zip(grouping, group_name):
            res[group_col] = group_value
        # Append result to list
        res_list.append(res)

# Combine results from list into a single dataframe
result = pd.concat(res_list)

In [7]:
# Each result only has values in the columns it was grouped by, so the other
# filter columns are NaN after combining - label those as 'All' to show that
# the filter was not applied
result[filters] = result[filters].fillna('All')

In [8]:
# Hide results where n<10, by blanking the counts, percentages and number of
# responses for those rows
too_few = result['n_responses'] < 10
hidden_cols = ['count', 'percentage', 'n_responses']
result.loc[too_few, hidden_cols] = np.nan

In [9]:
# Preview dataframe
result

Unnamed: 0,cat,cat_lab,count,percentage,measure,n_responses,school_lab,year_group_lab,gender_lab,fsm_lab,sen_lab
0,"[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, nan]","[Girl, Boy, Non-binary, I describe myself in a...","[16, 23, 19, 21, 11, 16, 10]","[13.8, 19.8, 16.4, 18.1, 9.5, 13.8, 8.6]",gender,106,School A,All,All,All,All
0,"[1.0, 2.0, 3.0, 4.0, 5.0, nan]","[Yes, No, Prefer not to say, I describe myself...","[20, 22, 17, 23, 24, 10]","[17.2, 19.0, 14.7, 19.8, 20.7, 8.6]",transgender,106,School A,All,All,All,All
0,"[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]","[Bi/pansexual, Gay/lesbian, Heterosexual/strai...","[21, 25, 13, 17, 25, 15]","[18.1, 21.6, 11.2, 14.7, 21.6, 12.9]",sexual_orientation,116,School A,All,All,All,All
0,"[1.0, 2.0, 3.0, nan]","[Yes, No, Unsure, Missing]","[46, 30, 33, 7]","[39.7, 25.9, 28.4, 6.0]",neurodivergent,109,School A,All,All,All,All
0,"[1.0, 2.0, 3.0, nan]","[Yes, No, I don't know, Missing]","[35, 38, 35, 8]","[30.2, 32.8, 30.2, 6.9]",birth_parent1,108,School A,All,All,All,All
...,...,...,...,...,...,...,...,...,...,...,...
0,"[8.0, 10.0, nan]","[8.0, 10.0, Missing]","[21, 16, 2]","[53.8, 41.0, 5.1]",year_group,37,School G,All,All,All,Yes
0,"[0.0, 1.0, nan]","[No, Yes, Missing]","[14, 23, 2]","[35.9, 59.0, 5.1]",fsm,37,School G,All,All,All,Yes
0,[1.0],[Yes],[39],[100.0],sen,39,School G,All,All,All,Yes
0,"[1.0, 2.0, nan]","[ethnic_minority, white_british, Missing]","[14, 20, 5]","[35.9, 51.3, 12.8]",ethnicity,34,School G,All,All,All,Yes


## Save to csv

In [10]:
# Write the aggregated results to the survey data folder, without the index
aggregate_path = os.path.join(paths.survey, paths.aggregate)
result.to_csv(aggregate_path, index=False)

## Try plotting from that data

In [11]:
# Keep the stress measures for School A, with none of the filters applied
stress_measures = ['stress_control', 'stress_overcome',
                   'stress_confident', 'stress_way']
keep = result['measure'].isin(stress_measures)
keep &= result['school_lab'] == 'School A'
for filter_col in ('year_group_lab', 'gender_lab', 'fsm_lab', 'sen_lab'):
    keep &= result[filter_col] == 'All'
chosen = result[keep]
chosen

Unnamed: 0,cat,cat_lab,count,percentage,measure,n_responses,school_lab,year_group_lab,gender_lab,fsm_lab,sen_lab
0,"[1.0, 2.0, 3.0, 4.0, 5.0, nan]","[Never, Almost Never, Sometimes, Fairly Often,...","[23, 24, 18, 18, 23, 10]","[19.8, 20.7, 15.5, 15.5, 19.8, 8.6]",stress_control,106,School A,All,All,All,All
0,"[1.0, 2.0, 3.0, 4.0, 5.0, nan]","[Never, Almost Never, Sometimes, Fairly Often,...","[19, 23, 21, 26, 22, 5]","[16.4, 19.8, 18.1, 22.4, 19.0, 4.3]",stress_overcome,111,School A,All,All,All,All
0,"[1.0, 2.0, 3.0, 4.0, 5.0, nan]","[Never, Almost Never, Sometimes, Fairly Often,...","[25, 15, 25, 24, 20, 7]","[21.6, 12.9, 21.6, 20.7, 17.2, 6.0]",stress_confident,109,School A,All,All,All,All
0,"[1.0, 2.0, 3.0, 4.0, 5.0, nan]","[Never, Almost Never, Sometimes, Fairly Often,...","[25, 32, 17, 20, 21, 1]","[21.6, 27.6, 14.7, 17.2, 18.1, 0.9]",stress_way,115,School A,All,All,All,All


We want to produce a dataframe where each row has a cat_lab, a percentage, a measure and a count.

<mark>Look at whether the way GM stored their data would allow more efficient extraction</mark>

In [12]:
# Expand the list-valued columns so that each response gets its own row,
# labelled with the measure that it came from
value_cols = ['cat_lab', 'percentage', 'count']
df_list = []
for _, measure_row in chosen.iterrows():
    rows = list(zip(*(measure_row[name] for name in value_cols)))
    long_df = pd.DataFrame(rows, columns=value_cols)
    long_df['measure'] = measure_row['measure']
    df_list.append(long_df)
chosen_result = pd.concat(df_list)
chosen_result

Unnamed: 0,cat_lab,percentage,count,measure
0,Never,19.8,23,stress_control
1,Almost Never,20.7,24,stress_control
2,Sometimes,15.5,18,stress_control
3,Fairly Often,15.5,18,stress_control
4,Very Often,19.8,23,stress_control
5,Missing,8.6,10,stress_control
0,Never,16.4,19,stress_overcome
1,Almost Never,19.8,23,stress_overcome
2,Sometimes,18.1,21,stress_overcome
3,Fairly Often,22.4,26,stress_overcome


In [13]:
import plotly.express as px

<mark>Missing is currently just removed from the chart; it will need to be saved as a string</mark>

In [14]:
# Horizontal bar chart of the percentages for each measure, coloured by
# response, with the count shown on hover
stress_fig = px.bar(
    data_frame=chosen_result,
    x='percentage',
    y='measure',
    color='cat_lab',
    orientation='h',
    text_auto=True,
    hover_data=['count'],
    title='Stress',
)
stress_fig

Below is using the csv (rather than the python object)...

In [15]:
# Read the saved aggregate results back in from the csv and preview them
aggregate_csv_path = os.path.join(paths.survey, paths.aggregate)
res_csv = pd.read_csv(aggregate_csv_path)
res_csv.head()

Unnamed: 0,cat,cat_lab,count,percentage,measure,n_responses,school_lab,year_group_lab,gender_lab,fsm_lab,sen_lab
0,"[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, nan]","['Girl', 'Boy', 'Non-binary', 'I describe myse...","[16, 23, 19, 21, 11, 16, 10]","[13.8, 19.8, 16.4, 18.1, 9.5, 13.8, 8.6]",gender,106.0,School A,All,All,All,All
1,"[1.0, 2.0, 3.0, 4.0, 5.0, nan]","['Yes', 'No', 'Prefer not to say', 'I describe...","[20, 22, 17, 23, 24, 10]","[17.2, 19.0, 14.7, 19.8, 20.7, 8.6]",transgender,106.0,School A,All,All,All,All
2,"[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]","['Bi/pansexual', 'Gay/lesbian', 'Heterosexual/...","[21, 25, 13, 17, 25, 15]","[18.1, 21.6, 11.2, 14.7, 21.6, 12.9]",sexual_orientation,116.0,School A,All,All,All,All
3,"[1.0, 2.0, 3.0, nan]","['Yes', 'No', 'Unsure', 'Missing']","[46, 30, 33, 7]","[39.7, 25.9, 28.4, 6.0]",neurodivergent,109.0,School A,All,All,All,All
4,"[1.0, 2.0, 3.0, nan]","['Yes', 'No', ""I don't know"", 'Missing']","[35, 38, 35, 8]","[30.2, 32.8, 30.2, 6.9]",birth_parent1,108.0,School A,All,All,All,All


In [16]:
# Keep the stress measures for School A, with none of the filters applied
# (this time from the csv version of the results)
stress_measures = ['stress_control', 'stress_overcome',
                   'stress_confident', 'stress_way']
keep_csv = res_csv['measure'].isin(stress_measures)
keep_csv &= res_csv['school_lab'] == 'School A'
for filter_col in ('year_group_lab', 'gender_lab', 'fsm_lab', 'sen_lab'):
    keep_csv &= res_csv[filter_col] == 'All'
chosen = res_csv[keep_csv]
chosen

Unnamed: 0,cat,cat_lab,count,percentage,measure,n_responses,school_lab,year_group_lab,gender_lab,fsm_lab,sen_lab
31,"[1.0, 2.0, 3.0, 4.0, 5.0, nan]","['Never', 'Almost Never', 'Sometimes', 'Fairly...","[23, 24, 18, 18, 23, 10]","[19.8, 20.7, 15.5, 15.5, 19.8, 8.6]",stress_control,106.0,School A,All,All,All,All
32,"[1.0, 2.0, 3.0, 4.0, 5.0, nan]","['Never', 'Almost Never', 'Sometimes', 'Fairly...","[19, 23, 21, 26, 22, 5]","[16.4, 19.8, 18.1, 22.4, 19.0, 4.3]",stress_overcome,111.0,School A,All,All,All,All
33,"[1.0, 2.0, 3.0, 4.0, 5.0, nan]","['Never', 'Almost Never', 'Sometimes', 'Fairly...","[25, 15, 25, 24, 20, 7]","[21.6, 12.9, 21.6, 20.7, 17.2, 6.0]",stress_confident,109.0,School A,All,All,All,All
34,"[1.0, 2.0, 3.0, 4.0, 5.0, nan]","['Never', 'Almost Never', 'Sometimes', 'Fairly...","[25, 32, 17, 20, 21, 1]","[21.6, 27.6, 14.7, 17.2, 18.1, 0.9]",stress_way,115.0,School A,All,All,All,All


In [17]:
from ast import literal_eval

<mark>Need to apply the above to the csv, rather than to the Python object as is currently done</mark>

In [22]:
# The lists were saved to the csv as strings, so parse them back into Python
# lists and display the response labels and percentages for each measure
for _, csv_row in chosen.iterrows():
    labels, percentages = (literal_eval(csv_row[name])
                           for name in ('cat_lab', 'percentage'))
    display(labels)
    display(percentages)

['Never', 'Almost Never', 'Sometimes', 'Fairly Often', 'Very Often', 'Missing']

[19.8, 20.7, 15.5, 15.5, 19.8, 8.6]

['Never', 'Almost Never', 'Sometimes', 'Fairly Often', 'Very Often', 'Missing']

[16.4, 19.8, 18.1, 22.4, 19.0, 4.3]

['Never', 'Almost Never', 'Sometimes', 'Fairly Often', 'Very Often', 'Missing']

[21.6, 12.9, 21.6, 20.7, 17.2, 6.0]

['Never', 'Almost Never', 'Sometimes', 'Fairly Often', 'Very Often', 'Missing']

[21.6, 27.6, 14.7, 17.2, 18.1, 0.9]