# Aggregate responses

Aggregate the synthetic person-level data to find the proportion of pupils who gave each response to each question.

## Set-up

### Packages and file paths

In [1]:
# Import required packages
from dataclasses import dataclass
import numpy as np
import os
import pandas as pd

In [2]:
# File paths
@dataclass(frozen=True)
class Paths:
    '''
    Stores paths to data and files.

    Each attribute is annotated so that it is registered as a dataclass
    field. Un-annotated assignments are ignored by `dataclass` and remain
    plain class attributes, meaning they are missing from the repr and
    cannot be overridden when creating an instance.
    '''
    # Relative path to the folder containing the survey data
    survey: str = '../data/survey_data'
    # File name of the raw synthetic data
    synthetic_data: str = 'synthetic_data_raw.csv'


paths = Paths()

### Import raw data

In [3]:
# Read the raw synthetic data from the survey folder, then preview the
# first few rows
data = pd.read_csv(
    os.path.join(paths.survey, paths.synthetic_data)
)
data.head()

Unnamed: 0,gender,transgender,sexual_orientation,neurodivergent,birth_parent1,birth_parent2,birth_you,birth_you_age,autonomy_pressure,autonomy_express,...,peer_talk_listen_lab,peer_talk_helpful_lab,peer_talk_if_lab,accept_peer_lab,year_group_lab,fsm_lab,sen_lab,ethnicity_lab,english_additional_lab,school_lab
0,,2.0,6.0,3.0,2.0,1.0,2.0,,2.0,5.0,...,Fully,Somewhat helpful,Very uncomfortable,Not at all,10.0,No,,ethnic_minority,No,School E
1,1.0,2.0,1.0,3.0,3.0,2.0,3.0,8.0,4.0,2.0,...,Mostly,Very helpful,Very uncomfortable,Slightly,10.0,No,No,ethnic_minority,No,School D
2,,3.0,4.0,,1.0,1.0,1.0,1.0,5.0,4.0,...,Mostly,Very helpful,Very comfortable,Not at all,10.0,No,No,white_british,No,School E
3,,5.0,5.0,2.0,,2.0,1.0,3.0,1.0,2.0,...,Fully,Somewhat helpful,Uncomfortable,Mostly,10.0,No,No,white_british,No,School G
4,5.0,3.0,4.0,,1.0,3.0,3.0,2.0,5.0,2.0,...,Slightly,Somewhat helpful,Uncomfortable,Not at all,8.0,No,No,white_british,Yes,School B


## Find the proportion giving each response to each measure, within a given group

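<mark>TODO: the aggregation below is currently calculated across the whole dataset. It needs to be changed so that it is performed within each of the groups defined below.</mark>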

In [4]:
# Build the sets of grouping columns: school on its own, followed by
# school paired with each of the filter columns
filters = ['year_group_lab', 'gender_lab', 'fsm_lab', 'sen_lab']
groups = [['school_lab']] + [['school_lab', extra] for extra in filters]
groups

[['school_lab'],
 ['school_lab', 'year_group_lab'],
 ['school_lab', 'gender_lab'],
 ['school_lab', 'fsm_lab'],
 ['school_lab', 'sen_lab']]

In [5]:
# Identify the columns to count responses for: every labelled column
# (name ending in '_lab'), with the school column then removed
response_col = [name for name in data.columns if name.endswith('_lab')]
response_col.remove('school_lab')


def aggregate(df=None, columns=None):
    '''
    Finds the proportion of pupils giving each response to each measure.

    Each labelled column (name ending in '_lab') is paired with its numeric
    version (the same name without the '_lab' suffix). Every combination of
    numeric value and label is counted, including missing values, and the
    counts are converted to percentages of all rows.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Data containing each labelled column and its numeric version.
        If not provided, the notebook-level `data` is used.
    columns : list of str, optional
        Labelled columns to aggregate (each must end in '_lab').
        If not provided, the notebook-level `response_col` is used.

    Returns
    -------
    pandas.DataFrame
        One row per measure, indexed 0 to n-1, with columns 'cat',
        'cat_lab', 'count' and 'percentage' (each cell is a list with one
        item per response, ordered by the numeric category) and 'measure'
        (name of the numeric column). Empty if no columns are given.

    Raises
    ------
    ValueError
        If a requested column does not end in '_lab'.
    KeyError
        If a column, or its numeric version, is not in the data.
    '''
    # Fall back to the notebook-level objects so that calling aggregate()
    # with no arguments uses the same inputs as before
    source = data if df is None else df
    label_cols = response_col if columns is None else columns

    suffix = '_lab'

    # Initialise list to store the results for each measure
    records = []

    # Loop through each of the columns of interest
    for col in label_cols:

        # Find the name of the numeric version of the column. Only the
        # trailing suffix is removed - str.replace would also strip any
        # '_lab' that appears earlier in the name
        if not col.endswith(suffix):
            raise ValueError(
                f"Column '{col}' does not end with '{suffix}'")
        numeric = col[:-len(suffix)]

        # Count the number of pupils with each response for that column
        # (keeping missing responses), then convert to percentages
        counts = (source[[numeric, col]]
                  .value_counts(dropna=False)
                  .reset_index(name='count'))
        counts['percentage'] = (
            counts['count'] / counts['count'].sum()) * 100

        # Reformat dataframe, sorting by the numeric version of the column,
        # and renaming the categories with generic names
        counts = (counts
                  .sort_values(by=numeric)
                  .rename(columns={numeric: 'cat', col: 'cat_lab'}))

        # Store each column as a list and add the name of the measure
        record = counts.to_dict(orient='list')
        record['measure'] = numeric
        records.append(record)

    # Combine into a single dataframe (one row per measure) and return.
    # Building from records gives a unique index and copes with there
    # being no columns to aggregate
    return pd.DataFrame(
        records,
        columns=['cat', 'cat_lab', 'count', 'percentage', 'measure'])

In [6]:
# Run the aggregation for every response column and display the result
aggregate()

Unnamed: 0,cat,cat_lab,count,percentage,measure
0,"[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, nan]","[Girl, Boy, Non-binary, I describe myself in a...","[117, 136, 110, 131, 117, 117, 72]","[14.625, 17.0, 13.750000000000002, 16.375, 14....",gender
0,"[1.0, 2.0, 3.0, 4.0, 5.0, nan]","[Yes, No, Prefer not to say, I describe myself...","[139, 139, 149, 151, 144, 78]","[17.375, 17.375, 18.625, 18.875, 18.0, 9.75]",transgender
0,"[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, nan]","[Bi/pansexual, Gay/lesbian, Heterosexual/strai...","[119, 135, 128, 140, 129, 124, 25]","[14.875, 16.875, 16.0, 17.5, 16.125, 15.5, 3.125]",sexual_orientation
0,"[1.0, 2.0, 3.0, nan]","[Yes, No, Unsure, nan]","[258, 207, 250, 85]","[32.25, 25.874999999999996, 31.25, 10.625]",neurodivergent
0,"[1.0, 2.0, 3.0, nan]","[Yes, No, I don't know, nan]","[242, 258, 245, 55]","[30.25, 32.25, 30.625000000000004, 6.875000000...",birth_parent1
...,...,...,...,...,...
0,"[8.0, 10.0, nan]","[8.0, 10.0, nan]","[436, 348, 16]","[54.50000000000001, 43.5, 2.0]",year_group
0,"[0.0, 1.0, nan]","[No, Yes, nan]","[351, 408, 41]","[43.875, 51.0, 5.125]",fsm
0,"[0.0, 1.0, nan]","[No, Yes, nan]","[415, 304, 81]","[51.87500000000001, 38.0, 10.125]",sen
0,"[1.0, 2.0, nan]","[ethnic_minority, white_british, nan]","[360, 363, 77]","[45.0, 45.375, 9.625]",ethnicity
