# Aggregate data on scores

Aggregate the synthetic person-level responses to find the mean scores.

## Set-up

### Packages and file paths

In [1]:
# Import required packages
from dataclasses import dataclass
import numpy as np
import os
import pandas as pd

In [2]:
# File paths
@dataclass(frozen=True)
class Paths:
    '''
    Stores paths to data and files

    Each attribute is annotated so that it is registered as a dataclass
    field (un-annotated class attributes are ignored by @dataclass). This
    means the values appear in repr(), take part in equality checks, and can
    be overridden when constructing an instance, e.g. Paths(survey='...').

    Attributes:
    - survey - folder that the survey data files are read from and saved to
    - synthetic_data - file name of the raw synthetic data csv
    - aggregate_score - file name of the aggregated scores csv
    '''
    survey: str = '../data/survey_data'
    synthetic_data: str = 'synthetic_data_raw.csv'
    aggregate_score: str = 'aggregate_scores.csv'


paths = Paths()

### Import raw data

In [3]:
# Load the raw synthetic survey responses and preview the first few rows
raw_data_path = os.path.join(paths.survey, paths.synthetic_data)
data = pd.read_csv(raw_data_path)
data.head()

Unnamed: 0,gender,transgender,sexual_orientation,neurodivergent,birth_parent1,birth_parent2,birth_you,birth_you_age,autonomy_pressure,autonomy_express,...,peer_talk_listen_lab,peer_talk_helpful_lab,peer_talk_if_lab,accept_peer_lab,year_group_lab,fsm_lab,sen_lab,ethnicity_lab,english_additional_lab,school_lab
0,4.0,2.0,6.0,3.0,2.0,1.0,2.0,,2.0,5.0,...,Fully,Somewhat helpful,Very uncomfortable,Not at all,Year 10,Non-FSM,Non-SEN,Ethnic minority,No,School E
1,,2.0,1.0,3.0,3.0,2.0,3.0,8.0,4.0,2.0,...,Mostly,Very helpful,Very uncomfortable,Slightly,Year 10,Non-FSM,Non-SEN,,No,School D
2,,3.0,4.0,1.0,1.0,1.0,1.0,1.0,5.0,4.0,...,Mostly,Very helpful,Very comfortable,Not at all,Year 10,Non-FSM,,,,School E
3,,5.0,5.0,2.0,2.0,2.0,1.0,3.0,1.0,2.0,...,Fully,Somewhat helpful,Uncomfortable,Mostly,Year 10,Non-FSM,Non-SEN,White British,No,School G
4,5.0,3.0,4.0,1.0,1.0,3.0,,2.0,5.0,2.0,...,Slightly,Somewhat helpful,Uncomfortable,Not at all,Year 8,,Non-SEN,White British,Yes,


## Find mean score per school

Find mean score by school:
* Overall
* By year
* By gender (this is gender from the survey, not from the council, as can be seen from the GM data extract)
* By FSM
* By SEN

Hide results when count < 10.

In [4]:
# Make a list of the columns that provide a score, identified as the columns
# whose name ends with '_score'
score_col = data.columns[data.columns.str.endswith('_score')].tolist()

# Preview the columns
score_col

['birth_you_age_score',
 'autonomy_score',
 'life_satisfaction_score',
 'optimism_score',
 'wellbeing_score',
 'esteem_score',
 'stress_score',
 'appearance_score',
 'negative_score',
 'lonely_score',
 'support_score',
 'sleep_score',
 'physical_score',
 'free_like_score',
 'media_score',
 'places_score',
 'staff_talk_score',
 'home_talk_score',
 'peer_talk_score',
 'talk_score',
 'accept_score',
 'school_belong_score',
 'staff_relationship_score',
 'home_relationship_score',
 'home_happy_score',
 'local_env_score',
 'discrim_score',
 'belong_local_score',
 'wealth_score',
 'future_score',
 'climate_score',
 'social_score',
 'bully_score']

In [5]:
# Define the groups that we want to aggregate by. 'All' means that no
# additional filter is applied. Otherwise, each entry is a pair where the
# first value is the name of the category to keep and the second is the
# variable (column) that category is found in - e.g. ['Year 8',
# 'year_group_lab'] keeps the rows where year_group_lab is 'Year 8'.
# This is used instead of groupby so that we still look for scores in cases
# where there are no instances of a particular group
groups = [
    'All',
    ['Year 8', 'year_group_lab'],
    ['Year 10', 'year_group_lab'],
    ['Girl', 'gender_lab'],
    ['Boy', 'gender_lab'],
    ['FSM', 'fsm_lab'],
    ['Non-FSM', 'fsm_lab'],
    ['SEN', 'sen_lab'],
    ['Non-SEN', 'sen_lab']
]

In [6]:
def aggregate(df, cols=None):
    '''
    Aggregate the score columns in the provided dataset, finding the mean and
    count of non-NaN
    Inputs:
    - df - dataframe containing the score columns
    - cols - optional list of the columns to aggregate. If not provided, the
      notebook-level score_col list is used
    Output:
    - res - dataframe with one row per aggregated column, giving the column
      name ('variable'), its mean and its count of non-NaN values
    '''
    # Fall back on the notebook-level list of score columns, so that existing
    # calls of aggregate(df) behave exactly as before
    if cols is None:
        cols = score_col
    scores = df[cols]
    res = pd.DataFrame({
        # Find mean for each score column, ignoring NaN
        'mean': scores.mean(),
        # Count non-NaN so we know the number of pupils used in the mean
        'count': scores.count()}).rename_axis('variable').reset_index()
    return res

In [7]:
# Create a version where every question has mean NaN and count 0, to use
# when a school has no pupils of a particular subgroup (i.e. no-one in
# certain FSM/SEN/gender/year)
no_pupils = aggregate(data).assign(mean=np.nan, count=0)
no_pupils.head()

Unnamed: 0,variable,mean,count
0,birth_you_age_score,,0
1,autonomy_score,,0
2,life_satisfaction_score,,0
3,optimism_score,,0
4,wellbeing_score,,0


In [8]:
# Collect one aggregated dataframe per school and group
result_list = []

# Loop through the schools, taking the list of schools from the dataset
# itself (so each school is known to be present at least once)
schools = data['school_lab'].dropna().drop_duplicates().sort_values()
for school in schools:
    # Restrict to the pupils from this school
    school_data = data[data['school_lab'] == school]

    # For each of the groupings
    for group in groups:
        # Start with every grouping column labelled as 'All'
        labels = {
            'year_group_lab': 'All',
            'gender_lab': 'All',
            'fsm_lab': 'All',
            'sen_lab': 'All'
        }

        # If a filter was specified, restrict to the pupils in that category
        # and record the category against the relevant grouping column
        to_agg = school_data
        if group != 'All':
            category, variable = group
            to_agg = to_agg[to_agg[variable] == category]
            labels[variable] = category

        # Use the no_pupils dataframe if no pupils matched the filter,
        # otherwise aggregate the pupils that did
        res = no_pupils.copy() if to_agg.empty else aggregate(to_agg)

        # Specify the school and grouping that the results relate to
        res['school_lab'] = school
        for label_col, label in labels.items():
            res[label_col] = label

        result_list.append(res)

# Combine all the results into a single dataframe
agg = pd.concat(result_list)

# Hide results when n<10
agg.loc[agg['count'] < 10, ['mean', 'count']] = np.nan

# Preview result
agg.head()

Unnamed: 0,variable,mean,count,school_lab,year_group_lab,gender_lab,fsm_lab,sen_lab
0,birth_you_age_score,7.966667,105.0,School A,All,All,All,All
1,autonomy_score,17.5,72.0,School A,All,All,All,All
2,life_satisfaction_score,5.166667,114.0,School A,All,All,All,All
3,optimism_score,11.852273,88.0,School A,All,All,All,All
4,wellbeing_score,21.0,61.0,School A,All,All,All,All


## Find overall counts for the grouping columns

We have previously found counts of pupils who have a score (with those counts excluding pupils who are NaN for a given score). This section finds the overall count of pupils in the dataset for a given school and group, regardless of whether they answered a particular question.

<mark>code duplication twice here and also in aggregate_responses - need to define this function separately</mark>

<mark>should the overall counts move to a separate notebook and spreadsheet? It is not the same data as the scores and it is for a different function, so that might make more sense?</mark>

In [9]:
# Make new version of aggregate that just finds overall counts
# NOTE: this re-uses the name aggregate, so it replaces the score-based
# version defined earlier in the notebook from this point onwards
def aggregate(df):
    '''
    Find the total number of people (rows) in the provided dataframe
    Inputs:
    - df - dataframe to count the rows of
    Output:
    - res - single-row dataframe with variable 'overall_count', mean NaN and
      count set to the number of rows in df
    '''
    n_rows = df.shape[0]
    res = pd.DataFrame({
        'variable': ['overall_count'],
        'mean': [np.nan],
        'count': [n_rows]
    })
    return res

In [10]:
# Make version for when there are no pupils, by taking the layout returned by
# aggregate and setting the count to 0
no_pupils = aggregate(data).assign(count=0)
no_pupils

Unnamed: 0,variable,mean,count
0,overall_count,,0


In [11]:
# Collect one dataframe of overall counts per school and group
result_list = []

# Loop through the schools, taking the list of schools from the dataset
# itself (so each school is known to be present at least once)
schools = data['school_lab'].dropna().drop_duplicates().sort_values()
for school in schools:
    # Restrict to the pupils from this school
    school_data = data[data['school_lab'] == school]

    # For each of the groupings
    for group in groups:
        # Start with every grouping column labelled as 'All'
        labels = {
            'year_group_lab': 'All',
            'gender_lab': 'All',
            'fsm_lab': 'All',
            'sen_lab': 'All'
        }

        # If a filter was specified, restrict to the pupils in that category
        # and record the category against the relevant grouping column
        to_agg = school_data
        if group != 'All':
            category, variable = group
            to_agg = to_agg[to_agg[variable] == category]
            labels[variable] = category

        # Use the no_pupils dataframe if no pupils matched the filter,
        # otherwise aggregate the pupils that did
        res = no_pupils.copy() if to_agg.empty else aggregate(to_agg)

        # Specify the school and grouping that the results relate to
        res['school_lab'] = school
        for label_col, label in labels.items():
            res[label_col] = label

        result_list.append(res)

# Combine all the results into a single dataframe
size = pd.concat(result_list)

# Hide results when n<10
size.loc[size['count'] < 10, ['mean', 'count']] = np.nan

# Preview result (sorted so can see that it matches up with the previous
# calculation)
size.sort_values(by=['sen_lab', 'year_group_lab', 'gender_lab',
                     'fsm_lab', 'school_lab'])

Unnamed: 0,variable,mean,count,school_lab,year_group_lab,gender_lab,fsm_lab,sen_lab
0,overall_count,,115.0,School A,All,All,All,All
0,overall_count,,120.0,School B,All,All,All,All
0,overall_count,,98.0,School C,All,All,All,All
0,overall_count,,98.0,School D,All,All,All,All
0,overall_count,,94.0,School E,All,All,All,All
...,...,...,...,...,...,...,...,...
0,overall_count,,39.0,School C,All,All,All,SEN
0,overall_count,,42.0,School D,All,All,All,SEN
0,overall_count,,44.0,School E,All,All,All,SEN
0,overall_count,,45.0,School F,All,All,All,SEN


## Combine and save results

In [12]:
# Stack the score results and the overall counts into a single dataframe
agg_size = pd.concat([agg, size], axis=0)
agg_size

Unnamed: 0,variable,mean,count,school_lab,year_group_lab,gender_lab,fsm_lab,sen_lab
0,birth_you_age_score,7.966667,105.0,School A,All,All,All,All
1,autonomy_score,17.500000,72.0,School A,All,All,All,All
2,life_satisfaction_score,5.166667,114.0,School A,All,All,All,All
3,optimism_score,11.852273,88.0,School A,All,All,All,All
4,wellbeing_score,21.000000,61.0,School A,All,All,All,All
...,...,...,...,...,...,...,...,...
0,overall_count,,19.0,School G,All,Boy,All,All
0,overall_count,,58.0,School G,All,All,FSM,All
0,overall_count,,32.0,School G,All,All,Non-FSM,All
0,overall_count,,43.0,School G,All,All,All,SEN


In [13]:
# Save the combined results to csv, without writing the dataframe index
output_path = os.path.join(paths.survey, paths.aggregate_score)
agg_size.to_csv(output_path, index=False)