# Aggregate data on scores

Aggregate the synthetic person-level responses to find the mean scores.

## Set-up

### Packages and file paths

In [1]:
# Import required packages
from dataclasses import dataclass
import numpy as np
import os
import pandas as pd

In [2]:
# File paths
@dataclass(frozen=True)
class Paths:
    '''
    Stores paths to data and files.

    Each attribute is annotated so that it is registered as a dataclass field
    (un-annotated assignments are plain class attributes that @dataclass
    ignores). This means the values appear in the repr and can be overridden
    when creating an instance, e.g. Paths(survey='other/folder').
    '''
    # Folder containing the survey data files
    survey: str = '../data/survey_data'
    # File name of the raw synthetic data (within the survey folder)
    synthetic_data: str = 'synthetic_data_raw.csv'
    # File name for the aggregated scores (within the survey folder)
    aggregate_score: str = 'aggregate_scores.csv'


paths = Paths()

In [3]:
# Import functions defined elsewhere
import sys
sys.path.append('../')
from utilities.response_labels import create_response_label_dict
from create_and_process_data.functions import results_by_school_and_group

### Import raw data

In [4]:
# Load the raw synthetic person-level responses from the survey data folder
data = pd.read_csv(os.path.join(paths.survey, paths.synthetic_data))
# Preview the first few rows
data.head()

Unnamed: 0,gender,transgender,sexual_orientation,neurodivergent,birth_parent1,birth_parent2,birth_you,birth_you_age,autonomy_pressure,autonomy_express,...,peer_talk_listen_lab,peer_talk_helpful_lab,peer_talk_if_lab,accept_peer_lab,year_group_lab,fsm_lab,sen_lab,ethnicity_lab,english_additional_lab,school_lab
0,4.0,2.0,6.0,3.0,2.0,1.0,2.0,1.0,2.0,5.0,...,Fully,Somewhat helpful,Very uncomfortable,Not at all,Year 10,Non-FSM,Non-SEN,Ethnic minority,No,School E
1,1.0,2.0,1.0,3.0,3.0,2.0,,8.0,4.0,2.0,...,Mostly,Very helpful,Very uncomfortable,Slightly,Year 10,Non-FSM,Non-SEN,,No,School D
2,2.0,3.0,4.0,1.0,1.0,1.0,1.0,1.0,5.0,4.0,...,,Very helpful,Very comfortable,Not at all,Year 10,Non-FSM,Non-SEN,White British,No,School E
3,2.0,5.0,5.0,2.0,,2.0,1.0,3.0,1.0,2.0,...,Fully,Somewhat helpful,Uncomfortable,Mostly,Year 10,Non-FSM,Non-SEN,White British,No,School G
4,5.0,3.0,4.0,1.0,1.0,3.0,3.0,2.0,5.0,2.0,...,Slightly,Somewhat helpful,Uncomfortable,Not at all,Year 8,Non-FSM,Non-SEN,White British,,


## Find mean score by school and pupil group

Find mean score by school:
* Overall
* By year
* By gender (this is currently the gender from the survey, not from the council, as can be seen from the GM data extract)
* By FSM
* By SEN

Hide results when count < 10.

In [5]:
# Make a list of the columns that provide a score (identified by the '_score'
# suffix in the column name)
score_col = [col for col in data.columns if col.endswith('_score')]

In [6]:
# Define function for finding means and counts
def aggregate(df, score_cols=None):
    '''
    Aggregate the score columns in the provided dataset, finding the mean and
    count of non-NaN
    Inputs:
    - df - dataframe containing the score columns
    - score_cols - optional list with the names of the columns to aggregate.
    If not provided, the notebook-level list score_col is used
    Output:
    - res - dataframe with one row per score column, and the columns
    'variable' (name of the score column), 'mean' and 'count'
    '''
    # Fall back on the notebook-level list of score columns, so that existing
    # calls which only pass a dataframe behave exactly as before
    if score_cols is None:
        score_cols = score_col
    scores = df[score_cols]
    res = pd.DataFrame({
        # Find mean for each score column, ignoring NaN
        'mean': scores.mean(),
        # Count non-NaN so we know the number of pupils used in the mean
        'count': scores.count()})
    # Move the score column names out of the index into a 'variable' column
    res = res.rename_axis('variable').reset_index()
    return res

In [7]:
# Template result to use when a school has no pupils of a particular subgroup
# (i.e. no-one in certain FSM/SEN/gender/year): same rows as a real
# aggregation, but with every mean set to NaN and every count set to 0
no_pupils = aggregate(data).assign(mean=np.nan, count=0)
no_pupils.head()

Unnamed: 0,variable,mean,count
0,birth_you_age_score,,0
1,autonomy_score,,0
2,life_satisfaction_score,,0
3,optimism_score,,0
4,wellbeing_score,,0


In [8]:
# Aggregate for each of the possible schools and pupils groups
agg = results_by_school_and_group(
    data=data, agg_func=aggregate, no_pupils=no_pupils)

# Hide results when n<10 - both the mean and the count are replaced with NaN
# for any row where the count of non-NaN responses is below 10
agg.loc[agg['count'] < 10, ['mean', 'count']] = np.nan

# Preview the first few rows
agg.head()

Unnamed: 0,variable,mean,count,school_lab,year_group_lab,gender_lab,fsm_lab,sen_lab
0,birth_you_age_score,8.105505,109.0,School A,All,All,All,All
1,autonomy_score,17.628571,70.0,School A,All,All,All,All
2,life_satisfaction_score,5.114035,114.0,School A,All,All,All,All
3,optimism_score,12.154762,84.0,School A,All,All,All,All
4,wellbeing_score,21.47541,61.0,School A,All,All,All,All


## Find overall counts for the grouping columns

We have previously found the counts of pupils who have a score (with the counts excluding pupils who are NaN for a given score). This section finds the overall count of pupils in the dataset for a given school and group, regardless of whether they answered a particular question.

<mark>Should the overall counts move to a separate notebook and spreadsheet? They are not the same data as the scores and are for a different function, so that might make more sense.</mark>

In [9]:
# Make new version of aggregate that just finds overall counts
def aggregate(df):
    '''
    Count the total number of people (rows) in the provided dataframe
    Inputs:
    - df - dataframe to count the rows of
    Output:
    - dataframe with a single row, where 'variable' is 'overall_count', 'mean'
    is NaN and 'count' is the number of rows in df
    '''
    # Number of rows, regardless of whether any values are missing
    n_rows = df.shape[0]
    # Same column names as the score aggregation (variable, mean, count)
    overall = {
        'variable': ['overall_count'],
        'mean': [np.nan],
        'count': [n_rows]}
    return pd.DataFrame(overall)

In [10]:
# Template result for when there are no pupils: same single row as a real
# result, but with the count set to 0
no_pupils = aggregate(data).assign(count=0)
no_pupils

Unnamed: 0,variable,mean,count
0,overall_count,,0


In [11]:
# Find the overall count for each of the possible schools and pupil groups
size = results_by_school_and_group(
    data=data, agg_func=aggregate, no_pupils=no_pupils)

# Preview result (sorted so can see that it matches up with previous
# calculation)
size.sort_values(by=['sen_lab', 'year_group_lab', 'gender_lab', 
                     'fsm_lab', 'school_lab'])

Unnamed: 0,variable,mean,count,school_lab,year_group_lab,gender_lab,fsm_lab,sen_lab
0,overall_count,,115,School A,All,All,All,All
0,overall_count,,119,School B,All,All,All,All
0,overall_count,,99,School C,All,All,All,All
0,overall_count,,94,School D,All,All,All,All
0,overall_count,,92,School E,All,All,All,All
...,...,...,...,...,...,...,...,...
0,overall_count,,41,School C,All,All,All,SEN
0,overall_count,,46,School D,All,All,All,SEN
0,overall_count,,46,School E,All,All,All,SEN
0,overall_count,,45,School F,All,All,All,SEN


## Combine and save results

In [12]:
# Stack the score results and the overall counts into a single dataframe (they
# share the same columns)
agg_size = pd.concat([agg, size])
agg_size

Unnamed: 0,variable,mean,count,school_lab,year_group_lab,gender_lab,fsm_lab,sen_lab
0,birth_you_age_score,8.105505,109.0,School A,All,All,All,All
1,autonomy_score,17.628571,70.0,School A,All,All,All,All
2,life_satisfaction_score,5.114035,114.0,School A,All,All,All,All
3,optimism_score,12.154762,84.0,School A,All,All,All,All
4,wellbeing_score,21.475410,61.0,School A,All,All,All,All
...,...,...,...,...,...,...,...,...
0,overall_count,,19.0,School G,All,Boy,All,All
0,overall_count,,57.0,School G,All,All,FSM,All
0,overall_count,,37.0,School G,All,All,Non-FSM,All
0,overall_count,,38.0,School G,All,All,All,SEN


In [13]:
# Save the combined results to the survey data folder, without the index
agg_size.to_csv(os.path.join(paths.survey, paths.aggregate_score), index=False)