# Aggregate data on scores

Aggregate the synthetic person-level responses to find the mean scores.

## Set-up

### Packages and file paths

In [1]:
# Import required packages
from dataclasses import dataclass
import numpy as np
import os
import pandas as pd

In [2]:
# File paths
@dataclass(frozen=True)
class Paths:
    '''
    Stores paths to data and files

    The attributes are annotated so that they are genuine dataclass fields:
    they appear in the repr and in equality comparisons, and can be
    overridden when creating an instance, e.g. Paths(survey='other_folder')
    '''
    # Folder containing the survey data files
    survey: str = '../data/survey_data'
    # File name of the raw synthetic data
    synthetic_data: str = 'synthetic_data_raw.csv'
    # File name for the aggregated scores
    aggregate_score: str = 'aggregate_scores.csv'


paths = Paths()

In [3]:
# Import functions
import sys
sys.path.append('../')
from create_and_process_data.functions import results_by_school_and_group

### Import raw data

In [4]:
# Build the path to the raw synthetic responses, load them, and preview the
# first few rows
raw_data_path = os.path.join(paths.survey, paths.synthetic_data)
data = pd.read_csv(raw_data_path)
data.head()

Unnamed: 0,gender,transgender,sexual_orientation,neurodivergent,birth_parent1,birth_parent2,birth_you,birth_you_age,autonomy_pressure,autonomy_express,...,peer_talk_listen_lab,peer_talk_helpful_lab,peer_talk_if_lab,accept_peer_lab,year_group_lab,fsm_lab,sen_lab,ethnicity_lab,english_additional_lab,school_lab
0,4.0,2.0,6.0,3.0,2.0,1.0,2.0,1.0,,5.0,...,Fully,,,Not at all,Year 10,Non-FSM,Non-SEN,Ethnic minority,No,School E
1,1.0,2.0,1.0,3.0,3.0,2.0,3.0,8.0,4.0,2.0,...,,,Very uncomfortable,Slightly,Year 10,Non-FSM,Non-SEN,Ethnic minority,No,School D
2,2.0,3.0,4.0,,1.0,1.0,1.0,1.0,5.0,4.0,...,,,Very comfortable,Not at all,Year 10,Non-FSM,Non-SEN,White British,,School E
3,2.0,5.0,5.0,2.0,2.0,2.0,1.0,3.0,1.0,2.0,...,,,Uncomfortable,Mostly,Year 10,Non-FSM,Non-SEN,White British,No,School G
4,5.0,3.0,4.0,1.0,1.0,3.0,3.0,2.0,5.0,2.0,...,Slightly,Somewhat helpful,,Not at all,Year 8,Non-FSM,Non-SEN,White British,Yes,School B


## Find mean score by school and pupil group

Find mean score by school:
* Overall
* By year
* By gender (this is currently gender from the survey, not from the council, as can be seen from the GM data extract)
* By FSM
* By SEN

Hide results when count < 10.

In [5]:
# List the names of the columns that hold a score (those ending in '_score')
is_score = data.columns.str.endswith('_score')
score_col = data.columns[is_score].tolist()

In [6]:
# Define function for finding means and counts
def aggregate_scores(df, columns=None):
    '''
    Aggregate the score columns in the provided dataset, finding the mean and
    count of non-NaN values

    Parameters:
    -----------
    df : dataframe
        Dataframe with a row for each pupil and containing the score columns
    columns : list of str, optional
        Names of the columns to aggregate. If None (default), the
        notebook-level list score_col is used.

    Returns:
    -------
    res : dataframe
        Dataframe with one row per score column, giving the column name
        ('variable'), the mean and the count for each score
    '''
    # Fall back to the notebook-level list of score columns so that existing
    # single-argument calls behave exactly as before
    if columns is None:
        columns = score_col
    scores = df[columns]
    res = pd.DataFrame({
        # Find mean for each score column, ignoring NaN
        'mean': scores.mean(),
        # Count non-NaN so we know the number of pupils used in the mean
        'count': scores.count()}).rename_axis('variable').reset_index()
    return res

In [7]:
# Template result for a subgroup with no pupils (i.e. a school with no-one in
# a certain FSM/SEN/gender/year): one row per score, with the mean set to NaN
# and the count set to 0
no_pupils = aggregate_scores(data).assign(mean=np.nan, count=0)
no_pupils.head()

Unnamed: 0,variable,mean,count
0,autonomy_score,,0
1,life_satisfaction_score,,0
2,optimism_score,,0
3,wellbeing_score,,0
4,esteem_score,,0


In [8]:
# Run the aggregation for every combination of school and pupil group
agg = results_by_school_and_group(
    data=data,
    agg_func=aggregate_scores,
    no_pupils=no_pupils,
)

# Suppress small groups: blank out the mean and count where n<10
too_few_pupils = agg['count'] < 10
agg.loc[too_few_pupils, ['mean', 'count']] = np.nan

agg.head()

Unnamed: 0,variable,mean,count,school_lab,year_group_lab,gender_lab,fsm_lab,sen_lab
0,autonomy_score,17.625,72.0,School A,All,All,All,All
1,life_satisfaction_score,5.111111,117.0,School A,All,All,All,All
2,optimism_score,11.873874,111.0,School A,All,All,All,All
3,wellbeing_score,21.252632,95.0,School A,All,All,All,All
4,esteem_score,12.641975,81.0,School A,All,All,All,All


## Save results

In [9]:
# Write the aggregated scores to the survey data folder, without the index
aggregate_path = os.path.join(paths.survey, paths.aggregate_score)
agg.to_csv(aggregate_path, index=False)