# Aggregate data

Aggregate the synthetic person-level responses by school.

## Set-up

### Packages and file paths

In [1]:
# Import required packages
from dataclasses import dataclass
import numpy as np
import os
import pandas as pd

In [2]:
# File paths
@dataclass(frozen=True)
class Paths:
    '''
    Stores paths to data and files

    Each attribute is annotated so that it is a genuine dataclass field:
    unannotated names are ignored by the dataclass machinery, so would not
    appear in the repr, equality comparison or constructor. The values can
    still be read from the class or from an instance, and instances remain
    frozen.
    '''
    # Folder containing the survey data files
    survey: str = 'data/survey_data'
    # File name of the raw synthetic data (within the survey folder)
    synthetic_data: str = 'synthetic_data_raw.csv'
    # File name for the aggregated scores (within the survey folder)
    aggregate_score: str = 'aggregate_scores.csv'


paths = Paths()

### Import raw data

In [3]:
# Read the raw synthetic responses from the survey data folder, then preview
# the first few rows
raw_data_path = os.path.join(paths.survey, paths.synthetic_data)
data = pd.read_csv(raw_data_path)
data.head()

Unnamed: 0,gender,transgender,sexual_orientation,neurodivergent,birth_parent1,birth_parent2,birth_you,birth_you_age,autonomy_pressure,autonomy_express,...,peer_talk_listen_lab,peer_talk_helpful_lab,peer_talk_if_lab,accept_peer_lab,year_group_lab,fsm_lab,sen_lab,ethnicity_lab,english_additional_lab,school_lab
0,4.0,2.0,6.0,3.0,2.0,1.0,2.0,1.0,2.0,5.0,...,Fully,Somewhat helpful,Very uncomfortable,Not at all,10.0,No,No,ethnic_minority,No,School E
1,1.0,2.0,1.0,3.0,3.0,2.0,,,4.0,2.0,...,,Very helpful,Very uncomfortable,Slightly,10.0,No,No,ethnic_minority,No,School D
2,2.0,,4.0,1.0,1.0,1.0,1.0,1.0,5.0,4.0,...,Mostly,Very helpful,Very comfortable,Not at all,10.0,No,No,white_british,No,School E
3,2.0,5.0,5.0,,2.0,2.0,1.0,3.0,1.0,2.0,...,Fully,Somewhat helpful,Uncomfortable,,10.0,No,No,white_british,No,School G
4,5.0,,4.0,1.0,1.0,3.0,3.0,2.0,5.0,2.0,...,Slightly,Somewhat helpful,Uncomfortable,Not at all,8.0,No,No,white_british,Yes,School B


## Find mean score per school

Find mean score by school:
* Overall
* By year
* By gender (this is the gender from the survey, not from the council, as can be seen from the GM data extract)
* By FSM
* By SEN

Hide results when count < 10.

<mark>Does this mean you always have all the categories? Or is there an issue here: because it uses groupby, if only one group is present, the other group is not shown as NaN and results are only provided for the one group that is present?</mark>

In [4]:
# Names of the columns holding a score, identified by their '_score' suffix
# (iterating over a dataframe yields its column labels)
score_col = [name for name in data if name.endswith('_score')]


def aggregate(calc, group_col, *, df=None, scores=None):
    '''
    Aggregate the score columns of a dataframe, using provided group, either
    finding the number of non-NaN or the mean (ignoring NaN)
    Inputs:
    - calc: 'mean' or 'count'
    - group_col: list of columns to groupby
    - df: dataframe to aggregate (optional, keyword-only) - if not provided,
      the notebook-level dataframe `data` is used
    - scores: list of score columns to aggregate (optional, keyword-only) -
      if not provided, the notebook-level list `score_col` is used
    Output:
    - result: long-format dataframe with one row for each score + group. The
      score name is in the column 'variable' and the mean or count is in a
      column named after calc
    Raises:
    - ValueError: if calc is not 'mean' or 'count'
    '''
    # Reject unsupported calculations up front, rather than failing further
    # down with an UnboundLocalError because no result was ever assigned
    if calc not in ('mean', 'count'):
        raise ValueError(f"calc must be 'mean' or 'count', not {calc!r}")

    # Fall back to the notebook-level dataframe and score columns
    if df is None:
        df = data
    if scores is None:
        scores = score_col

    # Keep only the grouping and score columns, then group
    grouped = df[group_col + scores].groupby(group_col)

    if calc == 'mean':
        # Find the mean of each score, ignoring NaN
        result = grouped.mean()
    else:
        # Count non-NaN, so know what was used in mean, as there are pupils
        # missing data for that score or characteristic (e.g. year group)
        result = grouped.count()

    # Transform from wide to long
    result = pd.melt(result.reset_index(), id_vars=group_col)

    # Rename the value column to the calculation used
    result = result.rename(columns={'value': calc})

    return result


def mean_and_count(group_col, min_count=10):
    '''
    Uses the aggregate() function to find the mean and counts for each score
    column when grouped by the provided columns, hiding results based on too
    few responses
    Inputs:
    - group_col: list of columns to groupby
    - min_count: smallest count for which results are shown (optional,
      default 10) - the mean and count are set to NaN when count is below this
    Outputs:
    - result: dataframe with mean and count for each score + group as each row
    '''
    # Find mean and count
    res_mean = aggregate('mean', group_col)
    res_count = aggregate('count', group_col)

    # Combine the results dataframes, matching explicitly on the group columns
    # and the score name ('variable', from the melt in aggregate()) - using
    # outer (all ID) rather than inner (only matching ID) - although all
    # columns should match
    result = pd.merge(left=res_mean, right=res_count,
                      on=group_col + ['variable'], how='outer')

    # Hide results when n < min_count. Series.mask is used rather than
    # assigning NaN through .loc, as writing NaN into the integer count column
    # in place relies on an implicit int -> float upcast, which recent pandas
    # releases deprecate for in-place assignment
    too_few = result['count'] < min_count
    for col in ('mean', 'count'):
        result[col] = result[col].mask(too_few)

    return result

In [5]:
# Characteristics to break each school's results down by
lab_col = ['year_group_lab', 'gender_lab', 'fsm_lab', 'sen_lab']

# Aggregate for each school overall, then for each school split by each
# characteristic in turn
groupings = [['school_lab']] + [['school_lab', lab] for lab in lab_col]
agg = pd.concat([mean_and_count(group) for group in groupings])

# Rows from a grouping that did not use a characteristic have NaN in that
# column after concatenation, so label those as All
agg[lab_col] = agg[lab_col].fillna('All')

# View result
agg

Unnamed: 0,school_lab,variable,mean,count,year_group_lab,gender_lab,fsm_lab,sen_lab
0,School A,birth_you_age_score,8.235849,106.0,All,All,All,All
1,School B,birth_you_age_score,7.815789,114.0,All,All,All,All
2,School C,birth_you_age_score,8.755556,90.0,All,All,All,All
3,School D,birth_you_age_score,8.005263,95.0,All,All,All,All
4,School E,birth_you_age_score,7.582353,85.0,All,All,All,All
...,...,...,...,...,...,...,...,...
443,School E,bully_score,7.840909,44.0,All,All,All,Yes
444,School F,bully_score,7.162162,37.0,All,All,All,No
445,School F,bully_score,7.800000,40.0,All,All,All,Yes
446,School G,bully_score,7.700000,40.0,All,All,All,No


## Save result

In [6]:
# Write the aggregated scores to the survey data folder, without the index
aggregate_path = os.path.join(paths.survey, paths.aggregate_score)
agg.to_csv(aggregate_path, index=False)