# Aggregate data on scores

Aggregate the synthetic person-level responses to find the mean scores.

## Set-up

### Packages and file paths

In [1]:
# Import required packages
from dataclasses import dataclass
from kailo_beewell_dashboard.synthesise_aggregate import (
    aggregate_scores, results_by_site_and_group)
from kailo_beewell_dashboard.synthesise_scores import create_rag_ratings
from kailo_beewell_dashboard.topic_labels import (
    topic_description_dict, topic_name_dict)
import numpy as np
import os
import pandas as pd

In [2]:
# File paths
@dataclass(frozen=True)
class Paths:
    '''Immutable container for the data folder and file names used below.

    Every attribute carries a type annotation so that it is registered as a
    dataclass field. Un-annotated class attributes are ignored by
    ``@dataclass``, which would leave the generated ``__init__``,
    ``__repr__`` and ``__eq__`` with no fields to work with.
    '''
    # Folder that the CSV files below are read from and written to
    survey: str = '../data/survey_data'
    # Input file: the raw synthetic responses loaded at the start
    synthetic_data: str = 'standard_synthetic_data_raw.csv'
    # Output file: the aggregate scores with RAG ratings saved at the end
    aggregate_score: str = 'standard_school_aggregate_scores_rag.csv'


paths = Paths()

### Import raw data

In [3]:
# Read the raw synthetic responses from the survey data folder and preview
# the first few rows
raw_data_path = os.path.join(paths.survey, paths.synthetic_data)
data = pd.read_csv(raw_data_path)
data.head()

Unnamed: 0,gender,transgender,sexual_orientation,neurodivergent,birth_parent1,birth_parent2,birth_you,birth_you_age,autonomy_pressure,autonomy_express,...,peer_talk_listen_lab,peer_talk_helpful_lab,peer_talk_if_lab,accept_peer_lab,year_group_lab,fsm_lab,sen_lab,ethnicity_lab,english_additional_lab,school_lab
0,4.0,2.0,6.0,3.0,2.0,,2.0,1.0,2.0,5.0,...,Fully,Somewhat helpful,,Not at all,Year 10,Non-FSM,Non-SEN,Ethnic minority,,School E
1,,2.0,1.0,3.0,3.0,2.0,3.0,8.0,4.0,2.0,...,,,Very uncomfortable,Slightly,Year 10,Non-FSM,Non-SEN,Ethnic minority,No,School D
2,2.0,3.0,4.0,1.0,1.0,1.0,1.0,1.0,5.0,4.0,...,,,Very comfortable,Not at all,Year 10,Non-FSM,Non-SEN,White British,No,School E
3,2.0,5.0,5.0,,2.0,2.0,,3.0,1.0,2.0,...,,,Uncomfortable,,Year 10,Non-FSM,Non-SEN,White British,No,School G
4,5.0,3.0,4.0,1.0,1.0,3.0,3.0,2.0,5.0,2.0,...,Slightly,Somewhat helpful,,Not at all,,Non-FSM,Non-SEN,White British,Yes,School B


## Find mean score by school and pupil group

Find mean score by school:
* Overall
* By year
* By gender (this is currently the gender reported in the survey, not the gender from the council, as can be seen from the GM data extract)
* By FSM
* By SEN

Hide results when count < 10.

In [4]:
# Template of "empty" results: take the aggregated output for the full
# dataset, then blank out every mean (NaN) and zero every count. It is passed
# on for use when a school has no pupils in a particular subgroup (i.e. no-one
# in a certain FSM/SEN/gender/year group)
no_pupils = aggregate_scores(data).assign(**{'mean': np.nan, 'count': 0})
no_pupils.head()

Unnamed: 0,variable,mean,count
0,autonomy_score,,0
1,life_satisfaction_score,,0
2,optimism_score,,0
3,wellbeing_score,,0
4,esteem_score,,0


In [5]:
# Run the aggregation across the schools and pupil groups, supplying the
# `no_pupils` template prepared earlier
agg = results_by_site_and_group(
    data=data,
    agg_func=aggregate_scores,
    no_pupils=no_pupils,
)

# Suppress small groups: where fewer than 10 responses were counted, blank
# out both the mean and the count itself
too_few = agg['count'].lt(10)
agg.loc[too_few, ['mean', 'count']] = np.nan

agg.head()

Unnamed: 0,variable,mean,count,school_lab,year_group_lab,gender_lab,fsm_lab,sen_lab
0,autonomy_score,17.184211,76.0,School A,All,All,All,All
1,life_satisfaction_score,5.05042,119.0,School A,All,All,All,All
2,optimism_score,12.035714,112.0,School A,All,All,All,All
3,wellbeing_score,21.215054,93.0,School A,All,All,All,All
4,esteem_score,12.546667,75.0,School A,All,All,All,All


## Add RAG ratings

In [6]:
# Derive RAG ratings from the aggregated scores, then preview the first
# five rows of the result
rag = create_rag_ratings(agg)
rag.head(n=5)

Unnamed: 0,variable,mean,count,school_lab,year_group_lab,gender_lab,fsm_lab,sen_lab,total_pupils,group_n,group_wt_mean,group_wt_std,lower,upper,rag
0,autonomy_score,17.184211,76.0,School A,All,All,All,All,490.0,7.0,17.808163,0.439816,17.368347,18.247979,below
1,life_satisfaction_score,5.05042,119.0,School A,All,All,All,All,725.0,7.0,5.06069,0.339266,4.721424,5.399955,average
2,optimism_score,12.035714,112.0,School A,All,All,All,All,731.0,7.0,12.032832,0.260359,11.772473,12.293191,average
3,wellbeing_score,21.215054,93.0,School A,All,All,All,All,547.0,7.0,20.934186,0.507017,20.427169,21.441204,average
4,esteem_score,12.546667,75.0,School A,All,All,All,All,481.0,7.0,12.773389,0.406382,12.367007,13.179771,average


## Add names and descriptions for the topics

For example, "Autonomy" for "autonomy_score".

In [7]:
# Look up a display name for each score variable and store it as a new column
topic_names = rag['variable'].map(topic_name_dict)
rag['variable_lab'] = topic_names

# Show each distinct variable / label pairing
rag.loc[:, ['variable', 'variable_lab']].drop_duplicates()

Unnamed: 0,variable,variable_lab
0,autonomy_score,Autonomy
1,life_satisfaction_score,Life satisfaction
2,optimism_score,Optimism
3,wellbeing_score,Psychological wellbeing
4,esteem_score,Self-esteem
5,stress_score,Stress and coping
6,appearance_score,Feelings around appearance
7,negative_score,Negative affect
8,lonely_score,Loneliness
9,support_score,Supporting own wellbeing


In [8]:
# Look up a description for each score variable and store it as a new column
topic_descriptions = rag['variable'].map(topic_description_dict)
rag['description'] = topic_descriptions

# Show each distinct topic label alongside its description
rag.loc[:, ['variable_lab', 'description']].drop_duplicates()

Unnamed: 0,variable_lab,description
0,Autonomy,\nHow 'in control' young people feel of their ...
1,Life satisfaction,\nHow satisfied young people feel with their life
2,Optimism,\nYoung people's hopefulness and confidence fo...
3,Psychological wellbeing,\nHow positive and generally happy young peopl...
4,Self-esteem,\nHow much young people value themselves
5,Stress and coping,\nManaging stress levels and coping with diffi...
6,Feelings around appearance,\nYoung people's feelings around the way that ...
7,Negative affect,\nThe frequency with which young people experi...
8,Loneliness,\nHow often young people feel lonely
9,Supporting own wellbeing,\nYoung people's knowledge on supporting thems...


## Save results

In [9]:
# Write the results to the survey data folder, leaving out the index and
# writing missing values as the string 'NULL'
output_path = os.path.join(paths.survey, paths.aggregate_score)
rag.to_csv(output_path, index=False, na_rep='NULL')