## Create RAG ratings for the aggregate data

## Set-up

### Packages and file paths

In [1]:
# Import required packages
from dataclasses import dataclass
import math
import numpy as np
import os
import pandas as pd
#from statsmodels.stats.weightstats import DescrStatsW

# Set display options
pd.set_option('display.max_rows', 100)

In [2]:
# File paths
@dataclass(frozen=True)
class Paths:
    '''Paths to data and files'''
    survey = '../data/survey_data'
    aggregate = 'aggregate_scores.csv'
    rag = 'aggregate_scores_rag.csv'


paths = Paths()

## Import aggregate scores

In [3]:
data = pd.read_csv(os.path.join(paths.survey, paths.aggregate))
data.head()

Unnamed: 0,school_lab,variable,mean,count,year_group_lab,gender_lab,fsm_lab,sen_lab
0,School A,birth_you_age_score,7.861111,108.0,All,All,All,All
1,School B,birth_you_age_score,7.9,110.0,All,All,All,All
2,School C,birth_you_age_score,8.360465,86.0,All,All,All,All
3,School D,birth_you_age_score,8.152174,92.0,All,All,All,All
4,School E,birth_you_age_score,7.652174,92.0,All,All,All,All


## Find weighted mean and SD within each group

In [4]:
def descriptives(values, counts):
    '''
    Calculates:
    * Total sample size (across the schools)
    * Total number of groups without NaN (e.g. schools, areas) used in calc
    * Weighted average of the means
    * Standard deviation of the means
    This normalises weights so they sum 1 (and so they can't all be 0).
    It returns the biased variance and is like a weighted version of np.std().
    For small samples, may want to alter to unbiased variance.
    Based on: https://stackoverflow.com/questions/2413522/weighted-standard-deviation-in-numpy
    Inputs:
    - values - series, to calculate mean and std from
    - counts - series, number of students, used to weight calculations
    Output:
    - result - series with each of the calculations (index is name of calc)
    '''
    # Total sample size
    n_pupils = counts.sum(skipna=True)

    # Total number of groups used in other calc - count number of non-NaN rows
    n_groups = counts.count()

    # Weighted mean
    average = np.average(values, weights=counts)

    # Weighted std
    variance = np.average((values-average)**2, weights=counts)
    std = math.sqrt(variance)

    # Combine into a series
    result = pd.Series(
        [n_pupils, n_groups, average, std],
        index=['total_pupils', 'group_n', 'group_wt_mean', 'group_wt_std'])
    return(result)

In [5]:
groups = ['variable', 'year_group_lab', 'gender_lab', 'fsm_lab', 'sen_lab']
wt_mean = (data
           .groupby(groups)
           .apply(lambda x: descriptives(x['mean'], x['count']))
           .reset_index())
wt_mean

Unnamed: 0,variable,year_group_lab,gender_lab,fsm_lab,sen_lab,total_pupils,group_n,group_wt_mean,group_wt_std
0,accept_score,All,All,All,All,548.0,7.0,10.131387,0.268278
1,accept_score,All,All,All,Non-SEN,286.0,7.0,10.083916,0.386948
2,accept_score,All,All,All,SEN,204.0,6.0,10.299020,0.374254
3,accept_score,All,All,FSM,All,272.0,7.0,10.044118,0.392501
4,accept_score,All,All,Non-FSM,All,243.0,7.0,10.218107,0.320308
...,...,...,...,...,...,...,...,...,...
437,wellbeing_score,All,I describe myself in another way,All,All,55.0,5.0,,
438,wellbeing_score,All,Non-binary,All,All,21.0,2.0,,
439,wellbeing_score,All,Prefer not to say,All,All,44.0,4.0,,
440,wellbeing_score,Year 10,All,All,All,182.0,6.0,20.516484,0.825283


In [6]:
rag = pd.merge(data, wt_mean, how='left', on=groups)
rag

Unnamed: 0,school_lab,variable,mean,count,year_group_lab,gender_lab,fsm_lab,sen_lab,total_pupils,group_n,group_wt_mean,group_wt_std
0,School A,birth_you_age_score,7.861111,108.0,All,All,All,All,666.0,7.0,7.912913,0.226426
1,School B,birth_you_age_score,7.900000,110.0,All,All,All,All,666.0,7.0,7.912913,0.226426
2,School C,birth_you_age_score,8.360465,86.0,All,All,All,All,666.0,7.0,7.912913,0.226426
3,School D,birth_you_age_score,8.152174,92.0,All,All,All,All,666.0,7.0,7.912913,0.226426
4,School E,birth_you_age_score,7.652174,92.0,All,All,All,All,666.0,7.0,7.912913,0.226426
...,...,...,...,...,...,...,...,...,...,...,...,...
3021,School E,overall_count,,43.0,All,All,All,SEN,257.0,6.0,,
3022,School F,overall_count,,45.0,All,All,All,Non-SEN,385.0,7.0,,
3023,School F,overall_count,,41.0,All,All,All,SEN,257.0,6.0,,
3024,School G,overall_count,,47.0,All,All,All,Non-SEN,385.0,7.0,,


## Create RAG column based on whether 1SD above or below

<mark>remember the reverse scored things</mark>

In [7]:
# Find 1 SD above and below mean
rag['lower'] = rag['group_wt_mean'] - rag['group_wt_std']
rag['upper'] = rag['group_wt_mean'] + rag['group_wt_std']

In [8]:
# Create RAG column
conditions = [(rag['mean'] <= rag['lower']),
              (rag['mean'] > rag['lower']) & (rag['mean'] < rag['upper']),
              (rag['mean'] >= rag['upper'])]
choices = ['below', 'average', 'above']
rag.loc[:, 'rag'] = np.select(conditions, choices, default=np.nan)

Show some examples

In [9]:
test = rag[
    (rag['variable'] == 'autonomy_score') &
    (rag['year_group_lab'] == 'All') &
    (rag['gender_lab'] == 'All') &
    (rag['fsm_lab'] == 'All') &
    (rag['sen_lab'] == 'All')]
test

Unnamed: 0,school_lab,variable,mean,count,year_group_lab,gender_lab,fsm_lab,sen_lab,total_pupils,group_n,group_wt_mean,group_wt_std,lower,upper,rag
7,School A,autonomy_score,17.571429,77.0,All,All,All,All,451.0,7.0,17.873614,0.23801,17.635605,18.111624,below
8,School B,autonomy_score,18.059701,67.0,All,All,All,All,451.0,7.0,17.873614,0.23801,17.635605,18.111624,average
9,School C,autonomy_score,18.320755,53.0,All,All,All,All,451.0,7.0,17.873614,0.23801,17.635605,18.111624,above
10,School D,autonomy_score,17.647887,71.0,All,All,All,All,451.0,7.0,17.873614,0.23801,17.635605,18.111624,average
11,School E,autonomy_score,18.015873,63.0,All,All,All,All,451.0,7.0,17.873614,0.23801,17.635605,18.111624,average
12,School F,autonomy_score,17.759259,54.0,All,All,All,All,451.0,7.0,17.873614,0.23801,17.635605,18.111624,average
13,School G,autonomy_score,17.878788,66.0,All,All,All,All,451.0,7.0,17.873614,0.23801,17.635605,18.111624,average


In [10]:
test = data[
    (data['variable'] == 'accept_score') &
    (data['year_group_lab'] == '10.0') &
    (data['gender_lab'] == 'All') &
    (data['fsm_lab'] == 'All') &
    (data['sen_lab'] == 'All')]
test

Unnamed: 0,school_lab,variable,mean,count,year_group_lab,gender_lab,fsm_lab,sen_lab


In [11]:
test = rag[
    (rag['variable'] == 'accept_score') &
    (rag['year_group_lab'] == '10.0') &
    (rag['gender_lab'] == 'All') &
    (rag['fsm_lab'] == 'All') &
    (rag['sen_lab'] == 'All')]
test

Unnamed: 0,school_lab,variable,mean,count,year_group_lab,gender_lab,fsm_lab,sen_lab,total_pupils,group_n,group_wt_mean,group_wt_std,lower,upper,rag


<mark>Can see that, currently, the aggregate_data script has missing rows (as school F had no year 10, it hasn't calculated, but we want it to calculate and say NaN)... or is it ok as is?</mark>

## Set to NaN if not relevant

In [12]:
# Birth you age score is just average birth age and not needed as a "score"
# Overall count is the count of pupils in each group (not for specific variable)
mask = rag['variable'].isin(['birth_you_age_score', 'overall_count'])
nan_col = ['group_n', 'group_wt_mean', 'group_wt_std', 'lower', 'upper', 'rag']
rag.loc[mask, nan_col] = np.nan
rag[mask]

Unnamed: 0,school_lab,variable,mean,count,year_group_lab,gender_lab,fsm_lab,sen_lab,total_pupils,group_n,group_wt_mean,group_wt_std,lower,upper,rag
0,School A,birth_you_age_score,7.861111,108.0,All,All,All,All,666.0,,,,,,
1,School B,birth_you_age_score,7.900000,110.0,All,All,All,All,666.0,,,,,,
2,School C,birth_you_age_score,8.360465,86.0,All,All,All,All,666.0,,,,,,
3,School D,birth_you_age_score,8.152174,92.0,All,All,All,All,666.0,,,,,,
4,School E,birth_you_age_score,7.652174,92.0,All,All,All,All,666.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3021,School E,overall_count,,43.0,All,All,All,SEN,257.0,,,,,,
3022,School F,overall_count,,45.0,All,All,All,Non-SEN,385.0,,,,,,
3023,School F,overall_count,,41.0,All,All,All,SEN,257.0,,,,,,
3024,School G,overall_count,,47.0,All,All,All,Non-SEN,385.0,,,,,,


## Add labels

In [13]:
var_lab = {
    'autonomy_score': 'Autonomy',
    'life_satisfaction_score': 'Life satisfaction',
    'optimism_score': 'Optimism',
    'wellbeing_score': 'Psychological wellbeing',
    'esteem_score': 'Self-esteem',
    'stress_score': 'Stress and coping',
    'appearance_score': 'Feelings around appearance',
    'negative_score': 'Negative affect',
    'lonely_score': 'Loneliness',
    'support_score': 'Supporting own wellbeing',
    'sleep_score': 'Sleep',
    'physical_score': 'Physical activity',
    'free_like_score': 'Free time',
    'media_score': 'Social media use',
    'places_score': 'Places to go and things to do',
    'talk_score': 'Talking about feelings',
    'accept_score': 'Acceptance',
    'school_belong_score': 'School connection',
    'staff_relationship_score': 'Support from staff',
    'home_relationship_score': 'Support from parents/carers',
    'home_happy_score': 'Home environment',
    'local_env_score': 'Local environment',
    'discrim_score': 'Discrimination',
    'belong_local_score': 'Local connection',
    'wealth_score': 'Relative wealth',
    'future_score': 'Future opportunities',
    'climate_score': 'Climate change',
    'social_score': 'Support from friends',
    'bully_score': 'Bullying'
}

In [14]:
# Add label column
rag['variable_lab'] = rag['variable'].map(var_lab)

# View the labels
rag[['variable', 'variable_lab']].drop_duplicates()

Unnamed: 0,variable,variable_lab
0,birth_you_age_score,
7,autonomy_score,Autonomy
14,life_satisfaction_score,Life satisfaction
21,optimism_score,Optimism
28,wellbeing_score,Psychological wellbeing
35,esteem_score,Self-esteem
42,stress_score,Stress and coping
49,appearance_score,Feelings around appearance
56,negative_score,Negative affect
63,lonely_score,Loneliness


## Add descriptions

In [15]:
describe = {
    'autonomy_score': '''How 'in control' young people feel about their lives''',
    'life_satisfaction_score': 'How satisfied young people feel with their life',
    'optimism_score': '''Young people's hopefulness and confidence about the future''',
    'wellbeing_score': 'Extent to which young people feel positive and generally happy with life',
    'esteem_score': 'Extent to which young people value themselves',
    'stress_score': 'Managing stress levels and coping with difficulties',
    'appearance_score': '''Young people's feelings around the way that they look''',
    'negative_score': 'Frequency with which young people experience emotional difficulties',
    'lonely_score': 'How often young people feel lonely',
    'support_score': 'Knowledge about supporting wellbeing and where to look for advice',
    'sleep_score': 'Whether amount of sleep is enough to feel awake and concentrate at school',
    'physical_score': 'How often young people are physically active and for how long',
    'free_like_score': 'How often young people can do things that they like in their free time',
    'media_score': 'Time spent on social media',
    'places_score': '''Whether young people feel there are places to go and things to do in their free time''',
    'talk_score': '''How positively/negatively young people feel about talking with others about feeling down''',
    'accept_score': 'Whether young people feel accepted by different groups of people in their life',
    'school_belong_score': 'Feelings of belonging at school',
    'staff_relationship_score': 'Support received from adults at school',
    'home_relationship_score': 'Support received from adults at home',
    'home_happy_score': 'How happy young people are with the home they live in',
    'local_env_score': 'How young people feel about the area where they live',
    'discrim_score': 'Whether young people feel discriminated against',
    'belong_local_score': '''Young people's feelings of belonging in their local area''',
    'wealth_score': 'Whether young people feel their family is richer, poorer or the same as their friends',
    'future_score': 'How young people feel about the future options for work, education or training in their local area',
    'climate_score': 'Worry about the impact of climate change',
    'social_score': 'Support young people receive from their peers',
    'bully_score': 'Frequency with which young people experience different types of bullying'
}

In [16]:
rag['description'] = rag['variable'].map(describe)
rag[['variable_lab', 'description']].drop_duplicates()

Unnamed: 0,variable_lab,description
0,,
7,Autonomy,How 'in control' young people feel about their...
14,Life satisfaction,How satisfied young people feel with their life
21,Optimism,Young people's hopefulness and confidence abou...
28,Psychological wellbeing,Extent to which young people feel positive and...
35,Self-esteem,Extent to which young people value themselves
42,Stress and coping,Managing stress levels and coping with difficu...
49,Feelings around appearance,Young people's feelings around the way that th...
56,Negative affect,Frequency with which young people experience e...
63,Loneliness,How often young people feel lonely


## Save as csv file

In [17]:
rag.to_csv(os.path.join(paths.survey, paths.rag), index=False)