## Create RAG ratings for the aggregate data

## Set-up

### Packages and file paths

In [1]:
# Import required packages
from dataclasses import dataclass
import math
import numpy as np
import os
import pandas as pd
#from statsmodels.stats.weightstats import DescrStatsW

# Set display options: allow up to 100 rows to be shown when a dataframe or
# series is displayed
pd.set_option('display.max_rows', 100)

In [2]:
# File paths
@dataclass(frozen=True)
class Paths:
    '''
    Folder and file names used by this notebook.

    The attributes have no type annotations, so they are plain class
    attributes (shared constants) rather than dataclass fields.
    '''
    # Folder containing the survey data files (relative path)
    survey = '../data/survey_data'
    # File name of the aggregate scores that are read in
    aggregate = 'aggregate_scores.csv'
    # File name that the scores with RAG ratings are saved to
    rag = 'aggregate_scores_rag.csv'


# Single instance used to look up paths throughout the notebook
paths = Paths()

## Import aggregate scores

In [3]:
# Build the path to the aggregate scores file, then read it in
aggregate_path = os.path.join(paths.survey, paths.aggregate)
data = pd.read_csv(aggregate_path)

# Look at the last few rows
data.tail()

Unnamed: 0,variable,mean,count,school_lab,year_group_lab,gender_lab,fsm_lab,sen_lab
2011,wealth_score,0.3,30.0,School G,All,All,All,Non-SEN
2012,future_score,7.461538,13.0,School G,All,All,All,Non-SEN
2013,climate_score,2.27451,51.0,School G,All,All,All,Non-SEN
2014,social_score,11.891892,37.0,School G,All,All,All,Non-SEN
2015,bully_score,7.3,50.0,School G,All,All,All,Non-SEN


## Find weighted mean and SD within each group

Define a function for calculating the weighted mean and weighted standard deviation.

In [4]:
def descriptives(values, counts):
    '''
    Summarise a set of group-level mean scores, weighting by group size.

    Each group contributes its mean score, weighted by the number of pupils
    in that group. The function returns the weighted mean and weighted
    standard deviation across groups, together with the total number of
    pupils and the number of groups.

    Notes on the weighted standard deviation:
    np.average() normalises the weights so that they sum to 1 (which means
    the weights cannot all be 0). The variance is the biased (population)
    variance, i.e. a weighted analogue of np.std(). For small samples, an
    unbiased estimate may be preferable.
    Based on: https://stackoverflow.com/questions/2413522/weighted-standard-deviation-in-numpy

    Parameters
    ----------
    values : pandas Series
        Mean score in each group. Must not contain NaN.
    counts : pandas Series
        Number of pupils in each group. Must not contain NaN.

    Returns
    -------
    pandas Series
        The results, indexed by the name of each calculation:
        'total_pupils', 'group_n', 'group_wt_mean' and 'group_wt_std'.

    Raises
    ------
    ValueError
        If either values or counts contains NaN.
    '''
    # Refuse to continue if either input has missing entries (values is
    # checked first, then counts)
    for label, column in (('values', values), ('counts', counts)):
        if column.isnull().any():
            raise ValueError(f'There must be no NaN in the {label} column.')

    # Mean of the group means, weighted by pupils per group
    wt_mean = np.average(values, weights=counts)

    # Weighted average of squared deviations from that mean, then its root
    squared_deviation = (values - wt_mean) ** 2
    wt_std = math.sqrt(np.average(squared_deviation, weights=counts))

    # Label each result; also record total pupils and number of groups
    return pd.Series({
        'total_pupils': counts.sum(skipna=True),
        'group_n': counts.count(),
        'group_wt_mean': wt_mean,
        'group_wt_std': wt_std})

Apply function to each of the groups.

In [5]:
# Define the variables to simultaneously group by (remember, as this is working
# on the aggregated data, it will just have possible combinations between
# the filters of "All" for every characteristic, or "All" for every except one)
groups = ['variable', 'year_group_lab', 'gender_lab', 'fsm_lab', 'sen_lab']

# Keep only rows where both the mean and the count are present (rows missing
# either can't be used in the calculation, and descriptives() rejects NaN)
non_nan = data.dropna(subset=['mean', 'count'])

# Group by variable and all the characteristics, then calculate the weighted
# descriptives across the schools in each group. Only the 'mean' and 'count'
# columns are passed to apply(): applying to the whole frame operates on the
# grouping columns too, which is deprecated in recent versions of pandas.
wt_mean = (non_nan
           .groupby(groups)[['mean', 'count']]
           .apply(lambda x: descriptives(x['mean'], x['count']))
           .reset_index())

# Preview the dataframe
display(wt_mean.head())
display(wt_mean.tail())

Unnamed: 0,variable,year_group_lab,gender_lab,fsm_lab,sen_lab,total_pupils,group_n,group_wt_mean,group_wt_std
0,accept_score,All,All,All,All,693.0,7.0,10.049062,0.258167
1,accept_score,All,All,All,Non-SEN,381.0,7.0,10.023622,0.412174
2,accept_score,All,All,All,SEN,278.0,6.0,10.021583,0.300102
3,accept_score,All,All,FSM,All,338.0,7.0,9.973373,0.409694
4,accept_score,All,All,Non-FSM,All,280.0,7.0,10.064286,0.226055


Unnamed: 0,variable,year_group_lab,gender_lab,fsm_lab,sen_lab,total_pupils,group_n,group_wt_mean,group_wt_std
281,wellbeing_score,All,All,Non-FSM,All,210.0,7.0,21.480952,0.662708
282,wellbeing_score,All,Boy,All,All,91.0,6.0,21.406593,0.722824
283,wellbeing_score,All,Girl,All,All,55.0,5.0,20.909091,0.968527
284,wellbeing_score,Year 10,All,All,All,225.0,6.0,21.106667,0.690624
285,wellbeing_score,Year 8,All,All,All,282.0,7.0,21.163121,0.65511


Add the calculated weighted means and SDs back to the school-level results.

In [6]:
# Attach the group-level weighted mean and SD to every school-level row that
# shares the same variable and pupil characteristics. A left join keeps all
# rows of the school-level data, including those with no matching group.
rag = data.merge(wt_mean, how='left', on=groups)

# Look at the first and last rows of the result
display(rag.head())
display(rag.tail())

Unnamed: 0,variable,mean,count,school_lab,year_group_lab,gender_lab,fsm_lab,sen_lab,total_pupils,group_n,group_wt_mean,group_wt_std
0,autonomy_score,17.554054,74.0,School A,All,All,All,All,493.0,7.0,17.918864,0.437488
1,life_satisfaction_score,5.113043,115.0,School A,All,All,All,All,725.0,7.0,5.09931,0.308759
2,optimism_score,12.066116,121.0,School A,All,All,All,All,729.0,7.0,12.016461,0.260105
3,wellbeing_score,21.228261,92.0,School A,All,All,All,All,551.0,7.0,21.116152,0.295353
4,esteem_score,12.717949,78.0,School A,All,All,All,All,480.0,7.0,12.754167,0.288944


Unnamed: 0,variable,mean,count,school_lab,year_group_lab,gender_lab,fsm_lab,sen_lab,total_pupils,group_n,group_wt_mean,group_wt_std
2011,wealth_score,0.3,30.0,School G,All,All,All,Non-SEN,291.0,7.0,0.357388,0.079114
2012,future_score,7.461538,13.0,School G,All,All,All,Non-SEN,173.0,7.0,7.393064,0.301319
2013,climate_score,2.27451,51.0,School G,All,All,All,Non-SEN,421.0,7.0,2.503563,0.147426
2014,social_score,11.891892,37.0,School G,All,All,All,Non-SEN,283.0,7.0,11.833922,0.189659
2015,bully_score,7.3,50.0,School G,All,All,All,Non-SEN,419.0,7.0,7.610979,0.139296


## Create RAG column based on whether each school's mean is 1 SD above or below the weighted group mean

All scores are ordered in the positive direction so for every score, a higher result means you are "above average" and a "better outcome", and vice versa for lower scores.

In [7]:
# Thresholds for the rating: one weighted SD either side of the weighted mean
rag['lower'] = rag['group_wt_mean'].sub(rag['group_wt_std'])
rag['upper'] = rag['group_wt_mean'].add(rag['group_wt_std'])

In [8]:
# Create RAG column
# A school is 'below' if its mean is at or under the lower threshold, 'above'
# if it is at or over the upper threshold, and 'average' if strictly between.
# np.select() uses the first condition that matches.
# NOTE(review): if group_wt_std is 0 then lower == upper, so a school whose
# mean equals the group mean matches both the first and third conditions and
# is labelled 'below' - confirm this is intended.
conditions = [(rag['mean'] <= rag['lower']),
              (rag['mean'] > rag['lower']) & (rag['mean'] < rag['upper']),
              (rag['mean'] >= rag['upper'])]
choices = ['below', 'average', 'above']
# Rows that match no condition (comparisons involving NaN are False) are left
# missing. The default is None rather than np.nan: strings and a float have no
# common dtype, so np.nan raises a TypeError in recent versions of NumPy and
# is converted to the string 'nan' in older ones. None gives an object array
# with a genuine missing value.
rag['rag'] = np.select(conditions, choices, default=None)

Show some examples of the resulting RAG ratings.

In [9]:
# Display every school's row for two example groups, with the gender, FSM and
# SEN filters left as 'All' in both:
# 1. Autonomy score of all pupils at each school
# 2. Acceptance score of Year 10 pupils at each school
for topic, year_group in [('autonomy_score', 'All'),
                          ('accept_score', 'Year 10')]:
    test = rag[
        rag['variable'].eq(topic) &
        rag['year_group_lab'].eq(year_group) &
        rag['gender_lab'].eq('All') &
        rag['fsm_lab'].eq('All') &
        rag['sen_lab'].eq('All')]
    display(test)

Unnamed: 0,variable,mean,count,school_lab,year_group_lab,gender_lab,fsm_lab,sen_lab,total_pupils,group_n,group_wt_mean,group_wt_std,lower,upper,rag
0,autonomy_score,17.554054,74.0,School A,All,All,All,All,493.0,7.0,17.918864,0.437488,17.481376,18.356352,average
288,autonomy_score,18.298701,77.0,School B,All,All,All,All,493.0,7.0,17.918864,0.437488,17.481376,18.356352,average
576,autonomy_score,18.065789,76.0,School C,All,All,All,All,493.0,7.0,17.918864,0.437488,17.481376,18.356352,average
864,autonomy_score,17.815385,65.0,School D,All,All,All,All,493.0,7.0,17.918864,0.437488,17.481376,18.356352,average
1152,autonomy_score,18.074074,54.0,School E,All,All,All,All,493.0,7.0,17.918864,0.437488,17.481376,18.356352,average
1440,autonomy_score,17.157895,76.0,School F,All,All,All,All,493.0,7.0,17.918864,0.437488,17.481376,18.356352,below
1728,autonomy_score,18.521127,71.0,School G,All,All,All,All,493.0,7.0,17.918864,0.437488,17.481376,18.356352,above


Unnamed: 0,variable,mean,count,school_lab,year_group_lab,gender_lab,fsm_lab,sen_lab,total_pupils,group_n,group_wt_mean,group_wt_std,lower,upper,rag
83,accept_score,9.916667,48.0,School A,Year 10,All,All,All,277.0,6.0,9.981949,0.292584,9.689366,10.274533,average
371,accept_score,10.090909,44.0,School B,Year 10,All,All,All,277.0,6.0,9.981949,0.292584,9.689366,10.274533,average
659,accept_score,10.298246,57.0,School C,Year 10,All,All,All,277.0,6.0,9.981949,0.292584,9.689366,10.274533,above
947,accept_score,9.358974,39.0,School D,Year 10,All,All,All,277.0,6.0,9.981949,0.292584,9.689366,10.274533,below
1235,accept_score,9.888889,45.0,School E,Year 10,All,All,All,277.0,6.0,9.981949,0.292584,9.689366,10.274533,average
1523,accept_score,,,School F,Year 10,All,All,All,277.0,6.0,9.981949,0.292584,9.689366,10.274533,
1811,accept_score,10.181818,44.0,School G,Year 10,All,All,All,277.0,6.0,9.981949,0.292584,9.689366,10.274533,average


## Add topic names

For example, "Autonomy" for "autonomy_score".

In [10]:
# Define the labels for each topic: keys are the values found in the
# 'variable' column and values are the topic names to present
var_lab = {
    'autonomy_score': 'Autonomy',
    'life_satisfaction_score': 'Life satisfaction',
    'optimism_score': 'Optimism',
    'wellbeing_score': 'Psychological wellbeing',
    'esteem_score': 'Self-esteem',
    'stress_score': 'Stress and coping',
    'appearance_score': 'Feelings around appearance',
    'negative_score': 'Negative affect',
    'lonely_score': 'Loneliness',
    'support_score': 'Supporting own wellbeing',
    'sleep_score': 'Sleep',
    'physical_score': 'Physical activity',
    'free_like_score': 'Free time',
    'media_score': 'Social media use',
    'places_score': 'Places to go and things to do',
    'talk_score': 'Talking about feelings',
    'accept_score': 'Acceptance',
    'school_belong_score': 'School connection',
    'staff_relationship_score': 'Support from staff',
    'home_relationship_score': 'Support from parents/carers',
    'home_happy_score': 'Home environment',
    'local_env_score': 'Local environment',
    'discrim_score': 'Discrimination',
    'belong_local_score': 'Local connection',
    'wealth_score': 'Relative wealth',
    'future_score': 'Future opportunities',
    'climate_score': 'Climate change',
    'social_score': 'Support from friends',
    'bully_score': 'Bullying'
}

# Add label column by looking up each variable in the dictionary (a variable
# that is not a key in the dictionary gets NaN as its label)
rag['variable_lab'] = rag['variable'].map(var_lab)

# View the labels (one row per unique variable/label pair)
rag[['variable', 'variable_lab']].drop_duplicates()

Unnamed: 0,variable,variable_lab
0,autonomy_score,Autonomy
1,life_satisfaction_score,Life satisfaction
2,optimism_score,Optimism
3,wellbeing_score,Psychological wellbeing
4,esteem_score,Self-esteem
5,stress_score,Stress and coping
6,appearance_score,Feelings around appearance
7,negative_score,Negative affect
8,lonely_score,Loneliness
9,support_score,Supporting own wellbeing


## Add descriptions

In [11]:
# Create dictionary with the description to accompany each score: keys are
# the values found in the 'variable' column and values are a one-sentence
# explanation of what the score measures. Triple quotes are used where the
# text itself contains an apostrophe or is long.
describe = {
    'autonomy_score': '''How 'in control' young people feel of their life''',
    'life_satisfaction_score': 'How satisfied young people feel with their life',
    'optimism_score': '''Young people's hopefulness and confidence for the future''',
    'wellbeing_score': 'How positive and generally happy young people feel regarding their life',
    'esteem_score': 'How much young people value themselves',
    'stress_score': 'Managing stress levels and coping with difficulties',
    'appearance_score': '''Young people's feelings around the way that they look''',
    'negative_score': 'The frequency with which young people experience emotional difficulties',
    'lonely_score': 'How often young people feel lonely',
    'support_score': '''Young people's knowledge on supporting themselves and looking for advice''',
    'sleep_score': 'How much sleep young people get',
    'physical_score': 'How physically active young people are',
    'free_like_score': 'How often young people can do things that they like in their free time',
    'media_score': 'How much time young people spend on social media',
    'places_score': '''Whether young people feel there are places to go and things to do in their free time''',
    'talk_score': '''How positively/negatively young people feel about talking with others about feeling down''',
    'accept_score': 'Whether young people feel accepted by different groups of people in their life',
    'school_belong_score': 'Feelings of belonging at school',
    'staff_relationship_score': 'The support received from adults at school',
    'home_relationship_score': 'The support received from adults at home',
    'home_happy_score': '''Young people's feelings regarding the home that they live in''',
    'local_env_score': 'How young people feel regarding the area where they live',
    'discrim_score': 'Whether young people feel discriminated against',
    'belong_local_score': '''Young people's feelings of belonging in their local area''',
    'wealth_score': 'Whether young people feel their family is richer, poorer or the same as their friends',
    'future_score': 'How young people feel regarding the future options for work, education or training in their local area',
    'climate_score': 'Worries regarding climate change',
    'social_score': 'The support young people receive from their peers',
    'bully_score': 'The frequency with which young people experience different types of bullying'
}

# Add descriptions to dataframe by looking up each variable in the dictionary
# (a variable that is not a key in the dictionary gets NaN as its description)
rag['description'] = rag['variable'].map(describe)

# View each of the topics and their description (one row per unique pair)
rag[['variable_lab', 'description']].drop_duplicates()

Unnamed: 0,variable_lab,description
0,Autonomy,How 'in control' young people feel of their life
1,Life satisfaction,How satisfied young people feel with their life
2,Optimism,Young people's hopefulness and confidence for ...
3,Psychological wellbeing,How positive and generally happy young people ...
4,Self-esteem,How much young people value themselves
5,Stress and coping,Managing stress levels and coping with difficu...
6,Feelings around appearance,Young people's feelings around the way that th...
7,Negative affect,The frequency with which young people experien...
8,Loneliness,How often young people feel lonely
9,Supporting own wellbeing,Young people's knowledge on supporting themsel...


## Save as csv file

In [12]:
# Write the scores with their RAG ratings to the survey data folder, leaving
# out the dataframe index
rag_path = os.path.join(paths.survey, paths.rag)
rag.to_csv(rag_path, index=False)