## Create RAG ratings for the aggregate data

## Set-up

### Packages and file paths

In [1]:
# Import required packages
from dataclasses import dataclass
import math
import numpy as np
import os
import pandas as pd
#from statsmodels.stats.weightstats import DescrStatsW

# Set display options
pd.set_option('display.max_rows', 100)

In [2]:
# File paths
@dataclass(frozen=True)
class Paths:
    '''Paths to data and files'''
    survey = '../data/survey_data'
    aggregate = 'aggregate_scores.csv'
    rag = 'aggregate_scores_rag.csv'


paths = Paths()

## Import aggregate scores

In [3]:
data = pd.read_csv(os.path.join(paths.survey, paths.aggregate))
data.head()

Unnamed: 0,school_lab,variable,mean,count,year_group_lab,gender_lab,fsm_lab,sen_lab
0,School A,birth_you_age_score,7.793578,109.0,All,All,All,All
1,School B,birth_you_age_score,7.754545,110.0,All,All,All,All
2,School C,birth_you_age_score,8.202128,94.0,All,All,All,All
3,School D,birth_you_age_score,8.155172,87.0,All,All,All,All
4,School E,birth_you_age_score,7.574074,81.0,All,All,All,All


## Find weighted mean and SD within each group

In [4]:
def descriptives(values, counts):
    '''
    Calculates:
    * Total sample size (across the schools)
    * Total number of groups without NaN (e.g. schools, areas) used in calc
    * Weighted average of the means
    * Standard deviation of the means
    This normalises weights so they sum 1 (and so they can't all be 0).
    It returns the biased variance and is like a weighted version of np.std().
    For small samples, may want to alter to unbiased variance.
    Based on: https://stackoverflow.com/questions/2413522/weighted-standard-deviation-in-numpy
    Inputs:
    - values - series, to calculate mean and std from
    - counts - series, number of students, used to weight calculations
    Output:
    - result - series with each of the calculations (index is name of calc)
    '''
    # Total sample size
    n_pupils = counts.sum(skipna=True)

    # Total number of groups used in other calc - count number of non-NaN rows
    n_groups = counts.count()

    # Weighted mean
    average = np.average(values, weights=counts)

    # Weighted std
    variance = np.average((values-average)**2, weights=counts)
    std = math.sqrt(variance)

    # Combine into a series
    result = pd.Series(
        [n_pupils, n_groups, average, std],
        index=['total_pupils', 'group_n', 'group_wt_mean', 'group_wt_std'])
    return(result)

In [5]:
groups = ['variable', 'year_group_lab', 'gender_lab', 'fsm_lab', 'sen_lab']
wt_mean = (data
           .groupby(groups)
           .apply(lambda x: descriptives(x['mean'], x['count']))
           .reset_index())
wt_mean

Unnamed: 0,variable,year_group_lab,gender_lab,fsm_lab,sen_lab,total_pupils,group_n,group_wt_mean,group_wt_std
0,accept_score,10.0,All,All,All,237.0,6.0,10.088608,0.280805
1,accept_score,8.0,All,All,All,309.0,7.0,10.093851,0.363222
2,accept_score,All,All,All,All,558.0,7.0,10.096774,0.312259
3,accept_score,All,All,All,No,299.0,7.0,10.150502,0.345487
4,accept_score,All,All,All,Yes,200.0,6.0,9.970000,0.379847
...,...,...,...,...,...,...,...,...,...
437,wellbeing_score,All,Currently unsure,All,All,37.0,3.0,,
438,wellbeing_score,All,Girl,All,All,59.0,5.0,,
439,wellbeing_score,All,I describe myself in another way,All,All,33.0,3.0,,
440,wellbeing_score,All,Non-binary,All,All,11.0,1.0,,


In [6]:
rag = pd.merge(data, wt_mean, how='left', on=groups)
rag

Unnamed: 0,school_lab,variable,mean,count,year_group_lab,gender_lab,fsm_lab,sen_lab,total_pupils,group_n,group_wt_mean,group_wt_std
0,School A,birth_you_age_score,7.793578,109.0,All,All,All,All,666.0,7.0,7.902402,0.241061
1,School B,birth_you_age_score,7.754545,110.0,All,All,All,All,666.0,7.0,7.902402,0.241061
2,School C,birth_you_age_score,8.202128,94.0,All,All,All,All,666.0,7.0,7.902402,0.241061
3,School D,birth_you_age_score,8.155172,87.0,All,All,All,All,666.0,7.0,7.902402,0.241061
4,School E,birth_you_age_score,7.574074,81.0,All,All,All,All,666.0,7.0,7.902402,0.241061
...,...,...,...,...,...,...,...,...,...,...,...,...
3021,School E,overall_count,,40.0,All,All,All,Yes,263.0,6.0,,
3022,School F,overall_count,,42.0,All,All,All,No,379.0,7.0,,
3023,School F,overall_count,,40.0,All,All,All,Yes,263.0,6.0,,
3024,School G,overall_count,,49.0,All,All,All,No,379.0,7.0,,


## Create RAG column based on whether 1SD above or below

<mark>remember the reverse scored things</mark>

In [7]:
# Find 1 SD above and below mean
rag['lower'] = rag['group_wt_mean'] - rag['group_wt_std']
rag['upper'] = rag['group_wt_mean'] + rag['group_wt_std']

In [8]:
# Create RAG column
conditions = [(rag['mean'] <= rag['lower']),
              (rag['mean'] > rag['lower']) & (rag['mean'] < rag['upper']),
              (rag['mean'] >= rag['upper'])]
choices = ['below', 'average', 'above']
rag.loc[:, 'rag'] = np.select(conditions, choices, default=np.nan)

Show some examples

In [9]:
test = rag[
    (rag['variable'] == 'autonomy_score') &
    (rag['year_group_lab'] == 'All') &
    (rag['gender_lab'] == 'All') &
    (rag['fsm_lab'] == 'All') &
    (rag['sen_lab'] == 'All')]
test

Unnamed: 0,school_lab,variable,mean,count,year_group_lab,gender_lab,fsm_lab,sen_lab,total_pupils,group_n,group_wt_mean,group_wt_std,lower,upper,rag
7,School A,autonomy_score,18.191781,73.0,All,All,All,All,443.0,7.0,17.930023,0.489056,17.440967,18.419078,average
8,School B,autonomy_score,18.382353,68.0,All,All,All,All,443.0,7.0,17.930023,0.489056,17.440967,18.419078,average
9,School C,autonomy_score,18.066667,60.0,All,All,All,All,443.0,7.0,17.930023,0.489056,17.440967,18.419078,average
10,School D,autonomy_score,17.277778,54.0,All,All,All,All,443.0,7.0,17.930023,0.489056,17.440967,18.419078,below
11,School E,autonomy_score,18.0,62.0,All,All,All,All,443.0,7.0,17.930023,0.489056,17.440967,18.419078,average
12,School F,autonomy_score,17.032258,62.0,All,All,All,All,443.0,7.0,17.930023,0.489056,17.440967,18.419078,below
13,School G,autonomy_score,18.375,64.0,All,All,All,All,443.0,7.0,17.930023,0.489056,17.440967,18.419078,average


In [10]:
test = data[
    (data['variable'] == 'accept_score') &
    (data['year_group_lab'] == '10.0') &
    (data['gender_lab'] == 'All') &
    (data['fsm_lab'] == 'All') &
    (data['sen_lab'] == 'All')]
test

Unnamed: 0,school_lab,variable,mean,count,year_group_lab,gender_lab,fsm_lab,sen_lab
492,School A,accept_score,10.027778,36.0,10.0,All,All,All
494,School B,accept_score,9.914286,35.0,10.0,All,All,All
496,School C,accept_score,10.265306,49.0,10.0,All,All,All
498,School D,accept_score,9.627907,43.0,10.0,All,All,All
500,School E,accept_score,10.194444,36.0,10.0,All,All,All
503,School G,accept_score,10.5,38.0,10.0,All,All,All


In [11]:
test = rag[
    (rag['variable'] == 'accept_score') &
    (rag['year_group_lab'] == '10.0') &
    (rag['gender_lab'] == 'All') &
    (rag['fsm_lab'] == 'All') &
    (rag['sen_lab'] == 'All')]
test

Unnamed: 0,school_lab,variable,mean,count,year_group_lab,gender_lab,fsm_lab,sen_lab,total_pupils,group_n,group_wt_mean,group_wt_std,lower,upper,rag
492,School A,accept_score,10.027778,36.0,10.0,All,All,All,237.0,6.0,10.088608,0.280805,9.807803,10.369412,average
494,School B,accept_score,9.914286,35.0,10.0,All,All,All,237.0,6.0,10.088608,0.280805,9.807803,10.369412,average
496,School C,accept_score,10.265306,49.0,10.0,All,All,All,237.0,6.0,10.088608,0.280805,9.807803,10.369412,average
498,School D,accept_score,9.627907,43.0,10.0,All,All,All,237.0,6.0,10.088608,0.280805,9.807803,10.369412,below
500,School E,accept_score,10.194444,36.0,10.0,All,All,All,237.0,6.0,10.088608,0.280805,9.807803,10.369412,average
503,School G,accept_score,10.5,38.0,10.0,All,All,All,237.0,6.0,10.088608,0.280805,9.807803,10.369412,above


<mark>Can see that, currently, the aggregate_data script has missing rows (as school F had no year 10, it hasn't calculated, but we want it to calculate and say NaN)... or is it ok as is?</mark>

## Set to NaN if not relevant

In [12]:
# Birth you age score is just average birth age and not needed as a "score"
# Overall count is the count of pupils in each group (not for specific variable)
mask = rag['variable'].isin(['birth_you_age_score', 'overall_count'])
nan_col = ['group_n', 'group_wt_mean', 'group_wt_std', 'lower', 'upper', 'rag']
rag.loc[mask, nan_col] = np.nan
rag[mask]

Unnamed: 0,school_lab,variable,mean,count,year_group_lab,gender_lab,fsm_lab,sen_lab,total_pupils,group_n,group_wt_mean,group_wt_std,lower,upper,rag
0,School A,birth_you_age_score,7.793578,109.0,All,All,All,All,666.0,,,,,,
1,School B,birth_you_age_score,7.754545,110.0,All,All,All,All,666.0,,,,,,
2,School C,birth_you_age_score,8.202128,94.0,All,All,All,All,666.0,,,,,,
3,School D,birth_you_age_score,8.155172,87.0,All,All,All,All,666.0,,,,,,
4,School E,birth_you_age_score,7.574074,81.0,All,All,All,All,666.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3021,School E,overall_count,,40.0,All,All,All,Yes,263.0,,,,,,
3022,School F,overall_count,,42.0,All,All,All,No,379.0,,,,,,
3023,School F,overall_count,,40.0,All,All,All,Yes,263.0,,,,,,
3024,School G,overall_count,,49.0,All,All,All,No,379.0,,,,,,


## Add labels

In [13]:
var_lab = {
    'autonomy_score': 'Autonomy',
    'life_satisfaction_score': 'Life satisfaction',
    'optimism_score': 'Optimism',
    'wellbeing_score': 'Psychological wellbeing',
    'esteem_score': 'Self-esteem',
    'stress_score': 'Stress and coping',
    'appearance_score': 'Feelings around appearance',
    'negative_score': 'Negative affect',
    'lonely_score': 'Loneliness',
    'support_score': 'Supporting own wellbeing',
    'sleep_score': 'Sleep',
    'physical_score': 'Physical activity',
    'free_like_score': 'Free time',
    'media_score': 'Social media use',
    'places_score': 'Places to go and things to do',
    'talk_score': 'Talking with others about feelings',
    'accept_score': 'Acceptance',
    'school_belong_score': 'School connection',
    'staff_relationship_score': 'Relationships with staff',
    'home_relationship_score': 'Relationships with parents/carers',
    'home_happy_score': 'Home environment',
    'local_env_score': 'Local environment',
    'discrim_score': 'Discrimination',
    'belong_local_score': 'Local connection',
    'wealth_score': 'Relative wealth',
    'future_score': 'Future work, education and/or training',
    'climate_score': 'Climate change',
    'social_score': 'Friendships and social support',
    'bully_score': 'Bullying'
}

In [14]:
# Add label column
rag['variable_lab'] = rag['variable'].map(var_lab)

# View the labels
rag[['variable', 'variable_lab']].drop_duplicates()

Unnamed: 0,variable,variable_lab
0,birth_you_age_score,
7,autonomy_score,Autonomy
14,life_satisfaction_score,Life satisfaction
21,optimism_score,Optimism
28,wellbeing_score,Psychological wellbeing
35,esteem_score,Self-esteem
42,stress_score,Stress and coping
49,appearance_score,Feelings around appearance
56,negative_score,Negative affect
63,lonely_score,Loneliness


## Save as csv file

In [15]:
rag.to_csv(os.path.join(paths.survey, paths.rag), index=False)