## Create RAG ratings for the aggregate data

## Set-up

### Packages and file paths

In [1]:
# Import required packages
from dataclasses import dataclass
import math
import numpy as np
import os
import pandas as pd
#from statsmodels.stats.weightstats import DescrStatsW

# Set display options
pd.set_option('display.max_rows', 100)

In [2]:
# File paths
@dataclass(frozen=True)
class Paths:
    '''
    Paths to data and files.

    The attributes are annotated so that they are genuine dataclass fields
    (without annotations, @dataclass registers no fields at all). This means
    they appear in the repr, are compared by equality, and can be overridden
    by keyword when instantiating, e.g. Paths(survey='other_folder').
    '''
    # Folder containing the survey data files
    survey: str = '../data/survey_data'
    # File name of the aggregate scores csv
    aggregate: str = 'aggregate_scores.csv'
    # File name of the aggregate scores csv with RAG ratings added
    rag: str = 'aggregate_scores_rag.csv'


paths = Paths()

## Import aggregate scores

In [3]:
# Read the aggregate scores from the survey data folder and preview them
aggregate_path = os.path.join(paths.survey, paths.aggregate)
data = pd.read_csv(aggregate_path)
data.head()

Unnamed: 0,school_lab,variable,mean,count,year_group_lab,gender_lab,fsm_lab,sen_lab
0,School A,birth_you_age_score,7.584112,107.0,All,All,All,All
1,School B,birth_you_age_score,7.657407,108.0,All,All,All,All
2,School C,birth_you_age_score,8.510417,96.0,All,All,All,All
3,School D,birth_you_age_score,7.815789,95.0,All,All,All,All
4,School E,birth_you_age_score,7.77381,84.0,All,All,All,All


## Find weighted mean and SD within each group

In [4]:
def descriptives(values, counts):
    '''
    Calculates:
    * Total sample size (sum of the non-NaN counts)
    * Total number of groups (e.g. schools, areas) with a non-NaN count
    * Weighted average of the means
    * Weighted standard deviation of the means

    The weighted mean and standard deviation only use rows where both the
    value and the count are present, so a single group with a missing mean
    (or count) no longer turns the result for the whole set into NaN. If no
    usable rows remain, or the remaining counts sum to zero, the mean and
    standard deviation are returned as NaN rather than raising an error.

    It returns the biased variance and is like a weighted version of np.std().
    For small samples, may want to alter to unbiased variance.
    Based on: https://stackoverflow.com/questions/2413522/weighted-standard-deviation-in-numpy

    Inputs:
    - values - series, to calculate mean and std from
    - counts - series, number of students, used to weight calculations

    Output:
    - result - series with each of the calculations (index is name of calc):
      'total_pupils', 'group_n', 'group_wt_mean', 'group_wt_std'
    '''
    # Total sample size
    n_pupils = counts.sum(skipna=True)

    # Total number of groups - count number of rows with a non-NaN count
    n_groups = counts.count()

    # Keep only rows where both the value and its weight are present, as
    # np.average would otherwise propagate a single NaN to the whole result
    vals = np.asarray(values, dtype=float)
    wts = np.asarray(counts, dtype=float)
    usable = ~(np.isnan(vals) | np.isnan(wts))
    vals = vals[usable]
    wts = wts[usable]

    if wts.sum() != 0:
        # Weighted mean
        average = np.average(vals, weights=wts)

        # Weighted std (biased - divides by the sum of the weights)
        variance = np.average((vals - average)**2, weights=wts)
        std = math.sqrt(variance)
    else:
        # np.average raises ZeroDivisionError when weights sum to zero
        # (which includes there being no usable rows), so return NaN instead
        average = np.nan
        std = np.nan

    # Combine into a series
    result = pd.Series(
        [n_pupils, n_groups, average, std],
        index=['total_pupils', 'group_n', 'group_wt_mean', 'group_wt_std'])
    return result

In [5]:
# Columns that together define a comparison group: the same variable within
# the same pupil characteristics
groups = ['variable', 'year_group_lab', 'gender_lab', 'fsm_lab', 'sen_lab']

# Calculate the weighted descriptives for each group. Only the two columns
# that descriptives() needs are selected before apply(), so that apply() is
# not run over the grouping columns (which pandas 2.2 warns is deprecated)
wt_mean = (data
           .groupby(groups)[['mean', 'count']]
           .apply(lambda x: descriptives(x['mean'], x['count']))
           .reset_index())
wt_mean

Unnamed: 0,variable,year_group_lab,gender_lab,fsm_lab,sen_lab,total_pupils,group_n,group_wt_mean,group_wt_std
0,accept_score,10.0,All,All,All,231.0,6.0,10.043290,0.225162
1,accept_score,8.0,All,All,All,302.0,7.0,10.009934,0.409676
2,accept_score,All,All,All,All,546.0,7.0,10.009158,0.271743
3,accept_score,All,All,All,No,288.0,7.0,10.045139,0.361406
4,accept_score,All,All,All,Yes,202.0,6.0,9.955446,0.436508
...,...,...,...,...,...,...,...,...,...
424,wellbeing_score,All,Currently unsure,All,All,31.0,3.0,,
425,wellbeing_score,All,Girl,All,All,21.0,2.0,,
426,wellbeing_score,All,I describe myself in another way,All,All,38.0,3.0,,
427,wellbeing_score,All,Non-binary,All,All,48.0,4.0,,


In [6]:
# Attach the descriptives for each group to every row belonging to that group
rag = data.merge(wt_mean, on=groups, how='left')
rag

Unnamed: 0,school_lab,variable,mean,count,year_group_lab,gender_lab,fsm_lab,sen_lab,total_pupils,group_n,group_wt_mean,group_wt_std
0,School A,birth_you_age_score,7.584112,107.0,All,All,All,All,668.0,7.0,7.916168,0.338493
1,School B,birth_you_age_score,7.657407,108.0,All,All,All,All,668.0,7.0,7.916168,0.338493
2,School C,birth_you_age_score,8.510417,96.0,All,All,All,All,668.0,7.0,7.916168,0.338493
3,School D,birth_you_age_score,7.815789,95.0,All,All,All,All,668.0,7.0,7.916168,0.338493
4,School E,birth_you_age_score,7.773810,84.0,All,All,All,All,668.0,7.0,7.916168,0.338493
...,...,...,...,...,...,...,...,...,...,...,...,...
2932,School E,overall_count,,48.0,All,All,All,Yes,269.0,6.0,,
2933,School F,overall_count,,44.0,All,All,All,No,379.0,7.0,,
2934,School F,overall_count,,40.0,All,All,All,Yes,269.0,6.0,,
2935,School G,overall_count,,47.0,All,All,All,No,379.0,7.0,,


## Create RAG column based on whether 1SD above or below

<mark>TODO: remember to account for the reverse-scored variables.</mark>

In [7]:
# Bounds at one standard deviation either side of the weighted group mean
rag['lower'] = rag['group_wt_mean'].sub(rag['group_wt_std'])
rag['upper'] = rag['group_wt_mean'].add(rag['group_wt_std'])

In [8]:
# Create RAG column
# 'below'/'above' when the mean is at or beyond 1 SD from the weighted group
# mean, otherwise 'average'
# NOTE(review): when group_wt_std is 0 (e.g. a group with only one school),
# lower == upper, so a mean equal to the group mean is labelled 'below' -
# confirm whether that is the intended result
conditions = [(rag['mean'] <= rag['lower']),
              (rag['mean'] > rag['lower']) & (rag['mean'] < rag['upper']),
              (rag['mean'] >= rag['upper'])]
choices = ['below', 'average', 'above']

# Rows that match no condition (comparisons against NaN are always False)
# should be left missing. np.select is given an empty string as the default
# because mixing string choices with a float NaN default either produces the
# literal text 'nan' or raises a TypeError, depending on the NumPy version.
# The placeholder is then swapped for a real missing value.
labels = pd.Series(np.select(conditions, choices, default=''),
                   index=rag.index)
rag['rag'] = labels.where(labels != '')

Show some examples

In [9]:
# Example: autonomy score where every pupil characteristic is 'All'
demographics = ['year_group_lab', 'gender_lab', 'fsm_lab', 'sen_lab']
all_pupils = rag[demographics].eq('All').all(axis=1)
test = rag[rag['variable'].eq('autonomy_score') & all_pupils]
test

Unnamed: 0,school_lab,variable,mean,count,year_group_lab,gender_lab,fsm_lab,sen_lab,total_pupils,group_n,group_wt_mean,group_wt_std,lower,upper,rag
7,School A,autonomy_score,17.930556,72.0,All,All,All,All,442.0,7.0,17.968326,0.440233,17.528092,18.408559,average
8,School B,autonomy_score,17.68,75.0,All,All,All,All,442.0,7.0,17.968326,0.440233,17.528092,18.408559,average
9,School C,autonomy_score,17.764706,68.0,All,All,All,All,442.0,7.0,17.968326,0.440233,17.528092,18.408559,average
10,School D,autonomy_score,18.157895,57.0,All,All,All,All,442.0,7.0,17.968326,0.440233,17.528092,18.408559,average
11,School E,autonomy_score,18.5,54.0,All,All,All,All,442.0,7.0,17.968326,0.440233,17.528092,18.408559,above
12,School F,autonomy_score,17.290323,62.0,All,All,All,All,442.0,7.0,17.968326,0.440233,17.528092,18.408559,below
13,School G,autonomy_score,18.722222,54.0,All,All,All,All,442.0,7.0,17.968326,0.440233,17.528092,18.408559,above


In [10]:
# Example: accept score rows for year group '10.0' in the imported data
test = data.loc[
    data['variable'].eq('accept_score')
    & data['year_group_lab'].eq('10.0')
    & data['gender_lab'].eq('All')
    & data['fsm_lab'].eq('All')
    & data['sen_lab'].eq('All')]
test

Unnamed: 0,school_lab,variable,mean,count,year_group_lab,gender_lab,fsm_lab,sen_lab
472,School A,accept_score,10.175,40.0,10.0,All,All,All
474,School B,accept_score,10.315789,38.0,10.0,All,All,All
476,School C,accept_score,9.976744,43.0,10.0,All,All,All
478,School D,accept_score,9.657143,35.0,10.0,All,All,All
480,School E,accept_score,9.878049,41.0,10.0,All,All,All
483,School G,accept_score,10.264706,34.0,10.0,All,All,All


In [11]:
# Example: the same accept score rows, now with group descriptives and RAG
wanted = {
    'variable': 'accept_score',
    'year_group_lab': '10.0',
    'gender_lab': 'All',
    'fsm_lab': 'All',
    'sen_lab': 'All',
}
keep = pd.Series(True, index=rag.index)
for column, value in wanted.items():
    keep &= rag[column] == value
test = rag[keep]
test

Unnamed: 0,school_lab,variable,mean,count,year_group_lab,gender_lab,fsm_lab,sen_lab,total_pupils,group_n,group_wt_mean,group_wt_std,lower,upper,rag
472,School A,accept_score,10.175,40.0,10.0,All,All,All,231.0,6.0,10.04329,0.225162,9.818128,10.268452,average
474,School B,accept_score,10.315789,38.0,10.0,All,All,All,231.0,6.0,10.04329,0.225162,9.818128,10.268452,above
476,School C,accept_score,9.976744,43.0,10.0,All,All,All,231.0,6.0,10.04329,0.225162,9.818128,10.268452,average
478,School D,accept_score,9.657143,35.0,10.0,All,All,All,231.0,6.0,10.04329,0.225162,9.818128,10.268452,below
480,School E,accept_score,9.878049,41.0,10.0,All,All,All,231.0,6.0,10.04329,0.225162,9.818128,10.268452,average
483,School G,accept_score,10.264706,34.0,10.0,All,All,All,231.0,6.0,10.04329,0.225162,9.818128,10.268452,average


<mark>This shows that the aggregate_data script currently leaves out some rows: School F had no Year 10 pupils, so no row was calculated for it. We want the script to still produce that row, with NaN as the result.</mark>

## Set to NaN if not relevant

In [12]:
# Variables that should not receive a rating:
# - birth_you_age_score is just average birth age and not needed as a "score"
# - overall_count is the count of pupils in each group (not for a specific
#   variable)
not_scored = ['birth_you_age_score', 'overall_count']

# Columns to blank out for those variables
nan_col = ['group_n', 'group_wt_mean', 'group_wt_std', 'lower', 'upper', 'rag']

mask = rag['variable'].isin(not_scored)
rag.loc[mask, nan_col] = np.nan
rag[mask]

Unnamed: 0,school_lab,variable,mean,count,year_group_lab,gender_lab,fsm_lab,sen_lab,total_pupils,group_n,group_wt_mean,group_wt_std,lower,upper,rag
0,School A,birth_you_age_score,7.584112,107.0,All,All,All,All,668.0,,,,,,
1,School B,birth_you_age_score,7.657407,108.0,All,All,All,All,668.0,,,,,,
2,School C,birth_you_age_score,8.510417,96.0,All,All,All,All,668.0,,,,,,
3,School D,birth_you_age_score,7.815789,95.0,All,All,All,All,668.0,,,,,,
4,School E,birth_you_age_score,7.773810,84.0,All,All,All,All,668.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2932,School E,overall_count,,48.0,All,All,All,Yes,269.0,,,,,,
2933,School F,overall_count,,44.0,All,All,All,No,379.0,,,,,,
2934,School F,overall_count,,40.0,All,All,All,Yes,269.0,,,,,,
2935,School G,overall_count,,47.0,All,All,All,No,379.0,,,,,,


## Save as csv file

In [13]:
# Write the scores with RAG ratings to the survey data folder, without the
# row index
rag_path = os.path.join(paths.survey, paths.rag)
rag.to_csv(rag_path, index=False)