# Aggregate responses

Aggregate the synthetic person-level data to find the proportion of pupils who gave each response to each question, for each school and pupil group.

## Set-up

### Packages and file paths

In [1]:
# Import required packages
from dataclasses import dataclass
import numpy as np
import os
import pandas as pd

from kailo_beewell_dashboard.create_and_aggregate_data import (
    aggregate_proportions, results_by_site_and_group)
from kailo_beewell_dashboard.response_labels import (
    create_symbol_response_label_dict)

In [2]:
# File paths
@dataclass(frozen=True)
class Paths:
    '''Stores paths to data and files.

    Each attribute carries a type annotation so that it is registered as a
    dataclass field. Un-annotated assignments are only plain class
    attributes, which leaves the dataclass with no fields (empty repr, and no
    way to override a path when constructing an instance).
    '''
    # Directory containing the survey data files
    data: str = '../data/survey_data'
    # File name of the raw synthetic data, located within `data`
    synthetic_data: str = 'symbol_synthetic_data_raw.csv'
    # File name of the aggregated responses, located within `data`
    aggregate: str = 'symbol_school_aggregate_responses.csv'


paths = Paths()

### Import raw data

In [3]:
# Read the raw synthetic data from the survey data folder and preview it
synthetic_path = os.path.join(paths.data, paths.synthetic_data)
data = pd.read_csv(synthetic_path)
data.head()

Unnamed: 0,symbol_family,symbol_home,symbol_friends,symbol_choice,symbol_things,symbol_health,symbol_future,symbol_school,symbol_free,symbol_life,...,symbol_school_lab,symbol_free_lab,symbol_life_lab,gender_lab,year_group_lab,fsm_lab,sen_lab,ethnicity_lab,english_additional_lab,school_lab
0,2.0,2,1.0,1.0,3.0,1,1.0,2.0,2.0,3.0,...,Ok,Ok,Sad,Male,Year 11,FSM,SEN,Ethnic minority,Yes,School B
1,,1,1.0,2.0,2.0,3,2.0,,2.0,1.0,...,,Ok,Happy,Female,Year 10,Non-FSM,Non-SEN,Ethnic minority,Yes,School A
2,,1,2.0,2.0,2.0,1,2.0,2.0,1.0,1.0,...,Ok,Happy,Happy,Female,Year 11,FSM,Non-SEN,White British,Yes,School B
3,,2,3.0,,2.0,1,2.0,1.0,2.0,3.0,...,Happy,Ok,Sad,Female,Year 11,Non-FSM,SEN,White British,Yes,School B
4,3.0,1,2.0,2.0,2.0,2,1.0,2.0,2.0,1.0,...,Ok,Ok,Happy,Female,Year 10,FSM,Non-SEN,White British,No,School B


## Find the proportion giving each response to each measure, within a given group

In [4]:
# Label columns that describe the pupil or their school rather than a survey
# response - these are excluded from the response counts
demographic_label_cols = [
    'gender_lab', 'year_group_lab', 'fsm_lab', 'sen_lab', 'ethnicity_lab',
    'english_additional_lab', 'school_lab']

# Make list of columns that we want to count responses for (every remaining
# column ending in '_lab')
response_col = [
    col for col in data.columns
    if col.endswith('_lab') and col not in demographic_label_cols]
response_col

['symbol_family_lab',
 'symbol_home_lab',
 'symbol_friends_lab',
 'symbol_choice_lab',
 'symbol_things_lab',
 'symbol_health_lab',
 'symbol_future_lab',
 'symbol_school_lab',
 'symbol_free_lab',
 'symbol_life_lab']

In [5]:
# Get the dictionary containing the response options for each question that
# we want to summarise
labels = create_symbol_response_label_dict()

# Add a 'No response' option, keyed by NaN, to each question's options.
# This is a dictionary of dictionaries, so we loop through the inner
# dictionaries and add the entry to each one
for response_options in labels.values():
    response_options[np.nan] = 'No response'

# Preview one of the dictionary items
labels['symbol_family']

{1: 'Happy', 2: 'Ok', 3: 'Sad', nan: 'No response'}

In [6]:
# Template of results in which every question has a count of 0, used when a
# school has no pupils of a particular subgroup (i.e. no-one in certain
# FSM/SEN/gender/year)
zeroed_cols = ['count', 'percentage', 'n_responses']
no_pupils = aggregate_proportions(
    response_col=response_col, labels=labels, data=data)
no_pupils[zeroed_cols] = 0
no_pupils.head()

Unnamed: 0,cat,cat_lab,count,percentage,measure,n_responses
0,"[1, 2, 3, nan]","[Happy, Ok, Sad, No response]",0,0,symbol_family,0
0,"[1, 2, 3, nan]","[Happy, Ok, Sad, No response]",0,0,symbol_home,0
0,"[1, 2, 3, nan]","[Happy, Ok, Sad, No response]",0,0,symbol_friends,0
0,"[1, 2, 3, nan]","[Happy, Ok, Sad, No response]",0,0,symbol_choice,0
0,"[1, 2, 3, nan]","[Happy, Ok, Sad, No response]",0,0,symbol_things,0


In [7]:
# Aggregate the responses for each school and pupil group
result = results_by_site_and_group(
    data=data,
    agg_func=aggregate_proportions,
    no_pupils=no_pupils,
    response_col=response_col,
    labels=labels,
    group_type='symbol')

# Hide results where n<10, by blanking the counts, percentages and sample size
too_few_responses = result['n_responses'] < 10
hidden_cols = ['count', 'percentage', 'n_responses']
result.loc[too_few_responses, hidden_cols] = np.nan

# Preview head of dataframe
result.head()

Unnamed: 0,cat,cat_lab,count,percentage,measure,n_responses,school_lab,year_group_lab,gender_lab,fsm_lab
0,"[1, 2, 3, nan]","[Happy, Ok, Sad, No response]","[8, 6, 13, 3]","[26.666666666666668, 20.0, 43.333333333333336,...",symbol_family,30.0,School A,All,All,All
0,"[1, 2, 3, nan]","[Happy, Ok, Sad, No response]","[9, 13, 8, 0]","[30.0, 43.333333333333336, 26.666666666666668,...",symbol_home,30.0,School A,All,All,All
0,"[1, 2, 3, nan]","[Happy, Ok, Sad, No response]","[6, 9, 13, 2]","[20.0, 30.0, 43.333333333333336, 6.66666666666...",symbol_friends,30.0,School A,All,All,All
0,"[1, 2, 3, nan]","[Happy, Ok, Sad, No response]","[10, 5, 11, 4]","[33.33333333333333, 16.666666666666664, 36.666...",symbol_choice,30.0,School A,All,All,All
0,"[1, 2, 3, nan]","[Happy, Ok, Sad, No response]","[9, 8, 9, 4]","[30.0, 26.666666666666668, 30.0, 13.3333333333...",symbol_things,30.0,School A,All,All,All


## Add labels for each measure

In [8]:
# Define the question text for each measure. This is named `measure_labels`
# rather than `labels` so that it does not overwrite the dictionary of
# response options, which the aggregation cells earlier in the notebook pass
# as `labels` (otherwise, re-running those cells after this one would
# silently use the wrong dictionary)
measure_labels = {
    'symbol_family': 'How do you feel about your family?',
    'symbol_home': 'How do you feel about your home?',
    'symbol_friends': 'How do you feel about your friends?',
    'symbol_choice': 'How do you feel about how much choice you have in life?',
    'symbol_things': 'How do you feel about the things that you have?',
    'symbol_health': 'How do you feel about your health?',
    'symbol_future': 'How do you feel about your future?',
    'symbol_school': 'How do you feel about your school?',
    'symbol_free': 'How do you feel about your free time?',
    'symbol_life': 'How do you feel about your life?'}

# Add the question text as a new column, looked up from the measure name
result['measure_lab'] = result['measure'].map(measure_labels)

# Preview
result.head()

Unnamed: 0,cat,cat_lab,count,percentage,measure,n_responses,school_lab,year_group_lab,gender_lab,fsm_lab,measure_lab
0,"[1, 2, 3, nan]","[Happy, Ok, Sad, No response]","[8, 6, 13, 3]","[26.666666666666668, 20.0, 43.333333333333336,...",symbol_family,30.0,School A,All,All,All,How do you feel about your family?
0,"[1, 2, 3, nan]","[Happy, Ok, Sad, No response]","[9, 13, 8, 0]","[30.0, 43.333333333333336, 26.666666666666668,...",symbol_home,30.0,School A,All,All,All,How do you feel about your home?
0,"[1, 2, 3, nan]","[Happy, Ok, Sad, No response]","[6, 9, 13, 2]","[20.0, 30.0, 43.333333333333336, 6.66666666666...",symbol_friends,30.0,School A,All,All,All,How do you feel about your friends?
0,"[1, 2, 3, nan]","[Happy, Ok, Sad, No response]","[10, 5, 11, 4]","[33.33333333333333, 16.666666666666664, 36.666...",symbol_choice,30.0,School A,All,All,All,How do you feel about how much choice you have...
0,"[1, 2, 3, nan]","[Happy, Ok, Sad, No response]","[9, 8, 9, 4]","[30.0, 26.666666666666668, 30.0, 13.3333333333...",symbol_things,30.0,School A,All,All,All,How do you feel about the things that you have?


## Save to CSV

In [9]:
# Summarise each column of the results: its data type (column 0) and whether
# it contains any null values (column 1)
pd.DataFrame([result.dtypes, result.isnull().any()]).T

Unnamed: 0,0,1
cat,object,False
cat_lab,object,False
count,object,True
percentage,object,True
measure,object,False
n_responses,float64,True
school_lab,object,False
year_group_lab,object,False
gender_lab,object,False
fsm_lab,object,False


In [10]:
# Save the aggregated results to the survey data folder, without the index
# and with missing values written as 'NULL'
aggregate_path = os.path.join(paths.data, paths.aggregate)
result.to_csv(aggregate_path, na_rep='NULL', index=False)