# Find overall counts of responses in each school

This notebook aims to find the overall count of pupils in the dataset for a given school and group, regardless of whether they answered a particular question.

## Set-up

### Packages and file paths

In [1]:
# Import required packages
from dataclasses import dataclass
from kailo_beewell_dashboard.synthesise_aggregate import (
    aggregate_counts, results_by_site_and_group)
import numpy as np
import os
import pandas as pd

In [2]:
# File paths
@dataclass(frozen=True)
class Paths:
    '''Stores paths to data and files

    Every attribute carries a type annotation so that it is registered as a
    dataclass field. Un-annotated class attributes are ignored by @dataclass,
    so they would not appear in the repr, take part in equality, or be
    overridable when constructing an instance.
    '''
    # Folder that the survey data files are read from and written to
    data: str = '../data/survey_data'
    # File name of the raw synthetic survey data
    synthetic_data: str = 'symbol_synthetic_data_raw.csv'
    # File name used when saving the overall counts
    overall_counts: str = 'symbol_school_overall_counts.csv'


paths = Paths()

### Import raw data

In [3]:
# Read the raw synthetic survey data from the data folder, then preview the
# first few rows
data = pd.read_csv(
    filepath_or_buffer=os.path.join(paths.data, paths.synthetic_data))
data.head(n=5)

Unnamed: 0,symbol_family,symbol_home,symbol_friends,symbol_choice,symbol_things,symbol_health,symbol_future,symbol_school,symbol_free,symbol_life,...,symbol_school_lab,symbol_free_lab,symbol_life_lab,gender_lab,year_group_lab,fsm_lab,sen_lab,ethnicity_lab,english_additional_lab,school_lab
0,2.0,2,1.0,1.0,3.0,1,1.0,2.0,2.0,3.0,...,Ok,Ok,Sad,Male,Year 11,FSM,SEN,Ethnic minority,Yes,School B
1,,1,1.0,2.0,2.0,3,2.0,,2.0,1.0,...,,Ok,Happy,Female,Year 10,Non-FSM,Non-SEN,Ethnic minority,Yes,School A
2,,1,2.0,2.0,2.0,1,2.0,2.0,1.0,1.0,...,Ok,Happy,Happy,Female,Year 11,FSM,Non-SEN,White British,Yes,School B
3,,2,3.0,,2.0,1,2.0,1.0,2.0,3.0,...,Happy,Ok,Sad,Female,Year 11,Non-FSM,SEN,White British,Yes,School B
4,3.0,1,2.0,2.0,2.0,2,1.0,2.0,2.0,1.0,...,Ok,Ok,Happy,Female,Year 10,FSM,Non-SEN,White British,No,School B


## Create counts dataframe

In [4]:
# Build the placeholder result used when a school/group has no pupils:
# take the output of aggregate_counts() and overwrite its count with zero
no_pupils = aggregate_counts(data).assign(count=0)
no_pupils

Unnamed: 0,count
0,0


In [5]:
# Smallest count that may be shown - anything below this is suppressed
MIN_COUNT = 10

# Find counts by school and pupil group
size = results_by_site_and_group(
    data=data, agg_func=aggregate_counts, no_pupils=no_pupils,
    group_type='symbol')

# Hide counts where n<MIN_COUNT by replacing them with NaN. Series.mask()
# returns a new column rather than writing NaN into the existing one, so it
# does not depend on silent dtype upcasting should the counts be integers
# (assumption: the dtype of 'count' before masking is not visible here)
size['count'] = size['count'].mask(size['count'] < MIN_COUNT)

# Preview result
size.sort_values(by=['year_group_lab', 'gender_lab',
                     'fsm_lab', 'school_lab'])

Unnamed: 0,count,school_lab,year_group_lab,gender_lab,fsm_lab
0,30.0,School A,All,All,All
0,30.0,School B,All,All,All
0,17.0,School A,All,All,FSM
0,16.0,School B,All,All,FSM
0,13.0,School A,All,All,Non-FSM
0,14.0,School B,All,All,Non-FSM
0,,School A,All,Boy,All
0,,School B,All,Boy,All
0,,School A,All,Girl,All
0,,School B,All,Girl,All


In [6]:
# Summarise each column of the counts dataframe: its data type (column 0)
# and whether it contains any missing values (column 1)
types = size.dtypes
null = size.isna().any(axis=0)
pd.DataFrame([types, null]).transpose()

Unnamed: 0,0,1
count,float64,True
school_lab,object,False
year_group_lab,object,False
gender_lab,object,False
fsm_lab,object,False


## Save results

In [7]:
# Write the counts to the data folder, without the index, recording
# missing (hidden) counts as the string 'NULL'
size.to_csv(
    path_or_buf=os.path.join(paths.data, paths.overall_counts),
    na_rep='NULL',
    index=False)