# Aggregate demographic data

This is similar to `4_aggregate_responses.ipynb` but performed separately, as we want to aggregate for one school versus all other schools as one group - and we don't want to break down those results any further by any demographic characteristics.

**Note:** This method assumes that all pupils in the dataset belong to a school, and that the dataset only contains Northern Devon schools. If either of those conditions changes, you will need to review the code below.

## Set-up

### Packages and file paths

In [None]:
# Import required packages
from collections import defaultdict
from dataclasses import dataclass
import numpy as np
import os
import pandas as pd

In [None]:
# File paths
@dataclass(frozen=True)
class Paths:
    '''
    Stores paths to data and files.

    Each attribute is annotated so that it is registered as a dataclass field
    (un-annotated class attributes are ignored by @dataclass). This means the
    defaults appear in the repr and can be overridden when constructing, e.g.
    Paths(survey='some/other/folder'). Instances are frozen (read-only).
    '''
    # Folder that the file names below are joined on to
    survey: str = '../data/survey_data'
    # File name of the raw synthetic survey data
    synthetic_data: str = 'synthetic_data_raw.csv'
    # File name of the aggregated demographic results
    aggregate: str = 'aggregate_demographic.csv'


paths = Paths()

In [None]:
# Import functions defined elsewhere
import sys
sys.path.append('../')
from utilities.response_labels import create_response_label_dict
from create_and_process_data.functions import results_by_school_and_group

### Import raw data

In [None]:
# Read the raw synthetic survey data and preview the first few rows
raw_data_path = os.path.join(paths.survey, paths.synthetic_data)
data = pd.read_csv(raw_data_path)
data.head()

### Aggregate data

In [None]:
# Columns to gather responses from: those answered in the survey, and those
# where the data is provided by the council
survey_col = [
    'gender', 'transgender', 'sexual_orientation', 'neurodivergent',
    'birth_parent1', 'birth_parent2', 'birth_you', 'birth_you_age',
    'young_carer', 'care_experience'
]
council_col = ['year_group', 'fsm', 'sen', 'ethnicity', 'english_additional']
response_col = survey_col + council_col

# Dictionary with the response options for each variable
labels = create_response_label_dict()

# Register a label for missing values (NaN) against every column - this is
# 'No response' for survey columns and 'No data' for council columns
for columns, missing_label in ((survey_col, 'No response'),
                               (council_col, 'No data')):
    for col in columns:
        labels[col][np.nan] = missing_label

# Preview one example of each type
for example in ('birth_parent1', 'year_group'):
    print(labels[example])

Define function for aggregating responses to each variable, designed so that it counts the occurrence of all possible values, rather than basing it on only the values present. Otherwise, if (for example) no-one responded 3, it would just have counts of responses to 1, 2 and 4, and that can create issues when it comes to plotting.

**Copied from `4_aggregate_responses.ipynb`**

In [None]:
def aggregate(data, columns=None, label_dict=None):
    '''
    Aggregates each of the requested columns for the chosen dataset. Using the
    known possible values for each column, it counts occurrences of each (inc.
    number missing) and stores the answer as a single dataframe row, where
    counts, percentages and categories are stored as lists within cells of
    that row. The function returns a dataframe containing all of those rows.

    Inputs:
    - data - dataframe, containing a column for each of the requested columns
    - columns - optional list of column names to aggregate. Any '_lab' in a
      name is removed to find the numeric version of the column. Defaults to
      the notebook-level response_col
    - label_dict - optional dictionary, where the key is the column name and
      the value is a dictionary of {possible value: label} for that column
      (use NaN as a key to count missing values). Defaults to the
      notebook-level labels

    Returns:
    - dataframe with one row per column and the columns 'cat', 'cat_lab',
      'count', 'percentage', 'measure' and 'n_responses'. If a column has no
      counted responses (e.g. data is empty), the counts are all 0 and the
      percentages are all NaN.
    '''
    # Fall back on the notebook-level settings if not explicitly provided
    if columns is None:
        columns = response_col
    if label_dict is None:
        label_dict = labels

    # Initialise list to store rows of the dataframe
    rows = []

    # Loop through the columns of interest
    for col_lab in columns:

        # Find the name of the numeric version of the column
        col = col_lab.replace('_lab', '')

        # Find value counts (keeping NaN so missing values are counted)
        value_counts = data[col].value_counts(dropna=False)

        # Get all possible category values and labels from dictionary
        col_labels = label_dict[col]
        cat = list(col_labels.keys())
        cat_lab = list(col_labels.values())

        # For each of the possible values - if the value was present, extract
        # from the counts series, but if not, set count to 0. Convert to plain
        # int so the lists are not saved as e.g. 'np.int64(3)' (NumPy >= 2)
        counts = [
            int(value_counts[value]) if value in value_counts.index else 0
            for value in cat]

        # Convert list of counts to list of percentages. The total is found
        # once, and if it is 0 we use NaN rather than dividing by zero
        total = sum(counts)
        if total > 0:
            percentages = [(x / total) * 100 for x in counts]
        else:
            percentages = [np.nan] * len(counts)

        # Create dataframe row using the calculated data and append to list
        rows.append(pd.DataFrame({
            'cat': [cat],
            'cat_lab': [cat_lab],
            'count': [counts],
            'percentage': [percentages],
            'measure': [col],
            'n_responses': [total]
        }))

    # Combine into a single dataframe and return
    return pd.concat(rows)


**Note:** The code below is similar, but not identical, to that in `4_aggregate_responses.ipynb`. TODO: identify where the differences ultimately arise, and decide whether (and how) this can be combined into the existing function - it should be possible.

In [None]:
# Initialise list to store results
result_list = []

# For each of the schools (which we know will all be present at least once
# as we base the school list on the dataset itself)
schools = data['school_lab'].dropna().drop_duplicates().sort_values()
for school in schools:

    # Boolean mask identifying pupils as being from the current school or not.
    # A mask is used, rather than adding a column, so that the raw data is
    # left unmodified. Pupils with no school recorded are not equal to any
    # school and so fall into the 'other schools' group.
    in_school = data['school_lab'] == school

    # Loop through each of the groups (1 = current school, 0 = all other
    # schools)
    for group, mask in [(1, in_school), (0, ~in_school)]:

        # Filter to the group and then aggregate the data
        res = aggregate(data[mask])

        # Label with the school and group
        res['school_lab'] = school
        res['school_group'] = group

        # Append results to list
        result_list.append(res)

# Combine all the results into a single dataframe (with a fresh index, as
# otherwise every row would share the same index label)
result = pd.concat(result_list, ignore_index=True)

# Hide results where n<10
result.loc[result['n_responses'] < 10,
           ['count', 'percentage', 'n_responses']] = np.nan

# Add labels that can use in figures
result['school_group_lab'] = np.where(
    result['school_group'] == 1, 'Your school', 'Other schools in\nNorthern Devon')

In [None]:
# Preview the first few rows of the combined results
result.head()

### Add labels for each measure

In [None]:
# Define a label for each measure. This is deliberately not called `labels`,
# as that name holds the response options used by aggregate(), and
# overwriting it would break aggregate() if that cell were run again.
measure_labels = {
    'gender': 'Gender',
    'transgender': 'Do you consider yourself to be transgender?',
    'sexual_orientation': 'Sexual orientation',
    'neurodivergent': 'Do you identify as neurodivergent?',
    'young_carer': 'In the last year, have you regularly taken on caring responsibilities for a family member - e.g. due to illness, disability, mental health condition or drug/alcohol dependency?',
    'care_experience': 'Are you or have you ever been in care (living in a foster placement, residential placement, or private/kinship care)?',
    'birth_parent1': 'Was birth parent 1 born outside the UK?',
    'birth_parent2': 'Was birth parent 2 born outside the UK?',
    'birth_you': 'Were you born outside the UK?',
    'birth_you_age': 'How old were you when you arrived in the UK?',
    'year_group': 'Year group',
    'fsm': 'Free school meals',
    'sen': 'Special educational needs',
    'ethnicity': 'Ethnicity',
    'english_additional': 'English as an additional language'
}

# Add labels (any measure without an entry above will be set to NaN)
result['measure_lab'] = result['measure'].map(measure_labels)

# Preview
result.head()

### Save results

In [None]:
# Write the aggregated results to the survey data folder, without the index
aggregate_path = os.path.join(paths.survey, paths.aggregate)
result.to_csv(aggregate_path, index=False)