# Create synthetic data

Create synthetic data using:
* Headings from cleaned data extract from REDCap
* Dictionaries that were used to create the cleaned data extract and include definitions of what values are present
* Anticipated columns from council
* List of MSOAs in Northern Devon

## Set-up

### Packages, warnings and file paths

In [1]:
# Import required packages
from dataclasses import dataclass
import geopandas as gpd
import numpy as np
import os
import pandas as pd
import random

In [2]:
# Silence pandas' PerformanceWarning: this notebook does not need high
# performance, so the warning is not relevant here.
# NOTE(review): presumably raised by adding columns to the dataframe one at a
# time in the loops further down - confirm
from warnings import simplefilter
simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [3]:
# Import functions defined elsewhere
from functions import calculate_scores

import sys
sys.path.append('../')
from utilities.response_labels import create_response_label_dict

In [4]:
# File paths
@dataclass(frozen=True)
class Paths:
    '''
    Stores paths to data and files.

    Each attribute is annotated so that it is registered as a real dataclass
    field (un-annotated class attributes are ignored by @dataclass). This
    means the paths appear in the repr and can be overridden when
    constructing, e.g. Paths(data='other/folder'), while Paths() still
    gives the defaults. All paths other than `data` are joined onto `data`
    when they are used.
    '''
    # Folder that the other paths are joined onto
    data: str = '../data'
    # CSV whose column names are used as headings for the synthetic data
    headings: str = 'survey_data/headings.csv'
    # Full MSOA shapefile (used to create the Northern Devon shapefile)
    msoa_shp: str = 'MSOA_2021_EW_BFC/MSOA_2021_EW_BFC_V6.shp'
    # Shapefile containing only the MSOAs in Northern Devon
    nd_msoa_shp: str = 'msoa_nd_2021/msoa_nd_2021.shp'
    # CSV that the synthetic data is saved to
    synthetic_data: str = 'survey_data/synthetic_data_raw.csv'


paths = Paths()

### Define sample size

In [5]:
# Number of synthetic pupils (rows) to generate
sample_size = 800
# Number of schools
# NOTE(review): not referenced anywhere else in this notebook as shown;
# presumably it should agree with the number of school labels - confirm
school_n = 7

## Create synthetic data

### Import headings

In [6]:
# Read the REDCap output - only its column names are needed, as they are the
# headings used to create the synthetic data
redcap_extract = pd.read_csv(os.path.join(paths.data, paths.headings))

# Discard the columns that are not needed and keep the remaining names
redcap_columns = redcap_extract.drop(
    columns=['Unnamed: 0', 'record_id', 'media_error']).columns

# Append the columns anticipated from the council
council_columns = [
    'year_group', 'fsm', 'sen', 'ethnicity', 'english_additional', 'school']
headings = [*redcap_columns, *council_columns]

# Show the first five and last five headings
print(headings[:5])
print(headings[-5:])

['gender', 'transgender', 'sexual_orientation', 'neurodivergent', 'birth_parent1']
['fsm', 'sen', 'ethnicity', 'english_additional', 'school']


### Import dictionary of response labels

This uses a function imported from utilities, as we use this dictionary in multiple places for the dashboard, so it makes more sense to define it in one place and pull from there.

In [7]:
# Build the response label lookup using the shared function from utilities.
# Elsewhere in this notebook it is indexed by column name, and each entry is
# used both for its keys (the values to sample from) and as the mapping
# passed to Series.map (to create the label columns)
labels = create_response_label_dict()
# Show how many entries the lookup has
len(labels)

141

### Randomly sample values from dictionary to populate each column

In [8]:
# Identify measures that would be output by REDCap (drop those I calculated,
# which are the columns ending in '_score' or '_lab')
measures = [x for x in headings if not x.endswith(('_score', '_lab'))]

# Initialise dataframe with row number matching sample size
data = pd.DataFrame(index=range(1, sample_size+1))

for col in measures:

    if col in labels:
        # Identify appropriate dictionary of keys and values
        label_dict = labels[col]

        # Randomly sample with replacement from the possible values for this
        # column. Random(42) sets the random seed.
        # NOTE(review): a new Random(42) is created for every column, so
        # every column is drawn from the same random sequence and the columns
        # are strongly correlated with each other. This has been left as it
        # was - confirm whether it is intended.
        data[col] = random.Random(42).choices(
            list(label_dict.keys()), k=sample_size)

    # Exceptions for time on social media columns, which are not sampled from
    # a label dictionary. Each branch is the only assignment to its column
    # (they must not be overwritten by the label sampling above)
    elif col == 'media_interact':
        # Random whole number from 0 to 100, seeded so that it is the same on
        # every run like the rest of the sampling
        data[col] = random.Random(42).choices(range(101), k=sample_size)
    elif col == 'media_browse':
        # Remainder, so that interact and browse add up to 100
        data[col] = 100 - data['media_interact']
    elif col == 'media_total':
        data[col] = data['media_interact'] + data['media_browse']

    else:
        # Fail loudly rather than silently reusing the dictionary from the
        # previous column
        raise KeyError(
            f"No response labels or special handling for column '{col}'")

In [9]:
# Preview the dataframe
data.head()

Unnamed: 0,gender,transgender,sexual_orientation,neurodivergent,birth_parent1,birth_parent2,birth_you,birth_you_age,autonomy_pressure,autonomy_express,...,peer_talk_listen,peer_talk_helpful,peer_talk_if,accept_peer,year_group,fsm,sen,ethnicity,english_additional,school
1,4,4,4,2,2,2,2,11,4,4,...,3,2,3,3,10,1,1,2,1,5
2,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,8,0,0,1,0,1
3,2,2,2,1,1,1,1,5,2,2,...,2,1,2,2,8,0,0,1,0,2
4,2,2,2,1,1,1,1,4,2,2,...,1,1,1,1,8,0,0,1,0,2
5,5,4,5,3,3,3,3,12,4,4,...,3,3,3,3,10,1,1,2,1,6


### Add a random MSOA for each pupil

Based on all the MSOAs in Northern Devon (although in reality we anticipate that some areas will be less likely than others, depending on which schools do and do not participate).

In [10]:
# Creating the Northern Devon shapefile takes 28s, so the code that made it is
# kept below for reference only and the saved shapefile is imported instead
'''
# Import MSOA shapefile
shp_full = gpd.read_file(os.path.join(paths.data, paths.msoa_shp))

# Filter to MSOA in Northern Devon
shp_nd = shp_full[shp_full['MSOA21NM'].str.contains('North Devon|Torridge')]
shp_nd['MSOA21NM']

# Save to shapefile
shp_nd.to_file(f'{os.getcwd()}/data/msoa_nd_2021/msoa_nd_2021.shp', driver='ESRI Shapefile')
'''

# Load the previously created Northern Devon shapefile
nd_shapefile = os.path.join(paths.data, paths.nd_msoa_shp)
shp_nd = gpd.read_file(nd_shapefile)

# List the MSOA names it contains
msoa_names = shp_nd['MSOA21NM']
display(msoa_names)

# Give each pupil an MSOA, sampled with replacement using a seed of 42
data['msoa'] = random.Random(42).choices(msoa_names.tolist(), k=sample_size)

# Show the first rows of the dataframe
data.head()

0     North Devon 001
1     North Devon 002
2     North Devon 003
3     North Devon 004
4     North Devon 005
5     North Devon 006
6     North Devon 007
7     North Devon 008
8     North Devon 009
9     North Devon 010
10    North Devon 011
11    North Devon 012
12    North Devon 013
13    North Devon 014
14       Torridge 001
15       Torridge 002
16       Torridge 003
17       Torridge 004
18       Torridge 005
19       Torridge 006
20       Torridge 007
21       Torridge 008
22       Torridge 009
Name: MSOA21NM, dtype: object

Unnamed: 0,gender,transgender,sexual_orientation,neurodivergent,birth_parent1,birth_parent2,birth_you,birth_you_age,autonomy_pressure,autonomy_express,...,peer_talk_helpful,peer_talk_if,accept_peer,year_group,fsm,sen,ethnicity,english_additional,school,msoa
1,4,4,4,2,2,2,2,11,4,4,...,2,3,3,10,1,1,2,1,5,Torridge 001
2,1,1,1,1,1,1,1,1,1,1,...,1,1,1,8,0,0,1,0,1,North Devon 001
3,2,2,2,1,1,1,1,5,2,2,...,1,2,2,8,0,0,1,0,2,North Devon 007
4,2,2,2,1,1,1,1,4,2,2,...,1,1,1,8,0,0,1,0,2,North Devon 006
5,5,4,5,3,3,3,3,12,4,4,...,3,3,3,10,1,1,2,1,6,Torridge 003


### Simulate the branching logic for the "talking with..." questions

Input missing data to mimic how pupils will only be able to answer one question or the other, and not both.

In [11]:
# Simulate branching logic by forcing half of the rows to missing in one
# branch and the remaining rows to missing in the other branch - repeating
# for each of the three groups of people.
n_rows = len(data.index)

# i is used for the random seed (1, 2, 3), so that each group of people gets
# a different split of rows
for i, prefix in enumerate(['staff', 'home', 'peer'], start=1):

    # Create list with the columns in each branch
    branch_1 = [f'{prefix}_talk_listen', f'{prefix}_talk_helpful']
    branch_2 = [f'{prefix}_talk_if']

    # Create mask to randomly select half the rows. The True half takes the
    # extra row when the number of rows is odd, so that the mask always has
    # exactly one entry per row (a shorter mask cannot be used to index)
    mask = [False]*(n_rows//2) + [True]*(n_rows - n_rows//2)
    random.Random(i).shuffle(mask)

    # Set to NaN (inverse to each other)
    data.loc[mask, branch_1] = np.nan
    data.loc[[not x for x in mask], branch_2] = np.nan

In [12]:
# Using staff as the example, show which combinations of missing (NaN) and
# present (Value) answers exist after applying the branching logic
staff_cols = ['staff_talk_listen', 'staff_talk_helpful', 'staff_talk_if']
staff_missing = data[staff_cols].isnull()
staff_status = staff_missing.replace({True: 'NaN', False: 'Value'})
staff_status.value_counts(dropna=False).reset_index()

Unnamed: 0,staff_talk_listen,staff_talk_helpful,staff_talk_if,count
0,,,Value,400
1,Value,Value,,400


### Input some random missing data

School has been excluded from this, as every pupil should have a school (a missing school would mean there was a problem with data processing at some point, which shouldn't be possible).

In [13]:
# Use one seeded generator for the whole loop, so that the amount and
# position of the missing data differs between columns but is the same on
# every run (re-creating Random(42) inside the loop would give every column
# the same number of missing values)
rng = random.Random(42)

# For each of the columns except school...
for col in data.drop('school', axis=1).columns:

    # Randomly choose how many missing data (0 to 100, but never more than
    # the number of rows)
    n_missing = rng.randint(0, min(100, len(data)))

    # Drop that number of values, setting as NaN. Only the sampled rows are
    # kept, and assigning them back aligns on the index so the rows that were
    # not sampled become NaN. random_state makes the choice of rows
    # reproducible, with a different seed drawn for each column.
    data[col] = data[col].sample(
        n=len(data) - n_missing, random_state=rng.randrange(2**32))

### Input some non-random missing data

I want to introduce some scenarios to check they are being managed appropriately, such as a school not having any of a certain year group.

In [14]:
# Make school 6/F a Year 8 only school by moving its Year 10 pupils to Year 8
in_school_f = data['school'] == 6
is_year_10 = data['year_group'] == 10
data.loc[in_school_f & is_year_10, 'year_group'] = 8

# Check the year groups that remain in that school (including missing)
data.loc[in_school_f, 'year_group'].value_counts(dropna=False)

year_group
8.0    96
NaN    16
Name: count, dtype: int64

In [15]:
# Remove SEN from school 2/B by recoding its pupils with sen of 1 to 0
in_school_b = data['school'] == 2
has_sen = data['sen'] == 1
data.loc[in_school_b & has_sen, 'sen'] = 0

# Check the SEN values that remain in that school (including missing)
data.loc[in_school_b, 'sen'].value_counts(dropna=False)

sen
0.0    112
NaN     16
Name: count, dtype: int64

## Create score columns

In [16]:
# Use function to calculate scores for each pupil. calculate_scores is
# imported from the local functions module, and its result replaces data.
# NOTE(review): presumably it returns the same dataframe with the '_score'
# columns added (as in the preview below) - confirm against functions.py
data = calculate_scores(data)

# Preview the dataframe
data.head()

Unnamed: 0,gender,transgender,sexual_orientation,neurodivergent,birth_parent1,birth_parent2,birth_you,birth_you_age,autonomy_pressure,autonomy_express,...,home_relationship_score,home_happy_score,local_env_score,discrim_score,belong_local_score,wealth_score,future_score,climate_score,social_score,bully_score
1,4.0,,4.0,2.0,2.0,2.0,,11.0,4.0,4.0,...,,7.0,9.0,1.0,2.0,1.0,12.0,3.0,,6.0
2,1.0,,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,...,4.0,0.0,25.0,0.0,4.0,0.0,3.0,1.0,4.0,12.0
3,2.0,2.0,2.0,1.0,1.0,,1.0,5.0,2.0,2.0,...,8.0,3.0,19.666667,0.0,3.0,0.0,6.5,2.0,8.0,
4,2.0,2.0,2.0,,1.0,1.0,,4.0,2.0,2.0,...,8.0,2.0,19.666667,0.0,4.0,0.0,5.0,1.0,8.0,
5,5.0,4.0,5.0,,3.0,3.0,3.0,12.0,4.0,,...,16.0,8.0,9.0,1.0,2.0,1.0,12.0,3.0,16.0,6.0


## Create label columns

This utilises the labels dictionary that we imported above.

In [17]:
# Columns that should not get a label column: the social media time columns,
# the MSOA, and every score column
score_columns = [x for x in data.columns if x.endswith('_score')]
exclude_col = [
    'media_interact', 'media_browse', 'media_total', 'msoa'] + score_columns

# For every other column, add a '<column>_lab' column containing the values
# of that column mapped through its labelling dictionary
for column in data.columns.drop(exclude_col):
    data[f'{column}_lab'] = data[column].map(labels[column])

# Preview the dataframe
display(data.head())

# Show the example of media hours and its labels (which is one where the
# numeric version is not intuitive)
media_example = data[['media_hours', 'media_hours_lab']]
display(media_example.drop_duplicates())

Unnamed: 0,gender,transgender,sexual_orientation,neurodivergent,birth_parent1,birth_parent2,birth_you,birth_you_age,autonomy_pressure,autonomy_express,...,peer_talk_listen_lab,peer_talk_helpful_lab,peer_talk_if_lab,accept_peer_lab,year_group_lab,fsm_lab,sen_lab,ethnicity_lab,english_additional_lab,school_lab
1,4.0,,4.0,2.0,2.0,2.0,,11.0,4.0,4.0,...,,,Comfortable,Mostly,Year 10,FSM,SEN,White British,Yes,School E
2,1.0,,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,...,,,Very uncomfortable,Not at all,Year 8,Non-FSM,,Ethnic minority,No,School A
3,2.0,2.0,2.0,1.0,1.0,,1.0,5.0,2.0,2.0,...,,,Uncomfortable,Slightly,Year 8,Non-FSM,Non-SEN,Ethnic minority,No,School B
4,2.0,2.0,2.0,,1.0,1.0,,4.0,2.0,2.0,...,Not at all,Not helpful,,Not at all,Year 8,Non-FSM,Non-SEN,Ethnic minority,,School B
5,5.0,4.0,5.0,,3.0,3.0,3.0,12.0,4.0,,...,,Very helpful,,,Year 8,FSM,SEN,White British,Yes,School F


Unnamed: 0,media_hours,media_hours_lab
1,6.0,4 to 5 hours
2,1.0,
3,3.0,1 to 2 hours
4,,
5,7.0,5 to 6 hours
7,9.0,7 hours or more
9,4.0,2 to 3 hours
11,2.0,Less than 1 hour
16,5.0,3 to 4 hours
19,8.0,6 to 7 hours


## Save data

In [18]:
# Save the synthetic data to CSV (index=False, so the row index is not
# written to the file)
data.to_csv(os.path.join(paths.data, paths.synthetic_data), index=False)