# Create synthetic data

Create synthetic data using:
* Headings from cleaned data extract from REDCap
* Dictionaries that were used to create the cleaned data extract and include definitions of what values are present
* Anticipated columns from council
* List of MSOAs in Northern Devon

## Set-up

### Packages, warnings and file paths

In [1]:
# Import required packages
from dataclasses import dataclass
import geopandas as gpd
import numpy as np
import os
import pandas as pd
import random

In [2]:
# Ignore pandas warning as not relevant as not needing high performance
from warnings import simplefilter
simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [3]:
# Import functions defined elsewhere
import sys
sys.path.append('../')
from utilities.response_labels import create_response_label_dict

In [4]:
# File paths
@dataclass(frozen=True)
class Paths:
    '''Stores paths to data and files'''
    data = '../data'
    headings = 'survey_data/headings.csv'
    msoa_shp = 'MSOA_2021_EW_BFC/MSOA_2021_EW_BFC_V6.shp'
    nd_msoa_shp = 'msoa_nd_2021/msoa_nd_2021.shp'
    synthetic_data = 'survey_data/synthetic_data_raw.csv'


paths = Paths()

### Define sample size

In [5]:
sample_size = 800
school_n = 7

## Create synthetic data

### Import headings

In [6]:
# Import headings from REDCap output to use to create synthetic data
headings = pd.read_csv(os.path.join(paths.data, paths.headings))

# Drop unneeded columns
headings = headings.drop(['Unnamed: 0', 'record_id', 'media_error'], axis=1).columns

# Add anticipated columns from council
headings = list(headings) + [
    'year_group', 'fsm', 'sen', 'ethnicity', 'english_additional', 'school']

# View head and tail
print(headings[:5])
print(headings[-5:])

['gender', 'transgender', 'sexual_orientation', 'neurodivergent', 'birth_parent1']
['fsm', 'sen', 'ethnicity', 'english_additional', 'school']


### Import dictionary of responses labels

This uses a function imported from utilities, as we use this dictionary in multiple places for the dashboard, so it makes more sense to define it in one place and pull from there.

In [7]:
labels = create_response_label_dict()
len(labels)

141

### Randomly sample values from dictionary to populate each column

In [8]:
# Identify measures that would be output by REDCap (drop those I calculated)
measures = [x for x in headings if ~np.logical_or(
    x.endswith('_score'), x.endswith('_lab'))]

# Initialise dataframe with row number matching sample size
data = pd.DataFrame(index=range(1, sample_size+1))

# Set random seed to ensure reproducibility
random.seed(42)

for col in measures:

    # Identify appropriate dictionary of keys and values
    if col in labels:
        label_dict = labels[col]
    # Exceptions for time on social media columns
    elif col == 'media_interact':
        data[col] = random.choices(np.arange(0, 101), k=sample_size)
    elif col == 'media_browse':
        data[col] = 100 - data['media_interact']
    elif col == 'media total':
        data[col] = data['media_interact'] + data['media_browse']

    # Randomly sample with replacement from the possible values for gender
    data[col] = random.choices(list(label_dict.keys()), k=sample_size)

In [9]:
data.head()

Unnamed: 0,gender,transgender,sexual_orientation,neurodivergent,birth_parent1,birth_parent2,birth_you,birth_you_age,autonomy_pressure,autonomy_express,...,peer_talk_listen,peer_talk_helpful,peer_talk_if,accept_peer,year_group,fsm,sen,ethnicity,english_additional,school
1,4,2,6,3,2,1,2,1,2,5,...,4,2,1,1,10,0,0,1,0,5
2,1,2,1,3,3,2,3,8,4,2,...,3,3,1,2,10,0,0,1,0,4
3,2,3,4,1,1,1,1,1,5,4,...,3,3,4,1,10,0,0,2,0,5
4,2,5,5,2,2,2,1,3,1,2,...,4,2,2,3,10,0,0,2,0,7
5,5,3,4,1,1,3,3,2,5,2,...,2,2,2,1,8,0,0,2,1,2


### Add a random MSOA for each pupil

Based on all the MSOA in Northern Devon (but in reality anticipate some regions less likely, depending on the schools participating v.s. not)

In [10]:
# Takes 28s to run - instead just import ND shapefile previously created
'''
# Import MSOA shapefile
shp_full = gpd.read_file(os.path.join(paths.data, paths.msoa_shp))

# Filter to MSOA in Northern Devon
shp_nd = shp_full[shp_full['MSOA21NM'].str.contains('North Devon|Torridge')]
shp_nd['MSOA21NM']

# Save to shapefile
shp_nd.to_file(f'{os.getcwd()}/data/msoa_nd_2021/msoa_nd_2021.shp', driver='ESRI Shapefile')
'''

"\n# Import MSOA shapefile\nshp_full = gpd.read_file(os.path.join(paths.data, paths.msoa_shp))\n\n# Filter to MSOA in Northern Devon\nshp_nd = shp_full[shp_full['MSOA21NM'].str.contains('North Devon|Torridge')]\nshp_nd['MSOA21NM']\n\n# Save to shapefile\nshp_nd.to_file(f'{os.getcwd()}/data/msoa_nd_2021/msoa_nd_2021.shp', driver='ESRI Shapefile')\n"

In [11]:
shp_nd = gpd.read_file(os.path.join(paths.data, paths.nd_msoa_shp))
shp_nd['MSOA21NM']

0     North Devon 001
1     North Devon 002
2     North Devon 003
3     North Devon 004
4     North Devon 005
5     North Devon 006
6     North Devon 007
7     North Devon 008
8     North Devon 009
9     North Devon 010
10    North Devon 011
11    North Devon 012
12    North Devon 013
13    North Devon 014
14       Torridge 001
15       Torridge 002
16       Torridge 003
17       Torridge 004
18       Torridge 005
19       Torridge 006
20       Torridge 007
21       Torridge 008
22       Torridge 009
Name: MSOA21NM, dtype: object

In [12]:
# Randomly choose an MSOA for each pupil
data['msoa'] = random.choices(list(shp_nd['MSOA21NM']), k=sample_size)

In [13]:
data.head()

Unnamed: 0,gender,transgender,sexual_orientation,neurodivergent,birth_parent1,birth_parent2,birth_you,birth_you_age,autonomy_pressure,autonomy_express,...,peer_talk_helpful,peer_talk_if,accept_peer,year_group,fsm,sen,ethnicity,english_additional,school,msoa
1,4,2,6,3,2,1,2,1,2,5,...,2,1,1,10,0,0,1,0,5,North Devon 006
2,1,2,1,3,3,2,3,8,4,2,...,3,1,2,10,0,0,1,0,4,North Devon 006
3,2,3,4,1,1,1,1,1,5,4,...,3,4,1,10,0,0,2,0,5,North Devon 014
4,2,5,5,2,2,2,1,3,1,2,...,2,2,3,10,0,0,2,0,7,North Devon 009
5,5,3,4,1,1,3,3,2,5,2,...,2,2,1,8,0,0,2,1,2,Torridge 003


### Input some random missing data

Have chosen to exclude school from this - there should not be any pupils with no school (else there is a problem with data processing at some point, but shouldn't be possible).

In [14]:
for col in data.drop('school', axis=1).columns:

    # Randomly choose how many missing data
    n_missing = random.randint(0, 100)

    # Drop that number of values, setting as NaN
    data[col] = data[col].sample(n=len(data) - n_missing)

## Create score columns

In [15]:
def sum_score(df):
    '''
    Find the sum of the provided columns. If any of the required columns contain,
    NaN, it will just return NaN as the result
    Inputs:
    df - pandas DataFrame, just containing the columns you want to sum
    '''
    # Convert to numeric, find sum and return
    return(df.sum(axis=1, skipna=False))

In [16]:
# Gender, transgender, sexual orientation, neurodivergence, and yes/no
# of whether born in UK are not converted to scores

# Age when moved to UK is not used as a "score" persay, but needs
# adjusting to allow you to summarise it. We move age to centre of bucket
# e.g. 1 year old = 1.5, 2 year old = 2.5
# As they are in dataframe, 1 = Under 1, 2 = 1 year old, so subtract .5
# Convert to numeric (can use sum_score()) then subtract .5
data['birth_you_age_score'] = (
    sum_score(pd.DataFrame(data['birth_you_age'])) - 0.5)

In [17]:
# Autonomy
data['autonomy_score'] = sum_score(
    data[['autonomy_pressure', 'autonomy_express', 'autonomy_decide',
          'autonomy_told', 'autonomy_myself', 'autonomy_choice']])

# Life satisfaction requires no changes
data['life_satisfaction_score'] = data['life_satisfaction']

# Optimism
data['optimism_score'] = sum_score(
    data[['optimism_future', 'optimism_best', 'optimism_good', 'optimism_work']])

In [18]:
# Psychological wellbeing
data['wellbeing_score'] = sum_score(
    data[['wellbeing_optimistic', 'wellbeing_useful', 'wellbeing_relaxed',
          'wellbeing_problems', 'wellbeing_thinking', 'wellbeing_close',
          'wellbeing_mind']])

# Self-esteem requires reversed scoring
data['esteem_score'] = sum_score(
    data[['esteem_satisfied', 'esteem_qualities', 'esteem_well',
          'esteem_value', 'esteem_good']].apply(lambda x: x.map(
    {1: 4,
     2: 3,
     3: 2,
     4: 1})))

# Stress requires numbering to start at 0
data['stress_score'] = sum_score(
    data[['stress_control', 'stress_overcome', 'stress_confident',
         'stress_way']] - 1)

# Appearance uses first question, excluding 'prefer not to say'
data['appearance_score'] = data['appearance_happy'].replace(11, np.nan)

In [19]:
# Negative affect requires numbering to start at 0
data['negative_score'] = sum_score(
    data[['negative_lonely', 'negative_unhappy', 'negative_like',
          'negative_cry', 'negative_school', 'negative_worry', 'negative_sleep',
          'negative_wake', 'negative_shy', 'negative_scared']] - 1)

# Loneliness requires reversed scoring (eg. 1 often or always becomes 5)
data['lonely_score'] = data['lonely'].map({
    1: 5,
    2: 4,
    3: 3,
    4: 2,
    5: 1})

# Supporting your wellbeing
data['support_score'] = sum_score(data[['support_ways', 'support_look']])

In [20]:
# Sleep is based on proportion answering 1/Yes so no change required
data['sleep_score'] = data['sleep']

# Physical activity multiplies days by average time per day (which is in min)
data['physical_score'] = data['physical_days']*data['physical_hours']

In [21]:
# Free time/time use is based on proportion responding almost always or often
data['free_like_score'] = data['free_like'].map({1: 1, 2: 1,
                                                 3: 0, 4: 0, 5: 0})
    
# Use of social media requires scores of 0-8 (rather than 1-9)
data['media_score'] = data['media_hours'] - 1

# Places to go and things to do is based on proportion responding several or
# lots to the first question (second question can't be used as score)
data['places_score'] = data['places_freq'].map({1: 0, 2: 0,
                                                3: 1, 4: 1})

For the talking with people question, we are combining several questions with a different number of response options, and with a branding question.

Reminder of scores:
```
'talk': {
    0: 'No',
    1: 'Yes'
},
'talk_listen': {
    1: 'Not at all',
    2: 'Slightly',
    3: 'Mostly',
    4: 'Fully'
},
'talk_helpful': {
    1: 'Not helpful',
    2: 'Somewhat helpful',
    3: 'Very helpful'
},
'talk_if': {
    1: 'Very uncomfortable',
    2: 'Uncomfortable',
    3: 'Comfortable',
    4: 'Very comfortable'
}
```

Method used:
* If answer yes, it is the average of their listen (1-4) and helpful (1-3 but rescaled to 1-4) questions, giving a total of 1-4
* If answer no, it is just their answer to comfortable (1-4)
* The scores for staff, home and peer are then summed, creating an overall score of 3-12.

In [22]:
for prefix in ['staff', 'home', 'peer']:
    # Create the help/listen scores
    data[f'{prefix}_talk_listen_helpful'] = (
    data[f'{prefix}_talk_listen'] +
    data[f'{prefix}_talk_helpful'].map({1: 1, 2: 2.5, 3: 4})) / 2

    # Create score column where choosen "help/listen" or "if" depending on answer to talk
    data[f'{prefix}_talk_score'] = np.where(
        data[f'{prefix}_talk']==1,
        data[f'{prefix}_talk_listen_helpful'],
        data[f'{prefix}_talk_if'])

In [23]:
# Create overall score from sum of staff, home and peer scores
data['talk_score'] = data['staff_talk_score'] + data['home_talk_score'] + data['peer_talk_score']

In [24]:
# Drop columns that were used to calculate scores
data = data.drop(['staff_talk_listen_helpful', 'home_talk_listen_helpful', 'peer_talk_listen_helpful'], axis=1)

In [25]:
data['accept_score'] = sum_score(data[['accept_staff', 'accept_home', 'accept_local', 'accept_peer']])

In [26]:
# School connection
data['school_belong_score'] = data['school_belong']

# Relationships with staff
data['staff_relationship_score'] = sum_score(data[['staff_interest', 'staff_believe', 'staff_best', 'staff_listen']])

In [27]:
# Relationship with parents/carers
data['home_relationship_score'] = sum_score(data[['home_interest', 'home_believe', 'home_best', 'home_listen']])

In [28]:
# Home environment
data['home_happy_score'] = data['home_happy']

# Caring responsibilities and care experience aren't converted to scores

# Local environment
# First question has four responses and one "don't know" (which convert to np.nan)
# We rescale to range from 1 to 5 to match remaining questions which have 1,2,3,4,5 as responses
data['local_safe_rescaled'] = data['local_safe'].map({
    1: 1,
    2: 2 + 1/3,
    3: 3 + 2/3,
    4: 5,
    5: np.nan})
data['local_env_score'] = sum_score(
    data[['local_safe_rescaled', 'local_support', 'local_trust', 'local_neighbours', 'local_places']])
data = data.drop('local_safe_rescaled', axis=1)

# Discrimination
# Proportion who respond often or always / some of the time / occassionally to any of the five questions
# They're not required to have responded to all five, just need to have given one of those responses
# to at least one of those questions
# Identify relevant columns
discrim_col = ['discrim_race', 'discrim_gender', 'discrim_orientation', 'discrim_disability', 'discrim_faith']
# Find if any of them are one of those responses
data['discrim_score'] = (
    data[discrim_col].isin([1, 2, 3]).any(axis=1).map({True: 1, False: 0}))
# Set to NaN if all responses were NaN
data.loc[data[discrim_col].isnull().all(axis=1), 'discrim_score'] = np.nan

# Belonging
# Proportion who respond strongly agree or agree
data['belong_local_score'] = data['belong_local'].map({1: 1, 2: 1,
                                                       3: 0, 4: 0})

# Relative wealth
# Proportion who feel about the same as friends, excluding "don't know"
data['wealth_score'] = data['wealth'].map({1: 0, 2: 1, 3: 0, 4: np.nan})

In [29]:
# Work, education and training opportunities
# Rescale future options so 1-5 (matching future interest and support)
# For all, setting the "unsure" option to np.nan
data['future_score'] = (
    data['future_options'].map({
        1: 1,
        2: 2.5,
        3: 4,
        4: np.nan}) +
    data['future_interest'].replace(5, np.nan) +
    data['future_support'].replace(5, np.nan)
)

# Climate change
# Proportion responding often or sometimes
data['climate_score'] = data['climate'].map({1: 1, 2: 1, 3: 0, 4: 0})

In [30]:
# Friendships and social support
data['social_score'] = sum_score(data[['social_along', 'social_time', 'social_support', 'social_hard']])

# Bullying
data['bully_score'] = sum_score(data[['bully_physical', 'bully_other', 'bully_cyber']])

In [31]:
data.head()

Unnamed: 0,gender,transgender,sexual_orientation,neurodivergent,birth_parent1,birth_parent2,birth_you,birth_you_age,autonomy_pressure,autonomy_express,...,home_relationship_score,home_happy_score,local_env_score,discrim_score,belong_local_score,wealth_score,future_score,climate_score,social_score,bully_score
1,4.0,2.0,6.0,3.0,2.0,1.0,2.0,1.0,2.0,5.0,...,9.0,3.0,11.0,1.0,0.0,,,0.0,13.0,8.0
2,1.0,2.0,1.0,3.0,3.0,2.0,3.0,,4.0,2.0,...,7.0,0.0,20.333333,1.0,0.0,,,1.0,,7.0
3,,3.0,4.0,1.0,1.0,1.0,1.0,1.0,,4.0,...,11.0,4.0,16.0,1.0,,1.0,,1.0,11.0,6.0
4,2.0,5.0,5.0,2.0,2.0,2.0,1.0,3.0,1.0,2.0,...,14.0,9.0,,1.0,1.0,,9.0,0.0,,8.0
5,,3.0,4.0,1.0,1.0,3.0,3.0,2.0,5.0,2.0,...,13.0,9.0,13.666667,1.0,,0.0,9.0,0.0,,9.0


## Modify some results

I want to introduce some scenarios to check they are being managed appropriately, such as a school not having any of a certain year group.

In [32]:
# Set school 6/F to only have Year 8s
data.loc[(data['school'] == 6) & (data['year_group'] == 10), 'year_group'] = 8
data.loc[data['school'] == 6, 'year_group'].value_counts(dropna=False)

year_group
8.0    107
NaN      3
Name: count, dtype: int64

In [33]:
# Set school 2/B to have no SEN
data.loc[(data['school'] == 2) & (data['sen'] == 1), 'sen'] = 0
data.loc[data['school'] == 2, 'sen'].value_counts(dropna=False)

sen
0.0    117
NaN     15
Name: count, dtype: int64

## Create label columns

In [34]:
# Define columns that you don't want to make labels for
exclude_col = np.append(
    ['media_interact', 'media_browse', 'media_total', 'msoa'],
    data.columns[data.columns.str.endswith('_score')].values)

# For each column, find the label dictionary
for column in data.columns.drop(exclude_col):
    label_dict = labels[column]
    # Replace the column values based on the labelling dictionary
    data[column + '_lab'] = data[column].map(label_dict)

In [35]:
data.head()

Unnamed: 0,gender,transgender,sexual_orientation,neurodivergent,birth_parent1,birth_parent2,birth_you,birth_you_age,autonomy_pressure,autonomy_express,...,peer_talk_listen_lab,peer_talk_helpful_lab,peer_talk_if_lab,accept_peer_lab,year_group_lab,fsm_lab,sen_lab,ethnicity_lab,english_additional_lab,school_lab
1,4.0,2.0,6.0,3.0,2.0,1.0,2.0,1.0,2.0,5.0,...,Fully,Somewhat helpful,Very uncomfortable,Not at all,Year 10,Non-FSM,Non-SEN,Ethnic minority,No,School E
2,1.0,2.0,1.0,3.0,3.0,2.0,3.0,,4.0,2.0,...,Mostly,Very helpful,Very uncomfortable,Slightly,Year 10,Non-FSM,Non-SEN,Ethnic minority,,School D
3,,3.0,4.0,1.0,1.0,1.0,1.0,1.0,,4.0,...,Mostly,Very helpful,Very comfortable,Not at all,Year 10,Non-FSM,Non-SEN,White British,No,School E
4,2.0,5.0,5.0,2.0,2.0,2.0,1.0,3.0,1.0,2.0,...,Fully,Somewhat helpful,Uncomfortable,Mostly,Year 10,Non-FSM,,White British,No,School G
5,,3.0,4.0,1.0,1.0,3.0,3.0,2.0,5.0,2.0,...,Slightly,Somewhat helpful,Uncomfortable,Not at all,Year 8,Non-FSM,Non-SEN,,Yes,School B


In [36]:
data[['media_hours', 'media_hours_lab']].drop_duplicates()

Unnamed: 0,media_hours,media_hours_lab
1,5.0,3 to 4 hours
2,3.0,1 to 2 hours
3,7.0,5 to 6 hours
7,2.0,Less than 1 hour
8,6.0,4 to 5 hours
11,4.0,2 to 3 hours
13,8.0,6 to 7 hours
14,1.0,
20,9.0,7 hours or more
28,,


## Save data

In [37]:
data.to_csv(os.path.join(paths.data, paths.synthetic_data), index=False)