# Create synthetic data

Create synthetic data using:
* Headings from cleaned data extract from REDCap
* Dictionaries that were used to create the cleaned data extract and include definitions of what values are present

## Set-up

### Packages, warnings and file paths

In [1]:
# Import required packages
from dataclasses import dataclass
import geopandas as gpd
import numpy as np
import os
import pandas as pd
import random

In [2]:
# Ignore pandas warning as not relevant as not needing high performance
from warnings import simplefilter
simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [3]:
# File paths
@dataclass(frozen=True)
class Paths:
    '''Stores paths to data and files'''
    data = './data'
    headings = 'survey_data/headings.csv'
    msoa_shp = 'MSOA_2021_EW_BFC/MSOA_2021_EW_BFC_V6.shp'
    nd_msoa_shp = 'msoa_nd_2021/msoa_nd_2021.shp'
    synthetic_data = 'survey_data/synthetic_data_raw.csv'


paths = Paths()

### Define sample size

In [4]:
sample_size = 800
school_n = 7

## Create synthetic data

### Import headings

In [5]:
headings = pd.read_csv(os.path.join(paths.data, paths.headings))
headings = headings.drop(['record_id', 'media_error'], axis=1).columns
headings

Index(['gender', 'transgender', 'sexual_orientation', 'neurodivergent',
       'birth_parent1', 'birth_parent2', 'birth_you', 'birth_you_age',
       'autonomy_pressure', 'autonomy_express',
       ...
       'social_support_lab', 'social_hard_lab', 'bully_physical_lab',
       'bully_other_lab', 'bully_cyber_lab', 'peer_talk_lab',
       'peer_talk_listen_lab', 'peer_talk_helpful_lab', 'peer_talk_if_lab',
       'accept_peer_lab'],
      dtype='object', length=268)

### Add copies of the labels dictionaries

In [6]:
# Define the labels to use for different columns
labels = {
    'gender': {
        1: 'Girl',
        2: 'Boy',
        3: 'Non-binary',
        4: 'I describe myself in another way',
        5: 'Currently unsure',
        6: 'Prefer not to say'
    },
    'transgender': {
        1: 'Yes',
        2: 'No',
        3: 'Prefer not to say',
        4: 'I describe myself in another way',
        5: 'Not sure',
    },
    'sexual_orientation': {
        1: 'Bi/pansexual',
        2: 'Gay/lesbian',
        3: 'Heterosexual/straight',
        4: 'I describe myself in another way',
        5: 'Currently unsure',
        6: 'Prefer not to say'
    },
    'neurodivergent': {
        1: 'Yes',
        2: 'No',
        3: 'Unsure'
    },
    'birth': {
        1: 'Yes',
        2: 'No',
        3: "I don't know"
    },
    'birth_you_age': {
        1: 'Under 1 year old',
        2: '1 year old',
        3: '2 years old',
        4: '3 years old',
        5: '4 years old',
        6: '5 years old',
        7: '6 years old',
        8: '7 years old',
        9: '8 years old',
        10: '9 years old',
        11: '10 years old',
        12: '11 years old',
        13: '12 years old',
        14: '13 years old',
        15: '14 years old',
        16: '15 years old'
    },
    'autonomy': {
        1: '1 - Completely not true',
        2: '2',
        3: '3',
        4: '4',
        5: '5 - Completely true'
    },
    'life_satisfaction':{
        0: '0 - not at all',
        1: '1',
        2: '2',
        3: '3',
        4: '4',
        5: '5',
        6: '6',
        7: '7',
        8: '8',
        9: '9',
        10: '10 - completely'
    },
    'optimism_future': {
        1: 'Almost never',
        2: 'Sometimes',
        3: 'Often',
        4: 'Very often',
        5: 'Always'
    },
    'optimism_other': {
        1: 'Not at all like me',
        2: 'A little like me',
        3: 'Somewhat like me',
        4: 'Mostly like me',
        5: 'Very much like me'
    },
    'wellbeing': {
        1: 'None of the time',
        2: 'Rarely',
        3: 'Some of the time',
        4: 'Often',
        5: 'All of the time'
    },
    'esteem': {
        1: 'Strongly agree',
        2: 'Agree',
        3: 'Disagree',
        4: 'Strongly disagree'
    },
    'stress': {
        1: 'Never',
        2: 'Almost Never',
        3: 'Sometimes',
        4: 'Fairly Often',
        5: 'Very Often'
    },
    'appearance_happy': {
        0: '0 - Very unhappy',
        1: '1',
        2: '2',
        3: '3',
        4: '4',
        5: '5 - Not happy or unhappy',
        6: '6',
        7: '7',
        8: '8',
        9: '9',
        10: '10 - Very happy',
        11: 'Prefer not to say'
    },
    'appearance_feel': {
        1: 'Strongly agree',
        2: 'Agree',
        3: 'Disagree',
        4: 'Strongly disagree',
        5: 'Prefer not to say'
    },
    'negative': {
        1: 'Never',
        2: 'Sometimes',
        3: 'Always'
    },
    'lonely': {
        1: 'Often or always',
        2: 'Some of the time',
        3: 'Occasionally',
        4: 'Hardly ever',
        5: 'Never'
    },
    'support': {
        1: 'Strongly agree',
        2: 'Agree',
        3: 'Disagree',
        4: 'Strongly disagree'
    },
    'physical_days': {
        0: '0 days',
        1: '1 day',
        2: '2 days',
        3: '3 days',
        4: '4 days',
        5: '5 days',
        6: '6 days',
        7: '7 days'
    },
    'physical_hours': {
        30: 'Around 30 minutes',
        60: 'Around 1 hour',
        90: 'Around 1.5 hours',
        120: 'Around 2 hours or more'
    },
    'free_like': {
        1: 'Almost always',
        2: 'Often',
        3: 'Sometimes',
        4: 'Not often',
        5: 'Almost never'
    },
    'media_hours': {
        1: 'None',
        2: 'Less than 1 hour',
        3: '1 hour to less than 2 hours',
        4: '2 hours to less than 3 hours',
        5: '3 hours to less than 4 hours',
        6: '4 hours to less than 5 hours',
        7: '5 hours to less than 6 hours',
        8: '6 hours to less than 7 hours',
        9: '7 hours or more'
    },
    'sleep': {
        0: 'No',
        1: 'Yes'
    },
    'places_freq': {
        1: 'None',
        2: 'Limited',
        3: 'Several',
        4: 'Lots'
    },
    'places_barriers': {
        0: 'No',
        1: 'Yes'
    },
    'school_belong': {
        1: 'Not at all',
        2: 'A little',
        3: 'Somewhat',
        4: 'Quite a bit',
        5: 'A lot'
    },
    'relationships': {
        1: '1 - Never',
        2: '2',
        3: '3',
        4: '4',
        5: '5 - Always'
    },
    'talk': {
        0: 'No',
        1: 'Yes'
    },
    'talk_listen': {
        1: 'Not at all',
        2: 'Slightly',
        3: 'Mostly',
        4: 'Fully'
    },
    'talk_helpful': {
        1: 'Not helpful',
        2: 'Somewhat helpful',
        3: 'Very helpful'
    },
    'talk_if': {
        1: 'Very uncomfortable',
        2: 'Uncomfortable',
        3: 'Comfortable',
        4: 'Very comfortable'
    },
    'accept': {
        1: 'Not at all',
        2: 'Slightly',
        3: 'Mostly',
        4: 'Fully'
    },
    'home_happy': {
        0: '0 - Very unhappy',
        1: '1',
        2: '2',
        3: '3',
        4: '4',
        5: '5 - Not happy or unhappy',
        6: '6',
        7: '7',
        8: '8',
        9: '9',
        10: '10 - Very happy'
    },
    'care_experience': {
        1: 'Yes',
        0: 'No',
        2: 'Unsure'
    },
    'young_carer': {
        0: 'No',
        1: 'Yes'
    },
    'local_safe': {
        1: 'Very safe',
        2: 'Fairly safe',
        3: 'Fairly unsafe',
        4: 'Very unsafe',
        5: "Don't know"
    },
    'local_other': {
        1: 'Strongly agree',
        2: 'Agree',
        3: 'Neither agree nor disagree',
        4: 'Disagree',
        5: 'Strongly disagree'
    },
    'discrim': {
        1: 'Often or always',
        2: 'Some of the time',
        3: 'Occasionally',
        4: 'Hardly ever',
        5: 'Never'
    },
    'belong_local': {
        1: 'Strongly agree',
        2: 'Agree',
        3: 'Disagree',
        4: 'Strongly disagree'
    },
    'wealth': {
        1: 'Richer',
        2: 'Poorer',
        3: 'About the same',
        4: "Don't know"
    },
    'future_options': {
        1: 'Not many',
        2: 'Quite a few',
        3: 'A lot',
        4: 'Unsure'
    },
    'future_interest': {
        1: 'Not interested',
        2: 'A little interested',
        3: 'Quite interested',
        4: 'Very interested',
        5: 'Unsure'
    },
    'future_support': {
        1: 'Not at all',
        2: 'Slightly',
        3: 'Mostly',
        4: 'Fully',
        5: 'Unsure'
    },
    'climate': {
        1: 'Often',
        2: 'Sometimes',
        3: 'Rarely',
        4: 'Never'
    },
    'social': {
        1: 'Not at all',
        2: 'A little',
        3: 'Somewhat',
        4: 'Quite a bit',
        5: 'A lot'
    },
    'bully': {
        1: 'Not bullied at all',
        2: '1-3 times in the last 6 months',
        3: '4 or more times in the last 6 months',
        4: 'A few times a week'
    }
}

In [7]:
def add_keys(keys, value):
    '''
    Add multiple keys with the same value to the dictionary
    Inputs:
    keys: Array with the keys
    value: String which is the value for all the keys
    '''
    not_identical.update(dict.fromkeys(keys, value))

    
not_identical = {}

add_keys(['birth_parent1', 'birth_parent2', 'birth_you'], 'birth')
add_keys(['autonomy_pressure', 'autonomy_express', 'autonomy_decide',
          'autonomy_told', 'autonomy_myself', 'autonomy_choice'], 'autonomy')
add_keys(['optimism_best', 'optimism_good', 'optimism_work'], 'optimism_other')
add_keys(['wellbeing_optimistic', 'wellbeing_useful', 'wellbeing_relaxed',
          'wellbeing_problems', 'wellbeing_thinking', 'wellbeing_close',
          'wellbeing_mind'], 'wellbeing')
add_keys(['esteem_satisfied', 'esteem_qualities', 'esteem_well',
          'esteem_value', 'esteem_good'], 'esteem')
add_keys(['stress_control', 'stress_overcome', 'stress_confident',
          'stress_way'], 'stress')
add_keys(['negative_lonely', 'negative_unhappy', 'negative_like',
          'negative_cry', 'negative_school', 'negative_worry', 'negative_sleep',
          'negative_wake', 'negative_shy', 'negative_scared'], 'negative')
add_keys(['support_ways', 'support_look'], 'support')
add_keys(['places_barriers___1', 'places_barriers___2', 'places_barriers___3',
          'places_barriers___4', 'places_barriers___5', 'places_barriers___6',
          'places_barriers___7', 'places_barriers___8', 'places_barriers___9'],
         'places_barriers')
add_keys(['staff_interest', 'staff_believe', 'staff_best', 'staff_listen',
          'home_interest', 'home_believe', 'home_best', 'home_listen'],
          'relationships')
add_keys(['staff_talk', 'home_talk', 'peer_talk'], 'talk')
add_keys(['staff_talk_listen', 'home_talk_listen', 'peer_talk_listen'], 'talk_listen')
add_keys(['staff_talk_helpful', 'home_talk_helpful', 'peer_talk_helpful'], 'talk_helpful')
add_keys(['staff_talk_if', 'home_talk_if', 'peer_talk_if'], 'talk_if')
add_keys(['accept_staff', 'accept_home', 'accept_local',
          'accept_peer'], 'accept')
add_keys(['local_support', 'local_trust', 'local_neighbours',
          'local_places'], 'local_other')
add_keys(['discrim_race', 'discrim_gender', 'discrim_orientation',
          'discrim_disability', 'discrim_faith'], 'discrim')
add_keys(['social_along', 'social_time', 'social_support', 'social_hard'], 'social')
add_keys(['bully_physical', 'bully_other', 'bully_cyber'], 'bully')

### Randomly sample values from dictionary to populate each column

In [8]:
# Identify measures that would be output by REDCap (drop those I calculated)
measures = headings[~np.logical_or(headings.str.endswith('_score'),
                                   headings.str.endswith('_lab'))]

# Initialise dataframe with row number matching sample size
data = pd.DataFrame(index=range(1, sample_size+1))

# Set random seed to ensure reproducibility
random.seed(42)

for col in measures:

    # Identify appropriate dictionary of keys and values
    if col in labels:
        label_dict = labels[col]
    elif col in not_identical:
        label_dict = labels[not_identical[col]]

    # Exceptions for time on social media columns
    elif col == 'media_interact':
        data[col] = random.choices(np.arange(0, 101), k=sample_size)
    elif col == 'media_browse':
        data[col] = 100 - data['media_interact']
    elif col == 'media total':
        data[col] = data['media_interact'] + data['media_browse']

    # Randomly sample with replacement from the possible values for gender
    data[col] = random.choices(list(label_dict.keys()), k=sample_size)

In [9]:
data.head()

Unnamed: 0,gender,transgender,sexual_orientation,neurodivergent,birth_parent1,birth_parent2,birth_you,birth_you_age,autonomy_pressure,autonomy_express,...,social_support,social_hard,bully_physical,bully_other,bully_cyber,peer_talk,peer_talk_listen,peer_talk_helpful,peer_talk_if,accept_peer
1,4,2,6,3,2,1,2,1,2,5,...,2,2,3,2,3,1,4,2,1,1
2,1,2,1,3,3,2,3,8,4,2,...,2,5,1,4,2,0,3,3,1,2
3,2,3,4,1,1,1,1,1,5,4,...,3,1,2,3,1,0,3,3,4,1
4,2,5,5,2,2,2,1,3,1,2,...,2,5,1,3,4,0,4,2,2,3
5,5,3,4,1,1,3,3,2,5,2,...,5,1,4,1,4,1,2,2,2,1


### Add synthetic data for variables that expect to receive from council

In [10]:
council_labels = {
    'year_group': {
        8: 8,
        10: 10
    },
    'fsm': {
        0: 'No',
        1: 'Yes'
    },
    'sen': {
        0: 'No',
        1: 'Yes'
    },
    'ethnicity': {
        1: 'ethnic_minority',
        2: 'white_british'
    },
    'english_additional': {
        0: 'No',
        1: 'Yes'
    },
    'school': {
        1: 'School A',
        2: 'School B',
        3: 'School C',
        4: 'School D',
        5: 'School E',
        6: 'School F',
        7: 'School G'
    }
}

In [11]:
for var in council_labels.keys():
    data[var] = random.choices(list(council_labels[var].keys()), k=sample_size)

In [12]:
data.head()

Unnamed: 0,gender,transgender,sexual_orientation,neurodivergent,birth_parent1,birth_parent2,birth_you,birth_you_age,autonomy_pressure,autonomy_express,...,peer_talk_listen,peer_talk_helpful,peer_talk_if,accept_peer,year_group,fsm,sen,ethnicity,english_additional,school
1,4,2,6,3,2,1,2,1,2,5,...,4,2,1,1,10,0,0,1,0,5
2,1,2,1,3,3,2,3,8,4,2,...,3,3,1,2,10,0,0,1,0,4
3,2,3,4,1,1,1,1,1,5,4,...,3,3,4,1,10,0,0,2,0,5
4,2,5,5,2,2,2,1,3,1,2,...,4,2,2,3,10,0,0,2,0,7
5,5,3,4,1,1,3,3,2,5,2,...,2,2,2,1,8,0,0,2,1,2


### Add a random MSOA for each pupil

Based on all the MSOA in Northern Devon (but in reality anticipate some regions less likely, depending on the schools participating v.s. not)

In [13]:
# Takes 28s to run - instead just import ND shapefile previously created
'''
# Import MSOA shapefile
shp_full = gpd.read_file(os.path.join(paths.data, paths.msoa_shp))

# Filter to MSOA in Northern Devon
shp_nd = shp_full[shp_full['MSOA21NM'].str.contains('North Devon|Torridge')]
shp_nd['MSOA21NM']

# Save to shapefile
shp_nd.to_file(f'{os.getcwd()}/data/msoa_nd_2021/msoa_nd_2021.shp', driver='ESRI Shapefile')
'''

"\n# Import MSOA shapefile\nshp_full = gpd.read_file(os.path.join(paths.data, paths.msoa_shp))\n\n# Filter to MSOA in Northern Devon\nshp_nd = shp_full[shp_full['MSOA21NM'].str.contains('North Devon|Torridge')]\nshp_nd['MSOA21NM']\n\n# Save to shapefile\nshp_nd.to_file(f'{os.getcwd()}/data/msoa_nd_2021/msoa_nd_2021.shp', driver='ESRI Shapefile')\n"

In [14]:
shp_nd = gpd.read_file(os.path.join(paths.data, paths.nd_msoa_shp))
shp_nd['MSOA21NM']

0     North Devon 001
1     North Devon 002
2     North Devon 003
3     North Devon 004
4     North Devon 005
5     North Devon 006
6     North Devon 007
7     North Devon 008
8     North Devon 009
9     North Devon 010
10    North Devon 011
11    North Devon 012
12    North Devon 013
13    North Devon 014
14       Torridge 001
15       Torridge 002
16       Torridge 003
17       Torridge 004
18       Torridge 005
19       Torridge 006
20       Torridge 007
21       Torridge 008
22       Torridge 009
Name: MSOA21NM, dtype: object

In [15]:
# Randomly choose an MSOA for each pupil
data['msoa'] = random.choices(list(shp_nd['MSOA21NM']), k=sample_size)

In [16]:
data.head()

Unnamed: 0,gender,transgender,sexual_orientation,neurodivergent,birth_parent1,birth_parent2,birth_you,birth_you_age,autonomy_pressure,autonomy_express,...,peer_talk_helpful,peer_talk_if,accept_peer,year_group,fsm,sen,ethnicity,english_additional,school,msoa
1,4,2,6,3,2,1,2,1,2,5,...,2,1,1,10,0,0,1,0,5,North Devon 006
2,1,2,1,3,3,2,3,8,4,2,...,3,1,2,10,0,0,1,0,4,North Devon 006
3,2,3,4,1,1,1,1,1,5,4,...,3,4,1,10,0,0,2,0,5,North Devon 014
4,2,5,5,2,2,2,1,3,1,2,...,2,2,3,10,0,0,2,0,7,North Devon 009
5,5,3,4,1,1,3,3,2,5,2,...,2,2,1,8,0,0,2,1,2,Torridge 003


### Input some random missing data

In [17]:
for col in data.columns:
    # Randomly choose how many missing data
    n_missing = random.randint(0, 100)

    # Drop that number of values, setting as NaN
    data[col] = data[col].sample(n=len(data) - n_missing)

## Create score columns

In [18]:
def sum_score(df):
    '''
    Find the sum of the provided columns. If any of the required columns contain,
    NaN, it will just return NaN as the result
    Inputs:
    df - pandas DataFrame, just containing the columns you want to sum
    '''
    # Convert to numeric, find sum and return
    return(df.sum(axis=1, skipna=False))

In [19]:
# Gender, transgender, sexual orientation, neurodivergence, and yes/no
# of whether born in UK are not converted to scores

# Age when moved to UK is not used as a "score" persay, but needs
# adjusting to allow you to summarise it. We move age to centre of bucket
# e.g. 1 year old = 1.5, 2 year old = 2.5
# As they are in dataframe, 1 = Under 1, 2 = 1 year old, so subtract .5
# Convert to numeric (can use sum_score()) then subtract .5
data['birth_you_age_score'] = (
    sum_score(pd.DataFrame(data['birth_you_age'])) - 0.5)

In [20]:
# Autonomy
data['autonomy_score'] = sum_score(
    data[['autonomy_pressure', 'autonomy_express', 'autonomy_decide',
          'autonomy_told', 'autonomy_myself', 'autonomy_choice']])

# Life satisfaction requires no changes
data['life_satisfaction_score'] = data['life_satisfaction']

# Optimism
data['optimism_score'] = sum_score(
    data[['optimism_future', 'optimism_best', 'optimism_good', 'optimism_work']])

In [21]:
# Psychological wellbeing
data['wellbeing_score'] = sum_score(
    data[['wellbeing_optimistic', 'wellbeing_useful', 'wellbeing_relaxed',
          'wellbeing_problems', 'wellbeing_thinking', 'wellbeing_close',
          'wellbeing_mind']])

# Self-esteem requires reversed scoring
data['esteem_score'] = sum_score(
    data[['esteem_satisfied', 'esteem_qualities', 'esteem_well',
          'esteem_value', 'esteem_good']].apply(lambda x: x.map(
    {1: 4,
     2: 3,
     3: 2,
     4: 1})))

# Stress requires numbering to start at 0
data['stress_score'] = sum_score(
    data[['stress_control', 'stress_overcome', 'stress_confident',
         'stress_way']] - 1)

# Appearance uses first question, excluding 'prefer not to say'
data['appearance_score'] = data['appearance_happy'].replace(11, np.nan)

In [22]:
# Negative affect requires numbering to start at 0
data['negative_score'] = sum_score(
    data[['negative_lonely', 'negative_unhappy', 'negative_like',
          'negative_cry', 'negative_school', 'negative_worry', 'negative_sleep',
          'negative_wake', 'negative_shy', 'negative_scared']] - 1)

# Loneliness
# TO DO

# Supporting your wellbeing
data['support_score'] = sum_score(data[['support_ways', 'support_look']])

In [23]:
# Sleep is based on proportion answering 1/Yes so no change required
data['sleep_score'] = data['sleep']

# Physical activity multiplies days by average time per day (which is in min)
data['physical_score'] = data['physical_days']*data['physical_hours']

In [24]:
# Free time/time use is based on proportion responding almost always or often
data['free_like_score'] = data['free_like'].map({1: 1, 2: 1,
                                                 3: 0, 4: 0, 5: 0})
    
# Use of social media requires scores of 0-8 (rather than 1-9)
data['media_score'] = data['media_hours'] - 1

# Places to go and things to do is based on proportion responding several or
# lots to the first question (second question can't be used as score)
data['places_score'] = data['places_freq'].map({1: 0, 2: 0,
                                                3: 1, 4: 1})

In [25]:
for prefix in ['staff', 'home', 'peer']:
    # Create the help/listen scores
    data[f'{prefix}_talk_listen_helpful'] = (
    data[f'{prefix}_talk_listen'] +
    data[f'{prefix}_talk_helpful'].map({1: 1, 2: 2.5, 3: 4})) / 2

    # Create score column where choosen "help/listen" or "if" depending on answer to talk
    data[f'{prefix}_talk_score'] = np.where(
        data[f'{prefix}_talk']==1,
        data[f'{prefix}_talk_listen_helpful'],
        data[f'{prefix}_talk_if'])

In [26]:
# Create overall score from sum of staff, home and peer scores
data['talk_score'] = data['staff_talk_score'] + data['home_talk_score'] + data['peer_talk_score']

In [27]:
# Drop columns that were used to calculate scores
data = data.drop(['staff_talk_listen_helpful', 'home_talk_listen_helpful', 'peer_talk_listen_helpful'], axis=1)

In [28]:
data['accept_score'] = sum_score(data[['accept_staff', 'accept_home', 'accept_local', 'accept_peer']])

In [29]:
# School connection
data['school_belong_score'] = data['school_belong']

# Relationships with staff
data['staff_relationship_score'] = sum_score(data[['staff_interest', 'staff_believe', 'staff_best', 'staff_listen']])

In [30]:
# Relationship with parents/carers
data['home_relationship_score'] = sum_score(data[['home_interest', 'home_believe', 'home_best', 'home_listen']])

In [31]:
# Home environment
data['home_happy_score'] = data['home_happy']

# Caring responsibilities and care experience aren't converted to scores

# Local environment
# First question has four responses and one "don't know" (which convert to np.nan)
# We rescale to range from 1 to 5 to match remaining questions which have 1,2,3,4,5 as responses
data['local_safe_rescaled'] = data['local_safe'].map({
    1: 1,
    2: 2 + 1/3,
    3: 3 + 2/3,
    4: 5,
    5: np.nan})
data['local_env_score'] = sum_score(
    data[['local_safe_rescaled', 'local_support', 'local_trust', 'local_neighbours', 'local_places']])
data = data.drop('local_safe_rescaled', axis=1)

# Discrimination
# Proportion who respond often or always / some of the time / occassionally to any of the five questions
# They're not required to have responded to all five, just need to have given one of those responses
# to at least one of those questions
# Identify relevant columns
discrim_col = ['discrim_race', 'discrim_gender', 'discrim_orientation', 'discrim_disability', 'discrim_faith']
# Find if any of them are one of those responses
data['discrim_score'] = data[discrim_col].isin([1, 2, 3]).any(axis=1)
# Set to NaN if all responses were NaN
data.loc[data[discrim_col].isnull().all(axis=1), 'discrim_score'] = np.nan

# Belonging
# Proportion who respond strongly agree or agree
data['belong_local_score'] = data['belong_local'].map({1: 1, 2: 1,
                                                       3: 0, 4: 0})

# Relative wealth
# Proportion who feel about the same as friends, excluding "don't know"
data['wealth_score'] = data['wealth'].map({1: 0, 2: 1, 3: 0, 4: np.nan})

In [32]:
# Work, education and training opportunities
# Rescale future options so 1-5 (matching future interest and support)
# For all, setting the "unsure" option to np.nan
data['future_score'] = (
    data['future_options'].map({
        1: 1,
        2: 2.5,
        3: 4,
        4: np.nan}) +
    data['future_interest'].replace(5, np.nan) +
    data['future_support'].replace(5, np.nan)
)

# Climate change
# Proportion responding often or sometimes
data['climate_score'] = data['climate'].map({1: 1, 2: 1, 3: 0, 4: 0})

In [33]:
# Friendships and social support
data['social_score'] = sum_score(data[['social_along', 'social_time', 'social_support', 'social_hard']])

# Bullying
data['bully_score'] = sum_score(data[['bully_physical', 'bully_other', 'bully_cyber']])

In [34]:
data.head()

Unnamed: 0,gender,transgender,sexual_orientation,neurodivergent,birth_parent1,birth_parent2,birth_you,birth_you_age,autonomy_pressure,autonomy_express,...,home_relationship_score,home_happy_score,local_env_score,discrim_score,belong_local_score,wealth_score,future_score,climate_score,social_score,bully_score
1,4.0,2.0,6.0,3.0,2.0,1.0,2.0,1.0,2.0,5.0,...,9.0,3.0,11.0,True,0.0,,,0.0,13.0,8.0
2,1.0,2.0,1.0,,3.0,2.0,3.0,8.0,,2.0,...,,0.0,20.333333,True,0.0,,,1.0,12.0,7.0
3,2.0,3.0,4.0,1.0,1.0,1.0,,1.0,5.0,4.0,...,,4.0,,True,1.0,1.0,,1.0,,6.0
4,2.0,5.0,5.0,2.0,2.0,2.0,1.0,3.0,1.0,2.0,...,,,,True,1.0,,9.0,,12.0,8.0
5,5.0,,4.0,1.0,1.0,3.0,3.0,,5.0,2.0,...,,9.0,13.666667,True,0.0,0.0,9.0,0.0,,9.0


## Create label columns

In [35]:
# Define columns that you don't want to make labels for
exclude_col = np.append(
    ['media_interact', 'media_browse', 'media_total', 'year_group', 'fsm',
     'sen', 'ethnicity', 'english_additional', 'school', 'msoa'],
    data.columns[data.columns.str.endswith('_score')].values)

# For each column, find the label dictionary (either matches the column name, or
# name found from the 'not_identical' dictionary)
for column in data.columns.drop(exclude_col):
    if column in labels:
        label_dict = labels[column]
    else:
        label_dict = labels[not_identical[column]]
    # Replace the column values based on the labelling dictionary
    data[column + '_lab'] = data[column].map(label_dict)

In [36]:
data.head()

Unnamed: 0,gender,transgender,sexual_orientation,neurodivergent,birth_parent1,birth_parent2,birth_you,birth_you_age,autonomy_pressure,autonomy_express,...,social_support_lab,social_hard_lab,bully_physical_lab,bully_other_lab,bully_cyber_lab,peer_talk_lab,peer_talk_listen_lab,peer_talk_helpful_lab,peer_talk_if_lab,accept_peer_lab
1,4.0,2.0,6.0,3.0,2.0,1.0,2.0,1.0,2.0,5.0,...,A little,A little,4 or more times in the last 6 months,1-3 times in the last 6 months,4 or more times in the last 6 months,Yes,Fully,Somewhat helpful,Very uncomfortable,Not at all
2,1.0,2.0,1.0,,3.0,2.0,3.0,8.0,,2.0,...,A little,A lot,Not bullied at all,A few times a week,1-3 times in the last 6 months,No,Mostly,Very helpful,Very uncomfortable,Slightly
3,2.0,3.0,4.0,1.0,1.0,1.0,,1.0,5.0,4.0,...,Somewhat,,1-3 times in the last 6 months,4 or more times in the last 6 months,Not bullied at all,No,Mostly,Very helpful,Very comfortable,Not at all
4,2.0,5.0,5.0,2.0,2.0,2.0,1.0,3.0,1.0,2.0,...,A little,A lot,Not bullied at all,4 or more times in the last 6 months,A few times a week,No,Fully,Somewhat helpful,Uncomfortable,Mostly
5,5.0,,4.0,1.0,1.0,3.0,3.0,,5.0,2.0,...,,Not at all,A few times a week,Not bullied at all,A few times a week,Yes,Slightly,Somewhat helpful,Uncomfortable,Not at all


## Save data

In [37]:
data.to_csv(os.path.join(paths.data, paths.synthetic_data), index=False)