# Create synthetic data

Create synthetic data using:
* Headings from cleaned data extract from REDCap
* Dictionaries that were used to create the cleaned data extract and include definitions of what values are present
* Anticipated columns from council

## Set-up

### Packages and file paths

In [1]:
# Import required packages
from dataclasses import dataclass
import os
import pandas as pd
import random

from kailo_beewell_dashboard.response_labels import create_symbol_response_label_dict

In [2]:
# File paths
@dataclass(frozen=True)
class Paths:
    '''Stores paths to data and files.

    Every attribute is annotated so that it is a genuine dataclass field:
    it appears in the repr, takes part in equality comparisons, and can be
    overridden for a single instance, e.g. Paths(data='some/other/folder').
    Calling Paths() with no arguments gives the default locations below.
    '''
    # Folder that the headings file is read from and the output is saved to
    data: str = '../data/survey_data'
    # CSV file whose column names are used as the survey headings
    headings: str = 'headings_symbol.csv'
    # CSV file that the synthetic dataset is written to
    synthetic_data: str = 'synthetic_data_raw.csv'


paths = Paths()

### Define sample size

In [3]:
# Number of rows to generate in the synthetic dataset
sample_size = 60

## Create synthetic data

### Import headings

In [4]:
# Read the REDCap headings file - only its column names are needed, and the
# 'Unnamed: 0' column is dropped as it is not a survey heading
headings_path = os.path.join(paths.data, paths.headings)
redcap_columns = pd.read_csv(headings_path).drop(columns=['Unnamed: 0']).columns

# Columns that we anticipate receiving from the council
council_columns = ['gender', 'year_group', 'fsm', 'sen', 'ethnicity',
                   'english_additional', 'school']

# Combine into a single list of headings for the synthetic data
headings = [*redcap_columns, *council_columns]

# View headings
headings

['symbol_family',
 'symbol_home',
 'symbol_friends',
 'symbol_choice',
 'symbol_things',
 'symbol_health',
 'symbol_future',
 'symbol_school',
 'symbol_free',
 'symbol_life',
 'gender',
 'year_group',
 'fsm',
 'sen',
 'ethnicity',
 'english_additional',
 'school']

### Import dictionary of response labels

This uses a function imported from the `kailo_beewell_dashboard` package, as we use this dictionary in multiple places for the dashboard, so it makes more sense to define it in one place and pull from there.

In [5]:
# Build the dictionary of response labels: column name -> {value: label}
labels = create_symbol_response_label_dict()
# View labels
labels

{'symbol': {1: 'Happy', 2: 'Ok', 3: 'Sad'},
 'gender': {0: 'Male', 1: 'Female'},
 'year_group': {7: 'Year 7',
  8: 'Year 8',
  9: 'Year 9',
  10: 'Year 10',
  11: 'Year 11'},
 'fsm': {0: 'Non-FSM', 1: 'FSM'},
 'sen': {0: 'Non-SEN', 1: 'SEN'},
 'ethnicity': {1: 'Ethnic minority', 2: 'White British'},
 'english_additional': {0: 'No', 1: 'Yes'},
 'school': {1: 'School A', 2: 'School B'},
 'symbol_family': {1: 'Happy', 2: 'Ok', 3: 'Sad'},
 'symbol_home': {1: 'Happy', 2: 'Ok', 3: 'Sad'},
 'symbol_friends': {1: 'Happy', 2: 'Ok', 3: 'Sad'},
 'symbol_choice': {1: 'Happy', 2: 'Ok', 3: 'Sad'},
 'symbol_things': {1: 'Happy', 2: 'Ok', 3: 'Sad'},
 'symbol_health': {1: 'Happy', 2: 'Ok', 3: 'Sad'},
 'symbol_future': {1: 'Happy', 2: 'Ok', 3: 'Sad'},
 'symbol_school': {1: 'Happy', 2: 'Ok', 3: 'Sad'},
 'symbol_free': {1: 'Happy', 2: 'Ok', 3: 'Sad'},
 'symbol_life': {1: 'Happy', 2: 'Ok', 3: 'Sad'}}

### Randomly sample values from dictionary to populate each column

In [6]:
# Fix the seed so the same synthetic values are drawn on every run
random.seed(10)

# For each heading, draw sample_size values (with replacement) from the keys
# of that heading's label dictionary. Columns are drawn in heading order and
# rows are numbered from 1 up to the sample size.
data = pd.DataFrame(
    {heading: random.choices(list(labels[heading]), k=sample_size)
     for heading in headings},
    index=range(1, sample_size + 1))

data.head()

Unnamed: 0,symbol_family,symbol_home,symbol_friends,symbol_choice,symbol_things,symbol_health,symbol_future,symbol_school,symbol_free,symbol_life,gender,year_group,fsm,sen,ethnicity,english_additional,school
1,2,2,1,1,3,1,1,2,2,3,0,11,1,1,1,1,2
2,2,1,1,2,2,3,2,3,2,1,1,10,0,0,1,1,1
3,2,1,2,2,2,1,2,2,1,1,1,11,1,0,2,1,2
4,1,2,3,3,2,1,2,1,2,3,1,11,0,1,2,1,2
5,3,1,2,2,2,2,1,2,2,1,1,10,1,0,2,0,2


### Input some random missing data

The school column is excluded from this: every pupil should have a school, so a missing school would indicate a problem with data processing at some point (which shouldn't be possible).

In [7]:
# Seed Python's RNG, which decides how many values go missing per column
missing_seed = 10
random.seed(missing_seed)

# For each of the columns except school...
for i, col in enumerate(data.drop('school', axis=1).columns):

    # Randomly choose how many missing data (never more than there are rows)
    n_missing = random.randint(0, min(10, len(data)))

    # Keep a random subset of the values. Assigning the subset back aligns on
    # the index, so the rows that were not sampled become NaN.
    # pandas sampling is not controlled by random.seed(), so random_state is
    # passed explicitly to make the choice of rows reproducible between runs
    # (offset by the column position so columns don't all lose the same rows).
    data[col] = data[col].sample(n=len(data) - n_missing,
                                 random_state=missing_seed + i)

data.head()

Unnamed: 0,symbol_family,symbol_home,symbol_friends,symbol_choice,symbol_things,symbol_health,symbol_future,symbol_school,symbol_free,symbol_life,gender,year_group,fsm,sen,ethnicity,english_additional,school
1,2.0,2,1.0,1.0,,1,1.0,2.0,2.0,3.0,,11.0,1,1.0,1.0,1.0,2
2,2.0,1,1.0,2.0,2.0,3,2.0,3.0,2.0,1.0,,10.0,0,,1.0,1.0,1
3,2.0,1,2.0,2.0,2.0,1,2.0,2.0,1.0,1.0,,11.0,1,0.0,2.0,1.0,2
4,1.0,2,3.0,3.0,2.0,1,2.0,1.0,,3.0,1.0,11.0,0,1.0,,1.0,2
5,3.0,1,,2.0,2.0,2,,2.0,,1.0,1.0,10.0,1,,2.0,0.0,2


### Input some non-random missing data

I want to introduce some scenarios to check they are being managed appropriately, such as a school not having any of a certain year group.

In [8]:
# Rows belonging to school 1 (School A)
in_school_a = data['school'].eq(1)

# Recode any Year 7 pupils at that school as Year 8, so it has no Year 7s
in_year_7 = data['year_group'].eq(7)
data.loc[in_school_a & in_year_7, 'year_group'] = 8

# View resultant value counts (including missing) for that school
school_a_years = data.loc[in_school_a, 'year_group']
school_a_years.value_counts(dropna=False).sort_index()

year_group
8.0     11
9.0      3
10.0     8
11.0     6
NaN      2
Name: count, dtype: int64

## Create label columns

This utilises the labels dictionary that we imported above.

In [9]:
# Take a snapshot of the value columns before adding anything, so that we
# are not looping over the dataframe while inserting new columns into it.
# Existing '_lab' columns are skipped so the cell can safely be re-run.
value_columns = [col for col in data.columns if not col.endswith('_lab')]

# For each column, find the label dictionary
for column in value_columns:
    label_dict = labels[column]
    # Create the label column by mapping values using the labelling
    # dictionary (values with no entry, including NaN, are left as NaN)
    data[column + '_lab'] = data[column].map(label_dict)

data.head()

Unnamed: 0,symbol_family,symbol_home,symbol_friends,symbol_choice,symbol_things,symbol_health,symbol_future,symbol_school,symbol_free,symbol_life,...,symbol_school_lab,symbol_free_lab,symbol_life_lab,gender_lab,year_group_lab,fsm_lab,sen_lab,ethnicity_lab,english_additional_lab,school_lab
1,2.0,2,1.0,1.0,,1,1.0,2.0,2.0,3.0,...,Ok,Ok,Sad,,Year 11,FSM,SEN,Ethnic minority,Yes,School B
2,2.0,1,1.0,2.0,2.0,3,2.0,3.0,2.0,1.0,...,Sad,Ok,Happy,,Year 10,Non-FSM,,Ethnic minority,Yes,School A
3,2.0,1,2.0,2.0,2.0,1,2.0,2.0,1.0,1.0,...,Ok,Happy,Happy,,Year 11,FSM,Non-SEN,White British,Yes,School B
4,1.0,2,3.0,3.0,2.0,1,2.0,1.0,,3.0,...,Happy,,Sad,Female,Year 11,Non-FSM,SEN,,Yes,School B
5,3.0,1,,2.0,2.0,2,,2.0,,1.0,...,Ok,,Happy,Female,Year 10,FSM,,White British,No,School B


## Save data

In [10]:
# Save the synthetic data as a CSV in the data folder (without the row index)
synthetic_data_path = os.path.join(paths.data, paths.synthetic_data)
data.to_csv(synthetic_data_path, index=False)