In [11]:
import random
import uuid

import pandas as pd
from sklearn.utils import shuffle

In [12]:
# Dataset configuration
num_rows = 1000  # Total dataset size

# (class label, share of the dataset). The shares must sum to 1.
classes_and_proportions = [
    ("certain", 0.05),
    ("likely", 0.25),
    ("possible", 0.3),
    ("unlikely", 0.1),
    ("unclassified", 0.2),
    ("unclassifiable", 0.1),
]

numerical_columns = []
categorical_columns = [
    "gender",
    "pregnancy_status",
    "known_allergy",
    "rechallenge",
    "dechallenge",
    "severity",
    "is_serious",
    "criteria_for_seriousness",
    "action_taken",
    "outcome"
]

target_column = "class"


proportions = dict()  # class label -> number of rows to generate
dfs = dict()  # class label -> DataFrame (empty placeholder, filled when merging)
dicts = dict()  # class label -> {column name: list of generated values}

# Fail early with a clear message: shares that do not add up to 1 would
# otherwise only surface later as a length mismatch when adding patient ids.
share_total = sum(share for _, share in classes_and_proportions)
if abs(share_total - 1.0) > 1e-9:
    raise ValueError(
        f"class proportions must sum to 1, got {share_total}"
    )

for x, proportion in classes_and_proportions:
    # round() instead of int(): int() truncates, so a float product that lands
    # just below an integer (e.g. 0.29 * 100 == 28.999999999999996) loses a row.
    proportions[x] = round(proportion * num_rows)
    dfs[x] = pd.DataFrame()
    dicts[x] = {
        column: []
        for column in [*numerical_columns, *categorical_columns, target_column]
    }

# Rounding can leave the total a few rows away from num_rows; hand the
# difference to the largest class so the dataset has exactly num_rows rows.
row_shortfall = num_rows - sum(proportions.values())
if row_shortfall:
    largest_class = max(proportions, key=proportions.get)
    proportions[largest_class] += row_shortfall


# Fields

## Categorical

In [13]:
# Allowed values for every categorical column.
# Order matters: the sampling weights are matched to these values by position.
options = {
    "gender": ["male", "female"],
    # Only sampled for female rows.
    "pregnancy_status": [
        "not pregnant",
        "1st trimester", "2nd trimester", "3rd trimester",
    ],
    "known_allergy": ["yes", "no"],
    "rechallenge": ["yes", "no", "unknown", "na"],
    "dechallenge": ["yes", "no", "unknown", "na"],
    "severity": ["mild", "moderate", "severe", "fatal", "unknown"],
    "is_serious": ["yes", "no"],
    "criteria_for_seriousness": [
        "hospitalisation", "disability", "congenital anomaly",
        "life-threatening", "death",
    ],
    "action_taken": [
        "drug withdrawn", "dose reduced", "dose increased",
        "dose not changed", "not applicable", "unknown",
    ],
    "outcome": [
        "recovered", "recovered with sequelae", "recovering",
        "not recovered", "death", "unknown",
    ],
}

# Sampling weights, keyed by column name and then by class label.
# Each list is aligned by position with options[column] and is handed to
# random.choices as its `weights` argument. random.choices treats these as
# relative weights, so a row does not have to sum to 1.
# NOTE(review): several rows do not sum to 1 (e.g. pregnancy_status/"unlikely"
# sums to 1.1 and criteria_for_seriousness/"certain" sums to 1.57), so the
# effective probabilities are the listed values divided by the row total.
# Confirm these figures are intended.
weights = {
    "gender": {
        "certain": [0.55, 0.45],
        "likely": [0.6, 0.4],
        "possible": [0.65, 0.35],
        "unlikely": [0.5, 0.5],
        "unclassified": [0.55, 0.45],
        "unclassifiable": [0.5, 0.5],
    },
    # Only applied to rows whose gender is "female".
    "pregnancy_status": {
        "certain": [1.0, 0.0, 0.0, 0.0],
        "likely": [0.6, 0.15, 0.15, 0.1],
        "possible": [0.4, 0.2, 0.2, 0.2],
        "unlikely": [0.1, 0.3, 0.6, 0.1],
        "unclassified": [0.2, 0.3, 0.4, 0.1],
        "unclassifiable": [0.05, 0.4, 0.05, 0.4],
    },
    # Same weights for every class.
    "known_allergy": {
        "certain": [0.1, 0.9],
        "likely": [0.1, 0.9],
        "possible": [0.1, 0.9],
        "unlikely": [0.1, 0.9],
        "unclassified": [0.1, 0.9],
        "unclassifiable": [0.1, 0.9],
    },
    "rechallenge": {
        "certain": [0.8, 0.0, 0.1, 0.1],
        "likely": [0.0, 0.8, 0.1, 0.1],
        "possible": [0.0, 0.8, 0.1, 0.1],
        "unlikely": [0.0, 0.7, 0.2, 0.1],
        "unclassified": [0.0, 0.0, 0.5, 0.5],
        "unclassifiable": [0.0, 0.0, 0.5, 0.5],
    },
    "dechallenge": {
        "certain": [0.8, 0.0, 0.1, 0.1],
        "likely": [0.8, 0.0, 0.1, 0.1],
        "possible": [0.0, 0.8, 0.1, 0.1],
        "unlikely": [0.0, 0.7, 0.2, 0.1],
        "unclassified": [0.0, 0.0, 0.5, 0.5],
        "unclassifiable": [0.0, 0.0, 0.5, 0.5],
    },
    "severity": {
        "certain": [0.05, 0.1, 0.5, 0.3, 0.05],
        "likely": [0.1, 0.3, 0.4, 0.15, 0.05],
        "possible": [0.3, 0.45, 0.2, 0.04, 0.02],
        "unlikely": [0.5, 0.4, 0.05, 0.01, 0.04],
        "unclassified": [0.4, 0.35, 0.2, 0.02, 0.03],
        "unclassifiable": [0.35, 0.3, 0.2, 0.1, 0.05],
    },
    # Same weights for every class.
    "is_serious": {
        "certain": [0.35, 0.65],
        "likely": [0.35, 0.65],
        "possible": [0.35, 0.65],
        "unlikely": [0.35, 0.65],
        "unclassified": [0.35, 0.65],
        "unclassifiable": [0.35, 0.65],
    },
    "criteria_for_seriousness": {
        "certain": [0.8, 0.1, 0.02, 0.15, 0.5],
        "likely": [0.65, 0.15, 0.05, 0.2, 0.05],
        "possible": [0.4, 0.2, 0.1, 0.05, 0.01],
        "unlikely": [0.1, 0.1, 0.05, 0.02, 0.0],
        "unclassified": [0.3, 0.15, 0.05, 0.05, 0.01],
        "unclassifiable": [0.2, 0.1, 0.05, 0.05, 0.05],
    },
    "action_taken": {
        "certain": [1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        "likely": [0.8, 0.15, 0.02, 0.03, 0.0, 0.0],
        "possible": [0.5, 0.3, 0.05, 0.15, 0.0, 0.0],
        "unlikely": [0.2, 0.4, 0.1, 0.4, 0.1, 0.0],
        "unclassified": [0.4, 0.35, 0.1, 0.25, 0.05, 0.05],
        "unclassifiable": [0.3, 0.3, 0.1, 0.3, 0.0, 0.0],
    },
    "outcome": {
        "certain": [0.7, 0.2, 0.05, 0.05, 0.1, 0.0],
        "likely": [0.6, 0.25, 0.05, 0.05, 0.05, 0.0],
        "possible": [0.5, 0.35, 0.1, 0.03, 0.01, 0.01],
        "unlikely": [0.4, 0.1, 0.05, 0.03, 0.01, 0.01],
        "unclassified": [0.5, 0.25, 0.1, 0.05, 0.01, 0.03],
        "unclassifiable": [0.4, 0.2, 0.1, 0.2, 0.05, 0.05],
    },
}

# Draw the categorical values for every class, one column at a time.
# A dedicated, seeded generator makes the draws reproducible (also when this
# cell is re-run on its own) without touching the global `random` state.
rng = random.Random(42)

# For every column, take the weights
for column, column_weights in weights.items():
    # For every class, take the row of weights
    for class_name, weight in column_weights.items():
        if column == "pregnancy_status":
            # Pregnancy status is derived row by row from the already generated
            # gender column, so "gender" must come before it in `weights`.
            genders = dicts[class_name]["gender"]
            if len(genders) != proportions[class_name]:
                raise ValueError(
                    f"gender must be generated before pregnancy_status "
                    f"(class {class_name!r}: {len(genders)} gender values, "
                    f"expected {proportions[class_name]})"
                )
            # Assign a fresh list instead of appending to the existing one:
            # appending made the column grow on every re-run of the cell.
            dicts[class_name][column] = [
                rng.choices(options[column], weights=weight, k=1)[0]
                if gender == "female"
                else "not applicable"
                for gender in genders
            ]
        else:
            dicts[class_name][column] = rng.choices(
                population=options[column],
                weights=weight,
                k=proportions[class_name],
            )


# Classes

In [14]:
# Fill the target column of every class with its own label, one per row.
# Use target_column (not a hard-coded "class") so this stays in sync with the
# column the per-class dicts were initialised with.
for x, _ in classes_and_proportions:
    dicts[x][target_column] = [x] * proportions[x]

# Merge dataframes into one

In [15]:
# Build one DataFrame per class and stack them into a single dataset.
dfs_list = []

for x, _ in classes_and_proportions:
    dfs[x] = pd.DataFrame(dicts[x])
    dfs_list.append(dfs[x])

# ignore_index=True gives the merged frame a unique 0..n-1 index; without it
# every class restarts at 0 and label-based lookups return several rows.
df = pd.concat(dfs_list, ignore_index=True)

In [16]:
# Create one random UUID per row. Size the list from the frame itself so it
# always matches, even if the per-class row counts do not add up to num_rows.
uuids = [uuid.uuid4() for _ in range(len(df))]

# Drop a patient_id column left over from a previous run of this cell;
# df.insert raises if the column already exists.
df = df.drop(columns="patient_id", errors="ignore")
df.insert(loc=0, column="patient_id", value=uuids)

In [17]:
# Preview the first five rows of the merged dataset.
df.head(n=5)

Unnamed: 0,patient_id,gender,pregnancy_status,known_allergy,rechallenge,dechallenge,severity,is_serious,criteria_for_seriousness,action_taken,outcome,class
0,5d78be1e-43e2-4b40-baf2-58c470a223c8,male,not applicable,no,yes,yes,severe,no,hospitalisation,drug withdrawn,recovered with sequelae,certain
1,d3a7b53e-8659-4566-a4ec-adea7592c769,male,not applicable,no,yes,yes,severe,no,hospitalisation,drug withdrawn,recovered,certain
2,42d5cb50-adf7-4aa9-a425-56d77e992957,male,not applicable,no,yes,yes,moderate,no,hospitalisation,drug withdrawn,recovered,certain
3,fb2fc92f-a885-4ebb-bfc2-bf35b50fc7a6,male,not applicable,no,unknown,yes,mild,no,death,drug withdrawn,not recovered,certain
4,5cfc3bea-be0b-49b5-992c-e6ef8496984c,female,not pregnant,yes,na,unknown,fatal,no,hospitalisation,drug withdrawn,recovered,certain


In [18]:
# Mix the classes, which are stacked one after another after the merge.
# A fixed random_state keeps the row order identical between runs.
df = shuffle(df, random_state=42)

# Confirming categorical column proportions

In [19]:
# Show the relative frequency of every value, one categorical column at a time.
separator = "-" * 50
for column in categorical_columns:
    value_shares = df[column].value_counts(normalize=True)
    display(value_shares.round(3))
    print(separator)

gender
male      0.574
female    0.426
Name: proportion, dtype: float64

--------------------------------------------------


pregnancy_status
not applicable    0.574
not pregnant      0.153
1st trimester     0.115
2nd trimester     0.085
3rd trimester     0.073
Name: proportion, dtype: float64

--------------------------------------------------


known_allergy
no     0.891
yes    0.109
Name: proportion, dtype: float64

--------------------------------------------------


rechallenge
no         0.519
na         0.221
unknown    0.220
yes        0.040
Name: proportion, dtype: float64

--------------------------------------------------


dechallenge
no         0.332
yes        0.254
na         0.208
unknown    0.206
Name: proportion, dtype: float64

--------------------------------------------------


severity
moderate    0.367
mild        0.284
severe      0.235
fatal       0.087
unknown     0.027
Name: proportion, dtype: float64

--------------------------------------------------


is_serious
no     0.692
yes    0.308
Name: proportion, dtype: float64

--------------------------------------------------


criteria_for_seriousness
hospitalisation       0.520
disability            0.232
life-threatening      0.109
congenital anomaly    0.088
death                 0.051
Name: proportion, dtype: float64

--------------------------------------------------


action_taken
drug withdrawn      0.491
dose reduced        0.263
dose not changed    0.171
dose increased      0.055
not applicable      0.017
unknown             0.003
Name: proportion, dtype: float64

--------------------------------------------------


outcome
recovered                  0.532
recovered with sequelae    0.279
recovering                 0.083
not recovered              0.056
death                      0.034
unknown                    0.016
Name: proportion, dtype: float64

--------------------------------------------------


# Export dataset

In [None]:
# Write the dataset where the server reads it from; the index is left out.
output_path = "../server/data.csv"
df.to_csv(output_path, index=False)