In [1]:
import random
import uuid
from faker import Faker
import pandas as pd
from sklearn.utils import shuffle
import datetime
import numpy as np

In [2]:
# Configuration
num_rows = 1000  # Total dataset size
SEED = 42  # Fixed seed so Restart & Run All regenerates the same random draws

# Seed both global generators used by this notebook (`random.*` and `np.random.*`).
random.seed(SEED)
np.random.seed(SEED)

# (class label, share of the dataset). The shares must add up to 1.
classes_and_proportions = [
    ("certain", 0.05),
    ("likely", 0.25),
    ("possible", 0.3),
    ("unlikely", 0.1),
    ("unclassified", 0.2),
    ("unclassifiable", 0.1),
]

numerical_columns = []
categorical_columns = [
    "gender",
    "pregnancy_status",
    "known_allergy",
    "rechallenge",
    "dechallenge",
    "severity",
    "is_serious",
    "criteria_for_seriousness",
    "action_taken",
    "outcome",
]

target_column = "causality_assessment_level"


def allocate_class_counts(class_shares, total_rows):
    """Split ``total_rows`` into whole per-class counts that sum exactly to it.

    Plain ``int(share * total_rows)`` truncates, so the counts can add up to
    fewer rows than requested. Here each class first gets the truncated count
    and the rows still missing are handed out one by one to the classes with
    the largest fractional remainder.

    Args:
        class_shares: iterable of ``(label, share)`` pairs; shares must sum to 1.
        total_rows: total number of rows to distribute.

    Returns:
        dict mapping each label to its row count, in the order given.

    Raises:
        ValueError: if the shares do not sum to 1.
    """
    class_shares = list(class_shares)
    share_sum = sum(share for _, share in class_shares)
    if abs(share_sum - 1.0) > 1e-9:
        raise ValueError(f"class proportions must sum to 1, got {share_sum}")

    exact = [(label, share * total_rows) for label, share in class_shares]
    # The tiny epsilon keeps float noise (e.g. 299.99999999999994) from losing a row.
    counts = {label: int(value + 1e-9) for label, value in exact}

    leftover = max(total_rows - sum(counts.values()), 0)
    # sorted() is stable, so ties keep their declaration order.
    by_remainder = sorted(
        exact, key=lambda item: item[1] - counts[item[0]], reverse=True
    )
    for label, _ in by_remainder[:leftover]:
        counts[label] += 1
    return counts


# Rows to generate per class, one (initially empty) frame per class, and the
# per-class column -> list-of-values buffers that later cells fill in.
proportions = allocate_class_counts(classes_and_proportions, num_rows)
dfs = dict()
dicts = dict()

for x in proportions:
    dfs[x] = pd.DataFrame()
    dicts[x] = {
        column: []
        for column in [*numerical_columns, *categorical_columns, target_column]
    }


# Fields


## Categorical


In [3]:
# Allowed values for each categorical field. The order inside every list is
# significant: sampling weights are matched to these values by position.
options = {
    # Patient characteristics
    "gender": ["male", "female"],
    "pregnancy_status": ["not pregnant", "1st trimester", "2nd trimester", "3rd trimester"],
    "known_allergy": ["yes", "no"],
    # Reaction on re-exposure to / withdrawal of the drug
    "rechallenge": ["yes", "no", "unknown", "na"],
    "dechallenge": ["yes", "no", "unknown", "na"],
    # Reaction grading
    "severity": ["mild", "moderate", "severe", "fatal", "unknown"],
    "is_serious": ["yes", "no"],
    "criteria_for_seriousness": [
        "hospitalisation", "disability", "congenital anomaly", "life-threatening", "death",
    ],
    # Follow-up
    "action_taken": [
        "drug withdrawn", "dose reduced", "dose increased",
        "dose not changed", "not applicable", "unknown",
    ],
    "outcome": [
        "recovered", "recovered with sequelae", "recovering",
        "not recovered", "death", "unknown",
    ],
}

# Sampling weights, keyed as weights[column][causality class].
# Each list is passed to random.choices() together with options[column], so
# entry i is the weight of options[column][i] for rows of that class.
# random.choices() treats the numbers as relative weights, so a row does not
# have to sum to 1 to be usable.
# NOTE(review): several rows do not sum to 1 (e.g. pregnancy_status/"unlikely"
# = 1.1, criteria_for_seriousness/"certain" = 1.57, outcome/"unlikely" = 0.6),
# so the effective probabilities differ from the numbers as written — confirm
# whether these were meant to be normalised.
weights = {
    "gender": {
        "certain": [0.55, 0.45],
        "likely": [0.6, 0.4],
        "possible": [0.65, 0.35],
        "unlikely": [0.5, 0.5],
        "unclassified": [0.55, 0.45],
        "unclassifiable": [0.5, 0.5],
    },
    # Only applied to rows whose gender is "female"; other rows get "not applicable".
    "pregnancy_status": {
        "certain": [1.0, 0.0, 0.0, 0.0],
        "likely": [0.6, 0.15, 0.15, 0.1],
        "possible": [0.4, 0.2, 0.2, 0.2],
        "unlikely": [0.1, 0.3, 0.6, 0.1],
        "unclassified": [0.2, 0.3, 0.4, 0.1],
        "unclassifiable": [0.05, 0.4, 0.05, 0.4],
    },
    # Same weights for every class.
    "known_allergy": {
        "certain": [0.1, 0.9],
        "likely": [0.1, 0.9],
        "possible": [0.1, 0.9],
        "unlikely": [0.1, 0.9],
        "unclassified": [0.1, 0.9],
        "unclassifiable": [0.1, 0.9],
    },
    "rechallenge": {
        "certain": [0.8, 0.0, 0.1, 0.1],
        "likely": [0.0, 0.8, 0.1, 0.1],
        "possible": [0.0, 0.8, 0.1, 0.1],
        "unlikely": [0.0, 0.7, 0.2, 0.1],
        "unclassified": [0.0, 0.0, 0.5, 0.5],
        "unclassifiable": [0.0, 0.0, 0.5, 0.5],
    },
    "dechallenge": {
        "certain": [0.8, 0.0, 0.1, 0.1],
        "likely": [0.8, 0.0, 0.1, 0.1],
        "possible": [0.0, 0.8, 0.1, 0.1],
        "unlikely": [0.0, 0.7, 0.2, 0.1],
        "unclassified": [0.0, 0.0, 0.5, 0.5],
        "unclassifiable": [0.0, 0.0, 0.5, 0.5],
    },
    "severity": {
        "certain": [0.05, 0.1, 0.5, 0.3, 0.05],
        "likely": [0.1, 0.3, 0.4, 0.15, 0.05],
        "possible": [0.3, 0.45, 0.2, 0.04, 0.02],
        "unlikely": [0.5, 0.4, 0.05, 0.01, 0.04],
        "unclassified": [0.4, 0.35, 0.2, 0.02, 0.03],
        "unclassifiable": [0.35, 0.3, 0.2, 0.1, 0.05],
    },
    # Same weights for every class.
    "is_serious": {
        "certain": [0.35, 0.65],
        "likely": [0.35, 0.65],
        "possible": [0.35, 0.65],
        "unlikely": [0.35, 0.65],
        "unclassified": [0.35, 0.65],
        "unclassifiable": [0.35, 0.65],
    },
    "criteria_for_seriousness": {
        "certain": [0.8, 0.1, 0.02, 0.15, 0.5],
        "likely": [0.65, 0.15, 0.05, 0.2, 0.05],
        "possible": [0.4, 0.2, 0.1, 0.05, 0.01],
        "unlikely": [0.1, 0.1, 0.05, 0.02, 0.0],
        "unclassified": [0.3, 0.15, 0.05, 0.05, 0.01],
        "unclassifiable": [0.2, 0.1, 0.05, 0.05, 0.05],
    },
    "action_taken": {
        "certain": [1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        "likely": [0.8, 0.15, 0.02, 0.03, 0.0, 0.0],
        "possible": [0.5, 0.3, 0.05, 0.15, 0.0, 0.0],
        "unlikely": [0.2, 0.4, 0.1, 0.4, 0.1, 0.0],
        "unclassified": [0.4, 0.35, 0.1, 0.25, 0.05, 0.05],
        "unclassifiable": [0.3, 0.3, 0.1, 0.3, 0.0, 0.0],
    },
    "outcome": {
        "certain": [0.7, 0.2, 0.05, 0.05, 0.1, 0.0],
        "likely": [0.6, 0.25, 0.05, 0.05, 0.05, 0.0],
        "possible": [0.5, 0.35, 0.1, 0.03, 0.01, 0.01],
        "unlikely": [0.4, 0.1, 0.05, 0.03, 0.01, 0.01],
        "unclassified": [0.5, 0.25, 0.1, 0.05, 0.01, 0.03],
        "unclassifiable": [0.4, 0.2, 0.1, 0.2, 0.05, 0.05],
    },
}

# Sample every categorical column, class by class.
# Order matters: gender is drawn first because pregnancy_status depends on it;
# the remaining columns are each drawn independently of the others.

# 1. Gender
for class_name, row_count in proportions.items():
    dicts[class_name]["gender"] = random.choices(
        options["gender"], weights=weights["gender"][class_name], k=row_count
    )

# 2. Pregnancy status: sampled for female rows, "not applicable" otherwise.
#    The list is assigned (not appended to), so re-running this cell replaces
#    the values instead of growing the list past the class's row count.
for class_name in proportions:
    dicts[class_name]["pregnancy_status"] = [
        random.choices(
            options["pregnancy_status"],
            weights=weights["pregnancy_status"][class_name],
            k=1,
        )[0]
        if gender == "female"
        else "not applicable"
        for gender in dicts[class_name]["gender"]
    ]

# 3. Every other categorical column, taken from `categorical_columns` so the
#    list of fields is declared in one place only.
# NOTE(review): criteria_for_seriousness is drawn independently of is_serious,
# so a row can be "not serious" and still carry a seriousness criterion —
# confirm whether that is acceptable for the downstream model.
independent_columns = [
    column
    for column in categorical_columns
    if column not in ("gender", "pregnancy_status")
]
for column in independent_columns:
    for class_name, row_count in proportions.items():
        dicts[class_name][column] = random.choices(
            options[column], weights=weights[column][class_name], k=row_count
        )


# Classes


In [4]:
# Every row of a class carries that class's name as its target label.
for label in proportions:
    dicts[label]["causality_assessment_level"] = proportions[label] * [label]

# Merge dataframes into one


In [5]:
# Turn each class's column buffers into a DataFrame, in declaration order.
dfs_list = []

for x, _ in classes_and_proportions:
    dfs[x] = pd.DataFrame(dicts[x])
    dfs_list.append(dfs[x])

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each class frame keeps its own 0-based labels and the index has duplicates.
df = pd.concat(dfs_list, ignore_index=True)

# Additional columns that run through the whole dataset


## Ward/Clinic

In [6]:
# Wards and clinics a report can be attributed to.
wards_and_clinics_tb = [
    "TB Clinic", "TB Isolation Ward", "Pulmonology Clinic", "Chest Clinic",
    "Respiratory Diseases Ward", "Outpatient TB Department",
    "Multidrug-Resistant TB (MDR-TB) Clinic",
    "Directly Observed Treatment (DOT) Clinic",
    "TB Screening Unit", "TB Treatment Center", "Infectious Diseases Ward",
]

# One randomly chosen ward/clinic per row, stored as the new first column.
wards_or_clinics = []
for _ in range(num_rows):
    wards_or_clinics.append(random.choice(wards_and_clinics_tb))

df.insert(loc=0, column="ward_or_clinic", value=wards_or_clinics)

## Inpatient/Outpatient Number


In [7]:
# Patient numbers have the form "<IP|OP>-<6-digit number>", one per row.
# NOTE(review): numbers are drawn independently and not checked for
# uniqueness — confirm whether duplicates are acceptable.
prefixes = ["IP", "OP"]
in_op_numbers = [
    f"{random.choice(prefixes)}-{random.randint(100000, 999999)}"
    for _ in range(num_rows)
]

df.insert(loc=0, column="inpatient_or_outpatient_number", value=in_op_numbers)

## Address

In [8]:
# Towns and cities used as the second part of an address.
towns = [
    "Nairobi", "Mombasa", "Kisumu", "Nakuru", "Eldoret", "Thika",
    "Naivasha", "Malindi", "Kitale", "Kericho", "Garissa", "Machakos",
    "Nyeri", "Embu", "Meru", "Kakamega", "Lamu",
]

# Estates / neighbourhoods used as the first part of an address.
areas = [
    "Pipeline", "Kibera", "Kayole", "Kahawa", "Ruiru", "Utawala", "Miritini",
    "Kizingo", "Nyali", "Nyalenda", "Milimani", "Karatina", "Muthaiga",
    "Syokimau", "Makupa", "Likoni", "Shauri Moyo", "Kangemi", "Donholm",
    "Gikambura",
]


def generate_address():
    """Return a random address string of the form "<area>, <town>".

    The area is picked from ``areas`` and the town from ``towns``; the two
    picks are independent of each other.
    """
    # Tuple elements are evaluated left to right: area first, then town.
    parts = (random.choice(areas), random.choice(towns))
    return ", ".join(parts)


# Generate one random address per row and store it as the new first column.
addresses = []
for _ in range(num_rows):
    addresses.append(generate_address())

df.insert(loc=0, column="patient_address", value=addresses)

## Date of Birth/Age

In [9]:
# Frame with one all-None row per patient for the two age-related columns.
# NOTE(review): nothing else in the visible notebook reads `age_df` — confirm
# whether it is still needed.
age_df = pd.DataFrame(
    {
        column: [None] * num_rows
        for column in ("patient_date_of_birth", "patient_age")
    }
)

# Per-row values, filled by the loop further down in this cell.
dates_of_birth, ages = [], []


# Draw a random age from a normal distribution centered at 25 (sd 10),
# restricted to the range 0-70.
def generate_age():
    """Return a random whole-number age between 0 and 70 inclusive.

    Ages are drawn from a normal distribution with mean 25 and standard
    deviation 10 and truncated to an int. A draw outside 0-70 is discarded
    and a new one is taken, so out-of-range draws no longer all turn into
    the same fixed age.

    Returns:
        int: an age with ``0 <= age <= 70``.
    """
    while True:
        # int() truncates toward zero, so draws in (-1, 0) count as age 0.
        age = int(np.random.normal(25, 10))
        if 0 <= age <= 70:
            return age


# Reference date for back-calculating dates of birth; the weight/height
# estimation function defined later in the notebook reads it as well.
today = datetime.datetime.today()

# Each row records EITHER a date of birth OR an age, chosen by a coin flip;
# the other field is left empty.
for _ in range(num_rows):
    age = generate_age()
    keep_dob = random.random() > 0.5
    if keep_dob:
        birth_date = today - datetime.timedelta(days=age * 365.25)
        dates_of_birth.append(birth_date.strftime("%Y-%m-%d"))
        ages.append(None)
        continue
    dates_of_birth.append(None)
    ages.append(int(age))


# Inserted at position 0 one after the other, so patient_age ends up first.
df.insert(loc=0, column="patient_date_of_birth", value=dates_of_birth)
df.insert(loc=0, column="patient_age", value=ages)

## Weight and Height

In [10]:
def guess_weight_height(row):
    """Estimate a plausible weight (kg) and height (cm) for one patient row.

    Args:
        row: mapping with the keys "gender", "patient_age" and
            "patient_date_of_birth" (e.g. a DataFrame row).

    Returns:
        pd.Series of two values: weight and height, each rounded to 1 decimal.

    The age comes from "patient_age"; if that is missing it is derived from
    the date of birth relative to the module-level ``today``; if both are
    missing a random age is drawn. The age band and gender select the mean
    weight/height, and the final values are drawn around those means and
    clamped to 3-150 kg and 45-220 cm.
    """
    sex = row["gender"]
    years = row["patient_age"]
    birth = row["patient_date_of_birth"]

    if pd.isna(years):
        if pd.isna(birth):
            # Neither age nor date of birth: guess an age around 30.
            years = np.random.normal(30, 8)
        else:
            # Whole years elapsed between the date of birth and `today`.
            years = (today - pd.to_datetime(birth)).days // 365

    # (inclusive upper age, (male kg, other kg), (male cm, other cm))
    age_bands = (
        (5, (15, 14), (90, 88)),
        (12, (30, 28), (130, 125)),
        (18, (55, 50), (165, 160)),
        (40, (70, 62), (175, 165)),
    )
    # Means for anyone older than the last band.
    weight_means, height_means = (68, 60), (170, 160)
    for upper_age, band_weights, band_heights in age_bands:
        if years <= upper_age:
            weight_means, height_means = band_weights, band_heights
            break

    # Any gender other than "male" uses the second value of each pair.
    pick = 0 if sex == "male" else 1
    weight_mean = weight_means[pick]
    height_mean = height_means[pick]

    # Weight is drawn before height; both use a standard deviation of 5.
    weight = np.random.normal(loc=weight_mean, scale=5)
    height = np.random.normal(loc=height_mean, scale=5)

    # Keep the values inside 3-150 kg and 45-220 cm.
    weight = max(3, min(weight, 150))
    height = max(45, min(height, 220))

    return pd.Series([round(weight, 1), round(height, 1)])


# Estimate weight and height row by row; the two values returned for each row
# become the two new columns.
measure_columns = ["patient_weight_kg", "patient_height_cm"]
df[measure_columns] = df.apply(guess_weight_height, axis=1)

# Move the two new columns to the front; the rest keep their current order.
new_order = measure_columns + [
    col for col in df.columns if col not in measure_columns
]

df = df[new_order]

## Patient Name


In [11]:
# Seed Faker's shared random generator so the same names are produced on
# every run; without it the name column changes each time the cell executes.
Faker.seed(42)

# "sw" is the locale passed to Faker for name generation.
faker_kenya = Faker("sw")
patient_names = [faker_kenya.name() for _ in range(num_rows)]

df.insert(loc=0, column="patient_name", value=patient_names)


# Display dataframe


In [12]:
# Preview the first five rows of the assembled dataset.
df.head(n=5)

Unnamed: 0,patient_name,patient_weight_kg,patient_height_cm,patient_age,patient_date_of_birth,patient_address,inpatient_or_outpatient_number,ward_or_clinic,gender,pregnancy_status,known_allergy,rechallenge,dechallenge,severity,is_serious,criteria_for_seriousness,action_taken,outcome,causality_assessment_level
0,Imara Wario,61.5,164.2,,2001-05-18,"Kibera, Nairobi",IP-586312,Outpatient TB Department,female,not pregnant,no,yes,yes,fatal,no,life-threatening,drug withdrawn,recovered,certain
1,Kipepeo Maina,70.3,177.5,22.0,,"Kahawa, Nairobi",OP-218992,Chest Clinic,male,not applicable,no,na,yes,severe,no,life-threatening,drug withdrawn,recovered,certain
2,Sadiki Muthoni Kamau,55.3,158.2,,2007-05-19,"Donholm, Meru",IP-646437,Pulmonology Clinic,female,not pregnant,no,na,unknown,severe,no,life-threatening,drug withdrawn,recovered,certain
3,Usiku Nazari Nazari,70.8,171.1,,2006-05-18,"Likoni, Kakamega",OP-214540,Infectious Diseases Ward,male,not applicable,no,yes,na,fatal,yes,hospitalisation,drug withdrawn,recovered,certain
4,Jina Kimani,72.0,173.0,,1993-05-18,"Muthaiga, Nakuru",OP-228506,TB Screening Unit,male,not applicable,no,yes,yes,fatal,yes,hospitalisation,drug withdrawn,recovered,certain


# Confirming categorical column proportions


In [13]:
# Show, for each categorical column, the share of every value (3 decimals),
# followed by a separator line.
separator = "-" * 50
for column in categorical_columns:
    value_shares = df[column].value_counts(normalize=True).round(3)
    display(value_shares)
    print(separator)

gender
male      0.575
female    0.425
Name: proportion, dtype: float64

--------------------------------------------------


pregnancy_status
not applicable    0.575
not pregnant      0.152
1st trimester     0.103
2nd trimester     0.102
3rd trimester     0.068
Name: proportion, dtype: float64

--------------------------------------------------


known_allergy
no     0.909
yes    0.091
Name: proportion, dtype: float64

--------------------------------------------------


rechallenge
no         0.489
na         0.248
unknown    0.227
yes        0.036
Name: proportion, dtype: float64

--------------------------------------------------


dechallenge
no         0.309
yes        0.237
unknown    0.228
na         0.226
Name: proportion, dtype: float64

--------------------------------------------------


severity
moderate    0.333
mild        0.292
severe      0.245
fatal       0.095
unknown     0.035
Name: proportion, dtype: float64

--------------------------------------------------


is_serious
no     0.65
yes    0.35
Name: proportion, dtype: float64

--------------------------------------------------


criteria_for_seriousness
hospitalisation       0.538
disability            0.218
congenital anomaly    0.115
life-threatening      0.094
death                 0.035
Name: proportion, dtype: float64

--------------------------------------------------


action_taken
drug withdrawn      0.531
dose reduced        0.231
dose not changed    0.160
dose increased      0.053
not applicable      0.018
unknown             0.007
Name: proportion, dtype: float64

--------------------------------------------------


outcome
recovered                  0.544
recovered with sequelae    0.275
recovering                 0.075
not recovered              0.064
death                      0.029
unknown                    0.013
Name: proportion, dtype: float64

--------------------------------------------------


# Shuffle dataframe in preparation for prediction


In [14]:
# Shuffle the rows so the classes are no longer stored in contiguous blocks.
# A fixed random_state makes the exported row order reproducible.
df = shuffle(df, random_state=42)

# Export dataset to server and ml model


In [15]:
# Write the same CSV (without the index column) to both destinations.
for destination in ("../server/data.csv", "data.csv"):
    df.to_csv(destination, index=False)