In [27]:
import random
from datetime import datetime, timedelta
import pandas as pd
import numpy as np

from faker import Faker

# Random Data Generation Functions

## Patient Name

In [28]:
def random_patient_name(gender: str) -> str:
    """Return a fake patient name for the given gender.

    A new ``Faker`` generator for the ``"sw"`` locale is created on every
    call. ``"female"`` and ``"male"`` map to Faker's ``name_female`` and
    ``name_male`` respectively; any other value produces ``None``.
    """
    generator = Faker("sw")
    name_makers = (
        ("female", generator.name_female),
        ("male", generator.name_male),
    )
    for label, make_name in name_makers:
        if gender == label:
            return make_name()
    # Unrecognised gender: no name is generated.
    return None

## IP/OP Number

In [29]:
def random_inpatient_or_outpatient_number():
    """Return a random patient number such as ``"IP-123456"``.

    The prefix is picked uniformly from ``"IP"`` / ``"OP"`` and is followed
    by a dash and a random six-digit integer (100000-999999 inclusive).
    """
    kind = random.choice(["IP", "OP"])
    serial = random.randint(100000, 999999)
    return "-".join((kind, str(serial)))

## Age

In [30]:
def random_patient_age():
    """Draw a patient age in whole years.

    A single sample is taken from a normal distribution (mean 25, standard
    deviation 10) and truncated with ``int``. A sample outside 0-70 is not
    redrawn; the fixed fallback age 30 is returned instead.
    """
    sampled = int(np.random.normal(25, 10))
    if sampled < 0 or sampled > 70:
        return 30
    return sampled

## Date of Birth

In [31]:
def random_patient_date_of_birth(age: int) -> str:
    """Return a date of birth string (``YYYY-MM-DD``) for the given age.

    The date is the current local time minus ``age * 365.25`` days, so it is
    an approximation rather than an exact calendar anniversary.
    """
    lifetime = timedelta(days=age * 365.25)
    born_on = datetime.today() - lifetime
    return born_on.strftime("%Y-%m-%d")

## Patient Address

In [32]:

def random_patient_address():
    """Return a random address formatted as ``"<area>, <town>"``.

    The area and the town are drawn independently (area first, then town),
    so the pairing is not geographically meaningful.
    """
    # Towns / cities
    town_names = (
        "Nairobi", "Mombasa", "Kisumu", "Nakuru", "Eldoret", "Thika",
        "Naivasha", "Malindi", "Kitale", "Kericho", "Garissa", "Machakos",
        "Nyeri", "Embu", "Meru", "Kakamega", "Lamu",
    )

    # Common estates / areas
    area_names = (
        "Pipeline", "Kibera", "Kayole", "Kahawa", "Ruiru", "Utawala",
        "Miritini", "Kizingo", "Nyali", "Nyalenda", "Milimani", "Karatina",
        "Muthaiga", "Syokimau", "Makupa", "Likoni", "Shauri Moyo", "Kangemi",
        "Donholm", "Gikambura",
    )

    picked_area = random.choice(area_names)
    picked_town = random.choice(town_names)
    return ", ".join((picked_area, picked_town))

## Ward/Clinic

In [33]:
def random_ward_clinic():
    """Return one TB-related ward or clinic name, chosen uniformly at random."""
    return random.choice(
        (
            "TB Clinic",
            "TB Isolation Ward",
            "Pulmonology Clinic",
            "Chest Clinic",
            "Respiratory Diseases Ward",
            "Outpatient TB Department",
            "Multidrug-Resistant TB (MDR-TB) Clinic",
            "Directly Observed Treatment (DOT) Clinic",
            "TB Screening Unit",
            "TB Treatment Center",
            "Infectious Diseases Ward",
        )
    )

## Gender

In [34]:
def random_gender(causality_assessment_level: str):
    """Pick ``"female"`` or ``"male"`` with level-dependent odds.

    Supported levels are ``"certain"``, ``"likely"``, ``"possible"`` and
    ``"unlikely"``; any other level raises ``KeyError``.
    """
    # [female, male] weights for each causality level.
    female_male_weights = {
        "certain": [0.55, 0.45],
        "likely": [0.6, 0.4],
        "possible": [0.65, 0.35],
        "unlikely": [0.5, 0.5],
    }
    level_weights = female_male_weights[causality_assessment_level]
    (gender,) = random.choices(["female", "male"], weights=level_weights)
    return gender

## Known Allergy

In [35]:
def random_known_allergy(causality_assessment_level: str) -> str | None:
    population = ["yes", "no"]

    weights = {
        "certain": [0.1, 0.9],
        "likely": [0.2, 0.8],
        "possible": [0.3, 0.7],
        "unlikely": [0.4, 0.6],
        # "unclassified": [0.5, 0.5],
        # "unclassifiable": [0.6, 0.4],
    }

    # if (
    #     causality_assessment_level in ["unclassified", "unclassifiable"]
    #     and random.random() > 0.1
    # ):
    #     return None

    return random.choices(
        population=population, weights=weights[causality_assessment_level], k=1
    )[0]

## Pregnancy Status

In [36]:
def random_pregnancy_status(
    gender: str, age: int | None, dob: str | None, causality_assessment_level: str
) -> str | None:
    # if (
    #     causality_assessment_level in {"unclassified", "unclassifiable"}
    #     and random.random() >= 0.1
    # ):
    #     return None
    
    pregnancy_statuses = [
        # The "not applicable" option is set for all males
        "1st trimester",
        "2nd trimester",
        "3rd trimester",
        "not pregnant",
    ]

    weights = {
        "certain": [1.0, 0.0, 0.0, 0.0],
        "likely": [0.6, 0.15, 0.15, 0.1],
        "possible": [0.4, 0.2, 0.2, 0.2],
        "unlikely": [0.1, 0.3, 0.6, 0.1],
        # "unclassified": [0.2, 0.3, 0.4, 0.1],
        # "unclassifiable": [0.05, 0.4, 0.05, 0.4],
    }

    if gender == "male":
        return "not applicable"
    elif gender == "female":
        # Get age
        if pd.isna(age):
            if pd.notna(dob):
                # If DOB is available, calculate age
                today = datetime.today()
                dob = pd.to_datetime(dob)
                age = (today - dob).days // 365
            else:
                # If both age and DOB are missing, guess an age
                age = np.random.normal(30, 8)  # centered at 30 years

        if age < 18:
            return "not applicable"
        else:
            return random.choices(
                population=pregnancy_statuses,
                weights=weights[causality_assessment_level],
                k=1,
            )[0]


## Weight and Height

In [37]:
def random_weight_and_height(
    causality_assessment_level: str, gender: str, age: int | None, dob: str | None
):
    # if (
    #     causality_assessment_level in {"unclassified", "unclassifiable"}
    #     and random.random() >= 0.1
    # ):
    #     return None, None

    today = datetime.today()

    if pd.isna(age):
        if pd.notna(dob):
            # If DOB is available, calculate age
            dob = pd.to_datetime(dob)
            age = (today - dob).days // 365
        else:
            # If both age and DOB are missing, guess an age
            age = np.random.normal(30, 8)  # centered at 30 years

    # Set default means based on age and gender
    if age <= 5:
        weight_mean = 15 if gender == "male" else 14
        height_mean = 90 if gender == "male" else 88
    elif age <= 12:
        weight_mean = 30 if gender == "male" else 28
        height_mean = 130 if gender == "male" else 125
    elif age <= 18:
        weight_mean = 55 if gender == "male" else 50
        height_mean = 165 if gender == "male" else 160
    elif age <= 40:
        weight_mean = 70 if gender == "male" else 62
        height_mean = 175 if gender == "male" else 165
    else:
        weight_mean = 68 if gender == "male" else 60
        height_mean = 170 if gender == "male" else 160

    # Add randomness (normal distribution around the mean)
    weight = np.random.normal(loc=weight_mean, scale=5)
    height = np.random.normal(loc=height_mean, scale=5)

    # Clamp values to reasonable ranges
    weight = max(3, min(weight, 150))  # weight in kg
    height = max(45, min(height, 220))  # height in cm

    return round(weight, 1), round(height, 1)

## Dechallenge

In [None]:
def random_dechallenge(causality_assessment_level: str) -> str | None:
    population = ["yes", "no", "unknown", "na"]

    weights = {
        "certain": [0.8, 0.2, 0.0, 0.0],
        "likely": [0.7, 0.3, 0.0, 0.0],
        "possible": [0.7, 0.1, 0.1, 0.1],
    }

    return random.choices(
        population=population, weights=weights[causality_assessment_level], k=1
    )[0]

## Rechallenge

In [39]:
def random_rechallenge(causality_assessment_level: str) -> str | None:
    population = ["yes", "no", "unknown", "na"]

    weights = {
        "certain": [0.8, 0.2, 0.0, 0.0],
        "likely": [0.7, 0.3, 0.0, 0.0],
        "possible": [0.0, 0.8, 0.1, 0.1],
        "unlikely": [0.0, 0.7, 0.2, 0.1],
        # "unclassified": [0.0, 0.0, 0.5, 0.5],
        # "unclassifiable": [0.0, 0.0, 0.5, 0.5],
    }

    # if (
    #     causality_assessment_level in ["unclassified", "unclassifiable"]
    #     and random.random() > 0.1
    # ):
    #     return None

    return random.choices(
        population=population, weights=weights[causality_assessment_level], k=1
    )[0]

## Severity

In [40]:
def random_severity(causality_assessment_level: str) -> str | None:
    population = ["mild", "moderate", "severe", "fatal", "unknown"]

    weights = {
        "certain": [0.02, 0.28, 0.12, 0.58, 0.0],
        "likely": [0.25, 0.58, 0.12, 0.02, 0.03],
        "possible": [0.25, 0.58, 0.12, 0.02, 0.03],
        "unlikely": [0.25, 0.25, 0.25, 0.25, 0.0],
        # "unclassified": [0.05, 0.05, 0.05, 0.05, 0.8],
        # "unclassifiable": [0.05, 0.05, 0.05, 0.05, 0.8],
    }

    # if (
    #     causality_assessment_level in ["unclassified", "unclassifiable"]
    #     and random.random() > 0.1
    # ):
    #     return None

    return random.choices(
        population=population, weights=weights[causality_assessment_level], k=1
    )[0]

## Is Serious

In [41]:
def random_is_serious(causality_assessment_level: str) -> str | None:
    population = ["yes", "no"]

    weights = {
        "certain": [0.35, 0.65],
        "likely": [0.35, 0.65],
        "possible": [0.35, 0.65],
        "unlikely": [0.35, 0.65],
        # "unclassified": [0.50, 0.50],
        # "unclassifiable": [0.50, 0.50],
    }

    # if (
    #     causality_assessment_level in ["unclassified", "unclassifiable"]
    #     and random.random() > 0.1
    # ):
    #     return None

    return random.choices(
        population=population, weights=weights[causality_assessment_level], k=1
    )[0]

## Criteria For Seriousness

In [42]:
def random_criteria_for_seriousness(causality_assessment_level: str) -> str | None:
    population = [
        "hospitalisation",
        "disability",
        "congenital anomaly",
        "life-threatening",
        "death",
    ]
    [0.55, 0.05, 0.08, 0.27, 0.05]

    weights = {
        "certain": [0.55, 0.05, 0.08, 0.27, 0.05],
        "likely": [0.55, 0.05, 0.08, 0.27, 0.05],
        "possible": [0.55, 0.05, 0.08, 0.27, 0.05],
        "unlikely": [0.55, 0.05, 0.08, 0.27, 0.05],
        # "unclassified": [0.55, 0.05, 0.08, 0.27, 0.05],
        # "unclassifiable": [0.55, 0.05, 0.08, 0.27, 0.05],
    }

    # if (
    #     causality_assessment_level in ["unclassified", "unclassifiable"]
    #     and random.random() > 0.1
    # ):
    #     return None

    return random.choices(
        population=population, weights=weights[causality_assessment_level], k=1
    )[0]

## Action Taken

In [43]:
def random_action_taken(causality_assessment_level: str) -> str | None:
    population = [
        "drug withdrawn",
        "dose reduced",
        "dose increased",
        "dose not changed",
        "not applicable",
        "unknown",
    ]

    weights = {
        "certain": [0.57, 0.04, 0.04, 0.3, 0.04, 0.01],
        "likely": [0.45, 0.04, 0.04, 0.42, 0.04, 0.01],
        "possible": [0.27, 0.04, 0.04, 0.5, 0.04, 0.01],
        "unlikely": [0.17, 0.04, 0.04, 0.6, 0.04, 0.01],
        # "unclassified": [0.01, 0.04, 0.04, 0.3, 0.04, 0.57],
        # "unclassifiable": [0.01, 0.04, 0.04, 0.3, 0.04, 0.57],
    }
    
    # if (
    #     causality_assessment_level in ["unclassified", "unclassifiable"]
    #     and random.random() > 0.1
    # ):
    #     return None

    return random.choices(
        population=population, weights=weights[causality_assessment_level], k=1
    )[0]

## Outcome

In [44]:
def random_outcome(causality_assessment_level: str) -> str | None:
    population = [
        "recovered",
        "recovered with sequelae",
        "recovering",
        "not recovered",
        "death",
        "unknown",
    ]

    weights = {
        "certain": [0.16, 0.01, 0.36, 0.08, 0.03, 0.36],
        "likely": [0.16, 0.01, 0.36, 0.08, 0.03, 0.36],
        "possible": [0.16, 0.01, 0.36, 0.08, 0.03, 0.36],
        "unlikely": [0.16, 0.01, 0.36, 0.08, 0.03, 0.36],
        # "unclassified": [0.16, 0.01, 0.36, 0.08, 0.03, 0.36],
        # "unclassifiable": [0.16, 0.01, 0.36, 0.08, 0.03, 0.36],
    }

    # if (
    #     causality_assessment_level in ["unclassified", "unclassifiable"]
    #     and random.random() > 0.1
    # ):
    #     return None

    return random.choices(
        population=population, weights=weights[causality_assessment_level], k=1
    )[0]

## Medicine Data

### Start/Stop Date

In [45]:
def get_start_stop_date(causality_level: str, created_at: datetime):
    """Generate medicine start and stop dates relative to ``created_at``.

    Args:
        causality_level: Causality level name; matched case-insensitively.
        created_at: Report creation time that both dates are counted back from.

    Returns:
        A ``(start, stop)`` tuple of ``YYYY-MM-DD`` strings, with ``start``
        never later than ``stop``. For ``"unassessable"`` there is a 50%
        chance of ``(None, None)`` instead.
    """
    level = causality_level.lower()

    if level == "unassessable":
        if random.random() < 0.5:
            return None, None
        start_offset = random.randint(5, 15)
        # Cap the stop offset at the start offset. Drawing both independently
        # (5-15 vs 1-10 days back) could put the stop date before the start.
        stop_offset = random.randint(1, min(10, start_offset))
    else:
        # Days before created_at: (start_min, start_max, stop_min, stop_max).
        # Levels without a profile (currently including "unlikely") use the
        # (10, 20, 1, 10) default.
        date_profiles = {
            "certain": (90, 150, 5, 14),
            "probable": (70, 120, 7, 20),
            "likely": (60, 110, 10, 25),
            "possible": (45, 100, 15, 30),
        }
        start_min, start_max, stop_min, stop_max = date_profiles.get(
            level, (10, 20, 1, 10)
        )
        start_offset = random.randint(start_min, start_max)
        stop_offset = random.randint(stop_min, stop_max)

    start_date = created_at - timedelta(days=start_offset)
    stop_date = created_at - timedelta(days=stop_offset)
    return start_date.strftime("%Y-%m-%d"), stop_date.strftime("%Y-%m-%d")

### Manufacturer

In [46]:
def random_manufacturer():
    """Return one manufacturer name chosen uniformly from a fixed list."""
    return random.choice(
        (
            "PharmaHealth Ltd",
            "Global Meds Inc",
            "Kenya Drug Co",
            "MedLife Pharmaceuticals",
            "HealthPlus Labs",
        )
    )

### Batch No

In [47]:
def random_batch_no():
    """Return a batch number: ``"B"`` followed by six random digits/capitals."""
    alphabet = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    suffix = random.choices(alphabet, k=6)
    return f"B{''.join(suffix)}"

### Final Medicine Data

In [48]:
def random_medicine_data(causality_assessment_level: str, created_at: datetime) -> dict:
    """Generate the flattened medicine and reaction fields of one ADR report.

    One to three of the four TB drugs are marked as suspected; the causality
    level (matched case-insensitively) sets the odds of a single suspect
    versus several. The first suspected drug determines the reaction
    description. Suspected drugs get start/stop dates, dose, frequency,
    route, batch number and manufacturer; the remaining drugs get ``None``
    for those fields.

    Args:
        causality_assessment_level: Causality level name.
        created_at: Report creation time that all dates are counted back from.

    Returns:
        A dict with ``date_of_onset_of_reaction``, ``description_of_reaction``
        and, for every drug, ``<drug>_suspected`` plus the seven
        ``<drug>_<field>`` entries listed in ``drug_fields``.
    """
    drugs = ["rifampicin", "isoniazid", "pyrazinamide", "ethambutol"]

    # Inclusive daily dose range per drug in mg: (lowest, highest).
    dose_ranges = {
        "rifampicin": (300, 600),
        "isoniazid": (150, 300),
        "pyrazinamide": (1000, 2000),
        "ethambutol": (400, 1200),
    }
    # Every range bound above is a multiple of 50, so doses are drawn on a
    # 50 mg grid across the whole range.
    dose_step_mg = 50

    reaction_pool = {
        "rifampicin": ["jaundice", "fever", "hepatotoxicity", "fatigue"],
        "isoniazid": ["tingling", "numbness", "rash", "headache"],
        "pyrazinamide": ["joint pain", "loss of appetite", "nausea", "vomiting"],
        "ethambutol": [
            "blurred vision",
            "eye pain",
            "color blindness",
            "optic neuritis",
        ],
    }

    # (single suspected drug, several suspected drugs) weights per level.
    # Levels not listed here use (0.5, 0.5).
    causality_weights_for_polypharmacy = {
        "certain": (0.9, 0.1),
        "probable": (0.7, 0.3),
        "likely": (0.7, 0.3),
        "possible": (0.3, 0.7),
        "unassessable": (0.2, 0.8),
    }

    # Per-drug detail fields, in output order, after "<drug>_suspected".
    drug_fields = (
        "start_date",
        "stop_date",
        "dose_amount",
        "frequency_number",
        "route",
        "batch_no",
        "manufacturer",
    )

    cl = causality_assessment_level.lower()
    mono_weight, poly_weight = causality_weights_for_polypharmacy.get(cl, (0.5, 0.5))
    is_mono = random.choices([True, False], weights=[mono_weight, poly_weight])[0]
    suspected_drugs = (
        [random.choice(drugs)]
        if is_mono
        else random.sample(drugs, k=random.randint(2, 3))
    )

    # suspected_drugs always holds at least one drug, so the reaction can
    # always be derived from the first one (no "insufficient data" branch).
    likely_drug = suspected_drugs[0]
    onset_date = created_at - timedelta(days=random.randint(2, 10))
    possible_reactions = reaction_pool.get(likely_drug, ["nonspecific symptoms"])
    sampled_reactions = random.sample(
        possible_reactions, k=random.randint(1, min(3, len(possible_reactions)))
    )

    medicine_dict = {
        "date_of_onset_of_reaction": onset_date.strftime("%Y-%m-%d"),
        "description_of_reaction": ", ".join(sampled_reactions),
    }

    for drug in drugs:
        if drug not in suspected_drugs:
            medicine_dict[f"{drug}_suspected"] = False
            for field in drug_fields:
                medicine_dict[f"{drug}_{field}"] = None
            continue

        medicine_dict[f"{drug}_suspected"] = True

        # Start and stop dates depend on the causality level.
        start, stop = get_start_stop_date(cl, created_at)
        medicine_dict[f"{drug}_start_date"] = start
        medicine_dict[f"{drug}_stop_date"] = stop

        # Sample across the whole dose range. random.choice on the
        # (lowest, highest) tuple only ever returned one of the two endpoints.
        lowest, highest = dose_ranges[drug]
        medicine_dict[f"{drug}_dose_amount"] = random.randrange(
            lowest, highest + 1, dose_step_mg
        )
        medicine_dict[f"{drug}_frequency_number"] = random.choice([1, 2, 3])
        medicine_dict[f"{drug}_route"] = random.choice(["oral", "IV"])
        medicine_dict[f"{drug}_batch_no"] = random_batch_no()
        medicine_dict[f"{drug}_manufacturer"] = random_manufacturer()

    return medicine_dict


In [49]:
# Setup data & helpers
def random_causality_assessment():
    """Pick a causality level for a report.

    Levels and weights: ``"certain"`` 0.05, ``"likely"`` 0.45,
    ``"possible"`` 0.3, ``"unlikely"`` 0.2.
    """
    level_weights = {
        "certain": 0.05,
        "likely": 0.45,
        "possible": 0.3,
        "unlikely": 0.2,
    }
    picked = random.choices(
        list(level_weights), weights=list(level_weights.values()), k=1
    )
    return picked[0]


def generate_adr_report():
    """Generate one synthetic ADR report as a flat dict.

    A causality level is drawn first and then steers most of the other
    fields. ``created_at`` is 1-156 weeks before now. Each report carries
    either ``patient_age`` or ``patient_date_of_birth`` (50/50), never both;
    the other one is ``None``.

    Returns:
        A dict of patient, medicine, reaction and assessment fields, ready to
        be used as one DataFrame row.
    """
    causality_level = random_causality_assessment()
    created_at = datetime.today() - timedelta(weeks=random.randint(1, 156))

    inpatient_or_outpatient_number = random_inpatient_or_outpatient_number()
    gender = random_gender(causality_level)
    patient_name = random_patient_name(gender)
    patient_address = random_patient_address()
    ward_clinic = random_ward_clinic()
    known_allergy = random_known_allergy(causality_level)

    # Plain locals: these used to be declared `global`, which leaked the last
    # generated patient's values into the notebook namespace.
    patient_date_of_birth = None
    patient_age = None

    # Record either the age or the date of birth, never both.
    if random.random() > 0.5:
        patient_age = random_patient_age()
    else:
        patient_date_of_birth = random_patient_date_of_birth(random_patient_age())

    patient_weight, patient_height = random_weight_and_height(
        causality_level, gender, patient_age, patient_date_of_birth
    )

    pregnancy_status = random_pregnancy_status(
        gender, patient_age, patient_date_of_birth, causality_level
    )

    dechallenge = random_dechallenge(causality_level)
    rechallenge = random_rechallenge(causality_level)
    severity = random_severity(causality_level)
    is_serious = random_is_serious(causality_level)
    criteria_for_seriousness = random_criteria_for_seriousness(causality_level)
    action_taken = random_action_taken(causality_level)
    outcome = random_outcome(causality_level)
    medicines = random_medicine_data(causality_level, created_at)

    return {
        "patient_name": patient_name,
        "inpatient_or_outpatient_number": inpatient_or_outpatient_number,
        "patient_date_of_birth": patient_date_of_birth,
        "patient_age": patient_age,
        "patient_address": patient_address,
        "ward_or_clinic": ward_clinic,
        "patient_gender": gender,
        "known_allergy": known_allergy,
        "pregnancy_status": pregnancy_status,
        "patient_weight_kg": patient_weight,
        "patient_height_cm": patient_height,
        # Flattened per-drug and reaction fields.
        **medicines,
        "dechallenge": dechallenge,
        "rechallenge": rechallenge,
        "severity": severity,
        "is_serious": is_serious,
        "criteria_for_seriousness": criteria_for_seriousness,
        "action_taken": action_taken,
        "outcome": outcome,
        "created_at": created_at.strftime("%Y-%m-%d"),
        "causality_assessment_level": causality_level,
    }


# Generate Rows

In [50]:
def generate_batch_reports(n=10):
    """Return a list of ``n`` freshly generated ADR report dicts."""
    return [generate_adr_report() for _ in range(n)]

# Build 1000 synthetic ADR reports (a list of dicts) and load them into a
# DataFrame, one row per report.
batch_reports = generate_batch_reports(1000)
df = pd.DataFrame(batch_reports)

In [51]:
# Preview the first rows of the generated frame.
df.head()

Unnamed: 0,patient_name,inpatient_or_outpatient_number,patient_date_of_birth,patient_age,patient_address,ward_or_clinic,patient_gender,known_allergy,pregnancy_status,patient_weight_kg,...,ethambutol_manufacturer,dechallenge,rechallenge,severity,is_serious,criteria_for_seriousness,action_taken,outcome,created_at,causality_assessment_level
0,Hamis Juma,IP-745741,,29.0,"Kangemi, Machakos",TB Screening Unit,female,no,1st trimester,67.2,...,,yes,no,mild,no,life-threatening,dose not changed,recovering,2024-05-04,likely
1,Mwalimu Sarabi Abwao,IP-732552,,28.0,"Kayole, Embu",Directly Observed Treatment (DOT) Clinic,female,no,1st trimester,63.0,...,,yes,yes,severe,yes,hospitalisation,dose not changed,death,2025-01-18,likely
2,Naki Adan,OP-722932,1984-05-24,,"Kizingo, Naivasha",TB Clinic,male,no,not applicable,71.0,...,Global Meds Inc,no,unknown,mild,no,hospitalisation,dose not changed,unknown,2022-07-09,unlikely
3,Amri Mutai Njeri,IP-439112,2016-05-24,,"Syokimau, Eldoret",Chest Clinic,female,no,not applicable,27.8,...,,no,no,moderate,yes,life-threatening,dose increased,recovering,2022-06-04,likely
4,Ntimi Atieno,IP-585090,2021-05-24,,"Kibera, Embu",TB Screening Unit,female,yes,not applicable,17.3,...,MedLife Pharmaceuticals,no,no,moderate,yes,life-threatening,drug withdrawn,recovering,2023-04-15,unlikely


# Export Data

In [52]:
# Write the generated frame to two CSV files, omitting the index column:
# one under ../server/ and one in the current working directory.
df.to_csv("../server/data.csv", index=False)
df.to_csv("data.csv", index=False)