# In [1]:
import pandas as pd
import random

# Proportions
num_rows = 1000  # Total dataset size
# Share of the dataset assigned to each causality class.
proportions = {
    "Certain": 0.1,
    "Probable/Likely": 0.2,
    "Possible": 0.3,
    "Unlikely": 0.2,
    "Conditional/Unclassified": 0.1,
    "Unassessable/Unclassifiable": 0.1,
}
# Number of rows to generate per class. round() is used instead of int()
# because int() truncates, and float products can land just below the
# intended integer (e.g. int(100 * 0.29) == 28), silently dropping rows.
class_sizes = {key: round(num_rows * value) for key, value in proportions.items()}


# Helper functions for conditions
def generate_certain() -> dict:
    """Build one random synthetic row labelled ``"Certain"``.

    Returns:
        A dict mapping column name to value. "Rechallenge" and
        "Is Reaction Serious" are always "Yes"; the remaining fields are
        drawn at random from the option lists below.
    """
    # Draw gender once and derive pregnancy status from that same value.
    # Drawing a second, independent gender for the pregnancy check would
    # allow rows with Gender == "Male" and a trimester.
    gender = random.choice(["Male", "Female"])
    if gender == "Male":
        pregnancy_status = "Not applicable"
    else:
        pregnancy_status = random.choice(
            ["1st Trimester", "2nd Trimester", "3rd Trimester", "Not applicable"]
        )
    return {
        "Gender": gender,
        "Pregnancy Status": pregnancy_status,
        "Rechallenge": "Yes",
        "Dechallenge": random.choice(["Yes", "No"]),
        "Severity": random.choice(["Mild", "Moderate", "Severe"]),
        "Is Reaction Serious": "Yes",
        "Reason for Seriousness": random.choice(
            ["Hospitalisation", "Life threatening"]
        ),
        "Action Taken": random.choice(["Drug withdrawn", "Dose reduced"]),
        "Outcome": random.choice(["Recovered", "Recovering"]),
        "Causality Assessment": "Certain",
    }


def generate_probable_likely() -> dict:
    """Build one random synthetic row labelled ``"Probable/Likely"``.

    Returns:
        A dict mapping column name to value. "Is Reaction Serious" is
        always "Yes" and "Rechallenge" is never "Yes"; the remaining
        fields are drawn at random from the option lists below.
    """
    # Draw gender once and derive pregnancy status from that same value.
    # Drawing a second, independent gender for the pregnancy check would
    # allow rows with Gender == "Male" and a trimester.
    gender = random.choice(["Male", "Female"])
    if gender == "Male":
        pregnancy_status = "Not applicable"
    else:
        pregnancy_status = random.choice(
            ["1st Trimester", "2nd Trimester", "3rd Trimester", "Not applicable"]
        )
    return {
        "Gender": gender,
        "Pregnancy Status": pregnancy_status,
        "Rechallenge": random.choice(["No", "Unknown"]),
        "Dechallenge": random.choice(["Yes", "No"]),
        "Severity": random.choice(["Mild", "Moderate", "Severe"]),
        "Is Reaction Serious": "Yes",
        "Reason for Seriousness": random.choice(["Hospitalisation", "Disability"]),
        "Action Taken": random.choice(["Drug withdrawn", "Dose reduced"]),
        "Outcome": random.choice(["Recovered", "Recovering"]),
        "Causality Assessment": "Probable/Likely",
    }


def generate_possible() -> dict:
    """Build one random synthetic row labelled ``"Possible"``.

    Returns:
        A dict mapping column name to value. "Reason for Seriousness" is
        one of the listed reasons when "Is Reaction Serious" is "Yes" and
        ``None`` when it is "No"; the remaining fields are drawn at random
        from the option lists below.
    """
    # Draw gender once and derive pregnancy status from that same value.
    # Drawing a second, independent gender for the pregnancy check would
    # allow rows with Gender == "Male" and a trimester.
    gender = random.choice(["Male", "Female"])
    if gender == "Male":
        pregnancy_status = "Not applicable"
    else:
        pregnancy_status = random.choice(
            ["1st Trimester", "2nd Trimester", "3rd Trimester", "Not applicable"]
        )

    # Tie the seriousness reason to the seriousness flag so a non-serious
    # row never carries a reason and a serious row always has one. None
    # (not the string "None") matches the other non-serious generators.
    is_serious = random.choice(["Yes", "No"])
    if is_serious == "Yes":
        seriousness_reason = random.choice(["Hospitalisation", "Life threatening"])
    else:
        seriousness_reason = None

    return {
        "Gender": gender,
        "Pregnancy Status": pregnancy_status,
        "Rechallenge": random.choice(["Unknown", "N/A"]),
        "Dechallenge": random.choice(["Yes", "No"]),
        "Severity": random.choice(["Mild", "Moderate", "Severe", "Unknown"]),
        "Is Reaction Serious": is_serious,
        "Reason for Seriousness": seriousness_reason,
        # NOTE(review): "Recovering" / "Not recovered" look like Outcome
        # values rather than actions; kept as-is pending confirmation.
        "Action Taken": random.choice(["Unknown", "Recovering", "Not recovered"]),
        "Outcome": random.choice(["Not recovered", "Recovered"]),
        "Causality Assessment": "Possible",
    }


def generate_unlikely() -> dict:
    """Build one random synthetic row labelled ``"Unlikely"``.

    Returns:
        A dict mapping column name to value. "Rechallenge" is "N/A",
        "Is Reaction Serious" is "No", "Reason for Seriousness" is ``None``
        and "Outcome" is "Not recovered"; the remaining fields are drawn
        at random from the option lists below.
    """
    # Draw gender once and derive pregnancy status from that same value.
    # Drawing a second, independent gender for the pregnancy check would
    # allow rows with Gender == "Male" and a trimester.
    gender = random.choice(["Male", "Female"])
    if gender == "Male":
        pregnancy_status = "Not applicable"
    else:
        pregnancy_status = random.choice(
            ["1st Trimester", "2nd Trimester", "3rd Trimester", "Not applicable"]
        )
    return {
        "Gender": gender,
        "Pregnancy Status": pregnancy_status,
        "Rechallenge": "N/A",
        "Dechallenge": random.choice(["Yes", "No"]),
        "Severity": random.choice(["Mild", "Unknown"]),
        "Is Reaction Serious": "No",
        "Reason for Seriousness": None,
        "Action Taken": random.choice(["Not applicable", "Unknown"]),
        "Outcome": "Not recovered",
        "Causality Assessment": "Unlikely",
    }


def generate_conditional_unclassified() -> dict:
    """Build one random synthetic row labelled ``"Conditional/Unclassified"``.

    Returns:
        A dict mapping column name to value. Only "Gender",
        "Pregnancy Status", "Rechallenge" and "Dechallenge" vary between
        calls; every other field is a fixed value.
    """
    # Draw gender once and derive pregnancy status from that same value.
    # Drawing a second, independent gender for the pregnancy check would
    # allow rows with Gender == "Male" and a trimester.
    gender = random.choice(["Male", "Female"])
    if gender == "Male":
        pregnancy_status = "Not applicable"
    else:
        pregnancy_status = random.choice(
            ["1st Trimester", "2nd Trimester", "3rd Trimester", "Not applicable"]
        )
    return {
        "Gender": gender,
        "Pregnancy Status": pregnancy_status,
        "Rechallenge": random.choice(["Unknown", "N/A"]),
        "Dechallenge": random.choice(["Unknown", "N/A"]),
        "Severity": "Unknown",
        "Is Reaction Serious": "No",
        "Reason for Seriousness": None,
        "Action Taken": "Unknown",
        "Outcome": "Not recovered",
        "Causality Assessment": "Conditional/Unclassified",
    }


def generate_unassessable_unclassifiable() -> dict:
    """Build one random synthetic row labelled ``"Unassessable/Unclassifiable"``.

    Returns:
        A dict mapping column name to value. Only "Gender" and
        "Pregnancy Status" vary between calls; every other field is a
        fixed value.
    """
    # Draw gender once and derive pregnancy status from that same value.
    # Drawing a second, independent gender for the pregnancy check would
    # allow rows with Gender == "Male" and a trimester.
    gender = random.choice(["Male", "Female"])
    if gender == "Male":
        pregnancy_status = "Not applicable"
    else:
        pregnancy_status = random.choice(
            ["1st Trimester", "2nd Trimester", "3rd Trimester", "Not applicable"]
        )
    return {
        "Gender": gender,
        "Pregnancy Status": pregnancy_status,
        "Rechallenge": "N/A",
        "Dechallenge": "N/A",
        "Severity": "Unknown",
        "Is Reaction Serious": "No",
        "Reason for Seriousness": None,
        "Action Taken": "Unknown",
        "Outcome": "Not recovered",
        "Causality Assessment": "Unassessable/Unclassifiable",
    }


# Generate dataframes for each class
# Explicit label -> generator table. This replaces deriving a function name
# from the label text and looking it up in globals(), which silently depended
# on the labels and function names staying in sync character-for-character.
row_generators = {
    "Certain": generate_certain,
    "Probable/Likely": generate_probable_likely,
    "Possible": generate_possible,
    "Unlikely": generate_unlikely,
    "Conditional/Unclassified": generate_conditional_unclassified,
    "Unassessable/Unclassifiable": generate_unassessable_unclassifiable,
}

dataframes = []
for causality, size in class_sizes.items():
    try:
        func = row_generators[causality]
    except KeyError:
        # Same exception type as the old globals() lookup, but with a
        # message that names the class that has no generator.
        raise KeyError(
            f"No row generator registered for causality class {causality!r}"
        ) from None
    data = [func() for _ in range(size)]
    dataframes.append(pd.DataFrame(data))

# Merge and shuffle
final_df = pd.concat(dataframes, ignore_index=True)
# sample(frac=1) returns every row in random order; the index is then
# rebuilt so it does not reveal the original per-class ordering.
final_df = final_df.sample(frac=1).reset_index(drop=True)

# Save to CSV
final_df.to_csv("pharmacovigilance_dataset_with_conditions.csv", index=False)
