<a href="https://colab.research.google.com/github/kuriawaruchu/dummy-hospital/blob/main/hospital_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

In [None]:


# Dataset size and the date window used when generating records
n_records = 5000  # adjust as needed
start_date = datetime(year=2023, month=1, day=1)
end_date = datetime(year=2025, month=8, day=31)

# Value pools that each generated record samples from
genders = ["Male", "Female"]
insurance_providers = [
    "Jubilee Health Insurance",
    "Old Mutual Insurance",
    "AAR Insurance",
    "APA Insurance",
    "CIC Insurance Group",
]
# Every doctor is stored with the "Dr. " title prepended to the surname.
doctors = [
    "Dr. " + surname
    for surname in (
        "Achieng",
        "Kamau",
        "Omondi",
        "Mutiso",
        "Chebet",
        "Mwangi",
        "Wanjiku",
        "Njoroge",
        "Barasa",
        "Oduor",
    )
]
conditions = [
    "Diabetes",
    "Hypertension",
    "Obesity",
    "Asthma",
    "Cancer",
    "Tuberculosis",
]
test_results = ["Normal", "Inconclusive", "Abnormal"]

# Helper: shift a start datetime forward by a random number of whole days
def random_date(start, end):
    """Return ``start`` plus a random whole-day offset.

    The offset is drawn with ``random.randint`` from 0 up to and including
    the number of whole days between ``start`` and ``end``, so the result
    ranges from ``start`` to ``start`` plus that many days.
    """
    span_days = (end - start).days
    offset = random.randint(0, span_days)
    return start + timedelta(days=offset)

# Generate data: one row per synthetic admission.
# Zero-pad the sequence number to a uniform width so patient IDs sort
# correctly as text even when n_records exceeds 999 (never narrower than
# the original three digits).
id_width = max(3, len(str(n_records)))

data = []
for i in range(1, n_records + 1):
    gender = random.choice(genders)
    age = np.random.normal(loc=45, scale=15)  # distributed around 45 years
    age = int(max(0, min(90, age)))  # clamp to 0–90, then truncate to an int

    insurance = random.choice(insurance_providers)
    admission_date = random_date(start_date, end_date)

    # Stay of 1–14 days, truncated so the discharge never falls after
    # end_date. An admission on end_date itself therefore gets a same-day
    # discharge.
    discharge_date = min(
        admission_date + timedelta(days=random.randint(1, 14)),
        end_date,
    )

    # Take the ID's year prefix from the admission date rather than from an
    # independent random draw: the ID then agrees with the record's own
    # dates and follows start_date/end_date if those are adjusted.
    patient_id = f"{admission_date.year}/{i:0{id_width}d}"

    # Centered around 50k with a floor of 5000
    billing_amount = max(5000, np.random.normal(loc=50000, scale=15000))
    doctor = random.choice(doctors)
    condition = random.choice(conditions)
    test_result = random.choice(test_results)

    # Field order must match the column list used to build the DataFrame.
    data.append([
        patient_id, gender, age, insurance,
        admission_date.date(), discharge_date.date(),
        int(billing_amount), doctor, condition, test_result
    ])

# Assemble the generated rows into a DataFrame with named columns
columns = [
    "patient_id",
    "gender",
    "age",
    "insurance_provider",
    "admission_date",
    "discharge_date",
    "billing_amount",
    "primary_doctor",
    "medical_condition",
    "test_result",
]
df = pd.DataFrame(data=data, columns=columns)

# Write the CSV for Power BI, leaving out the DataFrame index
output_csv = "hospital_dataset.csv"
df.to_csv(output_csv, index=False)

print(f"Dataset generated: {output_csv}")
preview = df.head(10)
print(preview)
