In [3]:
"""
generate_data.py
Creates:
  - donors.csv (5000 rows)
  - patients.csv (10 rows)
Fields: donor_id,name,surname,location,age,sex,blood_type,probability_score,lat,lon,phone,email
"""

import pandas as pd
import numpy as np
from faker import Faker
from tqdm import tqdm
import random

# Shared Faker instance used by make_donor / make_patient for names, phone
# numbers, e-mail addresses and hospital (company) names.
fake = Faker()
# Seed all three random sources used in this file (Faker, NumPy's global RNG
# and the stdlib `random` module) with the same fixed value; the intent is
# that repeated runs generate the same data.
Faker.seed(42)
np.random.seed(42)
random.seed(42)

# Number of rows to generate for donors.csv and patients.csv respectively.
N_DONORS = 5000
N_PATIENTS = 10

# Region choices with lat/lon centers (India sample).
# Each entry is (region name, center latitude, center longitude); generated
# records are placed around these centers via jitter_coords().
REGIONS = [
    ("Mumbai", 19.0760, 72.8777),
    ("Goa", 15.2993, 74.1240),
    ("Pune", 18.5204, 73.8567),
    ("Nagpur", 21.1458, 79.0882),
    ("Bengaluru", 12.9716, 77.5946)
]

# Surnames that make_donor() treats as a hint for the rare "Bombay(Oh)" blood
# type: combined with a Goa/Mumbai location they select the weight table with
# the higher Bombay(Oh) share, and on their own they raise probability_score.
RARE_SURNAME_HINTS = ["Bhagat","Parab","Sawant","Fernandes","Naik"]
# Blood-type labels. The ORDER matters: make_donor() pairs this list
# positionally with its probability arrays, so reordering or inserting an
# entry here requires updating those arrays as well.
BLOOD_TYPES = ["A+", "A-", "B+", "B-", "O+", "O-", "AB+", "AB-", "Bombay(Oh)"]

def jitter_coords(lat, lon, scale_km=20):
    """Return ``(lat, lon)`` displaced by independent Gaussian noise.

    Used to scatter generated records around a region centre.

    Args:
        lat: Latitude of the centre point, in degrees.
        lon: Longitude of the centre point, in degrees.
        scale_km: Spread control. The default of 20 yields a standard
            deviation of 0.08 degrees on each axis (the value this function
            has always used); other values scale that standard deviation
            linearly, and ``0`` disables the jitter entirely. Must not be
            negative, because NumPy rejects a negative standard deviation.

    Returns:
        A ``(jittered_lat, jittered_lon)`` tuple.

    Raises:
        ValueError: If ``scale_km`` is negative (raised by NumPy).
    """
    # scale_km used to be accepted but silently ignored. Anchoring the scaling
    # at "20 -> 0.08 degrees" makes the parameter functional while keeping the
    # default output identical for an already-seeded generator.
    sigma_deg = 0.08 * (scale_km / 20)
    # Latitude is drawn before longitude; keep this order so seeded runs
    # reproduce the same coordinates.
    return lat + np.random.normal(0, sigma_deg), lon + np.random.normal(0, sigma_deg)

def make_donor(i):
    """Build one synthetic donor record.

    Args:
        i: Sequence number of the donor; rendered zero-padded to five digits
            in ``donor_id`` (``D_00001`` for ``i == 1``).

    Returns:
        A dict with the keys ``donor_id``, ``name``, ``surname``,
        ``location``, ``age``, ``sex``, ``blood_type``,
        ``probability_score``, ``lat``, ``lon``, ``phone`` and ``email``
        (in that order).
    """
    # NOTE: the order of the random draws below is significant. The module
    # seeds its generators, so reordering draws changes the generated data.
    first = fake.first_name()
    family = random.choice(
        ["Bhagat", "Parab", "Sawant", "Patel", "Naik",
         "Fernandes", "Khan", "Rao", "Sharma", "Singh"]
    )
    region, center_lat, center_lon = random.choice(REGIONS)
    donor_lat, donor_lon = jitter_coords(center_lat, center_lon)
    # Age ~ N(32, 10), clamped to the 18..70 range and truncated to an int.
    years = int(np.clip(np.random.normal(32, 10), 18, 70))
    gender = random.choice(["M", "F"])

    hinted_surname = family in RARE_SURNAME_HINTS
    hotspot = region in ("Goa", "Mumbai")

    # Weights are positional with respect to BLOOD_TYPES; the last entry is
    # "Bombay(Oh)", which gets a larger share for hinted surnames in hotspots.
    if hinted_surname and hotspot:
        weights = np.array([0.12, 0.02, 0.12, 0.02, 0.38, 0.05, 0.10, 0.02, 0.17])
    else:
        weights = np.array([0.12, 0.02, 0.12, 0.02, 0.44, 0.04, 0.12, 0.01, 0.11])
    weights = weights / weights.sum()  # guard against rounding so p sums to 1
    group = np.random.choice(BLOOD_TYPES, p=weights)

    contact_phone = fake.msisdn()[:10]
    contact_email = fake.free_email()

    # Heuristic score for the rare blood type: a base level plus small
    # bonuses, then Gaussian noise, clamped to [0, 1].
    score_mean = 0.9 if group == "Bombay(Oh)" else 0.01
    if hinted_surname:
        score_mean += 0.05
    if hotspot:
        score_mean += 0.03
    if years < 30:
        score_mean += 0.02
    noisy_score = np.random.normal(score_mean, 0.05)
    score = float(np.clip(noisy_score, 0.0, 1.0))

    return {
        "donor_id": f"D_{i:05d}",
        "name": first,
        "surname": family,
        "location": region,
        "age": years,
        "sex": gender,
        "blood_type": group,
        "probability_score": round(score, 3),
        "lat": round(donor_lat, 5),
        "lon": round(donor_lon, 5),
        "phone": contact_phone,
        "email": contact_email,
    }

def make_patient(i):
    """Build one synthetic patient record.

    Args:
        i: Sequence number of the patient; rendered zero-padded to three
            digits in ``patient_id`` (``P_001`` for ``i == 1``).

    Returns:
        A dict with the keys ``patient_id``, ``need``, ``region``,
        ``urgency``, ``lat``, ``lon`` and ``hospital`` (in that order).
    """
    # Draw order is kept fixed so seeded runs generate the same patients.
    region_name, base_lat, base_lon = random.choice(REGIONS)
    patient_lat, patient_lon = jitter_coords(base_lat, base_lon)
    required_group = random.choice(["A+", "B+", "O+", "Bombay(Oh)"])
    priority = random.choice(["high", "medium", "low"])

    return dict(
        patient_id=f"P_{i:03d}",
        need=required_group,
        region=region_name,
        urgency=priority,
        lat=round(patient_lat, 5),
        lon=round(patient_lon, 5),
        hospital=fake.company(),  # a fake company name stands in for a hospital
    )

if __name__ == "__main__":
    donors = [make_donor(i) for i in tqdm(range(1, N_DONORS+1))]
    df_donors = pd.DataFrame(donors)
    df_donors.to_csv("donors.csv", index=False)
    print("Saved donors.csv:", df_donors.shape)

    patients = [make_patient(i) for i in range(1, N_PATIENTS+1)]
    df_patients = pd.DataFrame(patients)
    df_patients.to_csv("patients.csv", index=False)
    print("Saved patients.csv:", df_patients.shape)

    # quick summary
    print(df_donors["blood_type"].value_counts(normalize=True).round(3))



100%|██████████| 5000/5000 [00:00<00:00, 5673.85it/s]


Saved donors.csv: (5000, 12)
Saved patients.csv: (10, 7)
blood_type
O+            0.428
B+            0.121
A+            0.120
Bombay(Oh)    0.119
AB+           0.119
O-            0.042
B-            0.022
A-            0.019
AB-           0.011
Name: proportion, dtype: float64
