In [None]:
from faker import Faker
import pandas as pd
import random
from datetime import datetime, timedelta

def _weighted_sample_without_replacement(rng, population, weights, k):
    """Pick *k* distinct items from *population*, favouring higher weights.

    Items are drawn one at a time with ``rng.choices`` and removed from the
    pool, so no item can appear twice in the result.
    """
    pool = list(population)
    pool_weights = list(weights)
    picked = []
    for _ in range(k):
        idx = rng.choices(range(len(pool)), weights=pool_weights, k=1)[0]
        picked.append(pool.pop(idx))
        pool_weights.pop(idx)
    return picked


def generate_synthetic_rcm_data(n_visits=1000000, max_procs_per_visit=4, seed=None):
    """Generate a synthetic revenue-cycle dataset, one row per billed procedure.

    Each visit gets a service date (2024-01-01 plus 0-545 days), a payer, a
    physician name and a clinic, and between 1 and ``max_procs_per_visit``
    distinct procedure codes (capped at the number of known codes).  Codes are
    drawn according to their configured ``prob`` weight.  Each procedure line
    is denied with probability 0.20; denied lines pay 0.0 and carry a denial
    code plus an ``appeal_success`` flag (True with probability 0.3).  Paid
    lines pay between 80% and 100% of the allowed amount and have
    ``denial_code`` / ``appeal_success`` set to ``None``.

    Args:
        n_visits: Number of visits to simulate.  Zero or a negative value
            yields an empty frame that still has all columns.
        max_procs_per_visit: Upper bound on procedure lines per visit; must
            be at least 1.
        seed: Optional seed.  When given, a private ``random.Random`` and the
            Faker instance are both seeded so the output is reproducible.
            When ``None`` the module-level ``random`` state is used.

    Returns:
        pandas.DataFrame with columns ``visit_id``, ``procedure_code``,
        ``svc_date``, ``payer_company``, ``payer_type``, ``clinic``,
        ``physician``, ``charge_amt``, ``allowed_amt``, ``adjust_amt``,
        ``paid_amt``, ``denial_code`` and ``appeal_success``.

    Raises:
        ValueError: If ``max_procs_per_visit`` is less than 1.
    """
    if max_procs_per_visit < 1:
        raise ValueError("max_procs_per_visit must be at least 1")

    # The ``random`` module exposes the same API as a ``Random`` instance, so
    # unseeded callers keep drawing from the global generator as before.
    rng = random if seed is None else random.Random(seed)
    fake = Faker()
    if seed is not None:
        fake.seed_instance(seed)

    carriers = [f"Carrier{i}" for i in range(1, 20)]
    clinics = ['Northside Clinic', 'Downtown Health', 'Lakeside Medical']
    procedure_params = {
        '99213': {'prob': 0.5, 'charge_amt': 120, 'allowed_range': (80, 100)},
        '99214': {'prob': 0.3, 'charge_amt': 150, 'allowed_range': (100, 120)},
        '93000': {'prob': 0.2, 'charge_amt': 200, 'allowed_range': (150, 170)}
    }
    codes = list(procedure_params)
    weights = [procedure_params[c]['prob'] for c in codes]

    # Build per-carrier allowed ranges: two draws inside the base range,
    # sorted so the pair is always (low, high).
    allowed_ranges = {
        carrier: {
            code: tuple(sorted([
                rng.uniform(*procedure_params[code]['allowed_range']),
                rng.uniform(*procedure_params[code]['allowed_range'])
            ]))
            for code in codes
        }
        for carrier in carriers
    }

    # Denial parameters
    denial_rate = 0.20
    denial_group_weights = {
        'eligibility': 0.25,
        'coding': 0.25,
        'benefits': 0.20,
        'timely_filing': 0.15,
        'other': 0.15
    }
    denial_codes_by_group = {
        'eligibility':   {'CARC1': 0.6, 'CARC2': 0.4},
        'coding':        {'CARC3': 0.7, 'CARC4': 0.3},
        'benefits':      {'CARC5': 1.0},
        'timely_filing': {'CARC6': 0.5, 'CARC7': 0.5},
        'other':         {'CARC8': 0.5, 'CARC9': 0.5}
    }
    # Loop-invariant views of the denial tables, built once.
    group_names = list(denial_group_weights)
    group_weights = list(denial_group_weights.values())
    group_code_choices = {
        group: (list(code_w), list(code_w.values()))
        for group, code_w in denial_codes_by_group.items()
    }

    columns = [
        'visit_id', 'procedure_code', 'svc_date', 'payer_company',
        'payer_type', 'clinic', 'physician', 'charge_amt', 'allowed_amt',
        'adjust_amt', 'paid_amt', 'denial_code', 'appeal_success',
    ]
    max_procs = min(max_procs_per_visit, len(codes))
    start_date = datetime(2024, 1, 1)
    records = []

    for visit_idx in range(n_visits):
        visit_id = f"{100000 + visit_idx}"
        svc_date = (start_date + timedelta(days=rng.randint(0, 545))).date()
        payer = rng.choice(carriers)
        physician = fake.name()
        clinic = rng.choice(clinics)

        # Distinct procedure codes for this visit, weighted by 'prob'.
        n_procs = rng.randint(1, max_procs)
        procs = _weighted_sample_without_replacement(rng, codes, weights, n_procs)

        for code in procs:
            charge_amt = procedure_params[code]['charge_amt']
            lo, hi = allowed_ranges[payer][code]
            allowed_amt = rng.uniform(lo, hi)

            # Denial logic
            if rng.random() < denial_rate:
                group = rng.choices(group_names, weights=group_weights, k=1)[0]
                code_names, code_weights = group_code_choices[group]
                denial_code = rng.choices(code_names, weights=code_weights, k=1)[0]
                paid_amt = 0.0
                appeal_success = rng.random() < 0.3
            else:
                denial_code = None
                paid_amt = rng.uniform(allowed_amt * 0.8, allowed_amt)
                appeal_success = None

            # Round the payment first and derive the adjustment from the
            # rounded figure so paid_amt + adjust_amt reconciles to charge_amt.
            paid_amt = round(paid_amt, 2)
            adjust_amt = round(charge_amt - paid_amt, 2)

            records.append({
                'visit_id':       visit_id,
                'procedure_code': code,
                'svc_date':       svc_date,
                'payer_company':  payer,
                'payer_type':     'Commercial',
                'clinic':         clinic,
                'physician':      physician,
                'charge_amt':     charge_amt,
                'allowed_amt':    round(allowed_amt, 2),
                'adjust_amt':     adjust_amt,
                'paid_amt':       paid_amt,
                'denial_code':    denial_code,
                'appeal_success': appeal_success
            })

    # Passing the column list keeps the schema intact when no rows are produced.
    return pd.DataFrame(records, columns=columns)

if __name__ == "__main__":
    # Build the full-size dataset and print a 15-row preview of it.
    df = generate_synthetic_rcm_data(
        n_visits=1_000_000,
        max_procs_per_visit=4,
    )
    print(df.head(n=15))


In [None]:
# Bare expression: as the last statement of a notebook cell this renders `df`
# as the cell output. Presumably relies on `df` having been assigned by the
# script-entry block in the previous cell; verify the cell execution order.
df