In [12]:
from pathlib import Path

import numpy as np
import pandas as pd

# --- Configuration ---------------------------------------------------------
NUM_RECORDS = 100000
SEED = 42
MISSING_FRAC = 0.01  # share of rows blanked out in each column of COLUMNS_WITH_MISSING
COLUMNS_WITH_MISSING = ['gender', 'primary_diagnosis']
OUTPUT_PATH = Path('data') / 'health_insurance_claims.csv'

# Seed NumPy's legacy global generator. Every draw below (including
# DataFrame.sample, which falls back to this global state when no
# random_state is given) consumes from it, so the ORDER of the random calls
# determines the generated values -- do not reorder them casually.
np.random.seed(SEED)

# Define categorical choices
genders = ['F', 'M']
regions = ['Northeast', 'Midwest', 'South', 'West']
provider_types = ['PrimaryCare', 'Specialist', 'Hospital', 'UrgentCare']
diagnoses = ['None', 'Diabetes', 'Heart Disease', 'Orthopedic', 'Cancer']

# --- Sample features -------------------------------------------------------
age = np.random.randint(0, 90, size=NUM_RECORDS)  # upper bound is exclusive: ages 0..89
gender = np.random.choice(genders, size=NUM_RECORDS, p=[0.51, 0.49])  # slight female majority
region = np.random.choice(regions, size=NUM_RECORDS)  # uniform over regions
provider_type = np.random.choice(provider_types, size=NUM_RECORDS, p=[0.3, 0.3, 0.3, 0.1])
chronic_count = np.random.poisson(lam=1.5, size=NUM_RECORDS)  # most have 0-3 chronic conditions
# Ensure chronic_count is within a reasonable range
chronic_count = np.clip(chronic_count, 0, 10)
primary_dx = np.random.choice(diagnoses, size=NUM_RECORDS,
                              p=[0.5, 0.2, 0.15, 0.1, 0.05])
num_visits = np.random.poisson(lam=2, size=NUM_RECORDS)
num_er_visits = np.random.poisson(lam=0.5, size=NUM_RECORDS)
num_inpatient_stays = np.random.poisson(lam=0.2, size=NUM_RECORDS)

# --- claim_cost: deterministic base plus Gaussian noise --------------------
base_cost = 50 + age * 5 + chronic_count * 200  # base cost grows with age and conditions
# Flat surcharge if there was at least one inpatient stay (not per stay)
base_cost += np.where(num_inpatient_stays > 0, 10000, 0)
# Per-visit surcharge for ER visits
base_cost += num_er_visits * 500
# Additional cost if provider is a hospital (assuming hospital claims are higher)
base_cost += np.where(provider_type == 'Hospital', 2000, 0)
# Add random noise, then floor the result so no claim costs less than 100
claim_cost = base_cost + np.random.normal(loc=0, scale=2000, size=NUM_RECORDS)
claim_cost = np.clip(claim_cost, 100, None)

# --- is_fraud: 5% base rate, 20% for claims above 20000 --------------------
fraud_prob = 0.05 + 0.15 * (claim_cost > 20000)
is_fraud = (np.random.rand(NUM_RECORDS) < fraud_prob).astype(int)

# --- readmit_30d: 10% base, +20% if any inpatient stay, +10% if >= 3 chronic conditions
readmit_prob = 0.1 + 0.2 * (num_inpatient_stays > 0) + 0.1 * (chronic_count >= 3)
readmit_30d = (np.random.rand(NUM_RECORDS) < readmit_prob).astype(int)

# --- Assemble into DataFrame -----------------------------------------------
data = pd.DataFrame({
    'age': age, 'gender': gender, 'region': region, 'provider_type': provider_type,
    'chronic_condition_count': chronic_count, 'primary_diagnosis': primary_dx,
    'num_visits': num_visits, 'num_er_visits': num_er_visits, 'num_inpatient_stays': num_inpatient_stays,
    'claim_cost': claim_cost, 'is_fraud': is_fraud, 'readmit_30d': readmit_30d
})
# Introduce some missing values for realism. Each column gets its own
# independent sample of rows, so the blanked rows differ between columns.
for col in COLUMNS_WITH_MISSING:
    data.loc[data.sample(frac=MISSING_FRAC).index, col] = None

# --- Save to CSV -----------------------------------------------------------
# Create the target directory first so the cell also works on a fresh checkout.
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
data.to_csv(OUTPUT_PATH, index=False)
# Report the path that was actually written (the old message named a different file).
print(f"Synthetic data saved to {OUTPUT_PATH.as_posix()}")


Synthetic data saved to data/synthetic_claims.csv


In [10]:
import os

# Show the kernel's current working directory.
current_dir = os.getcwd()
print(current_dir)


C:\Users\tpjcw\Healthcare Claims ML
