In [1]:
import numpy as np
import pandas as pd
from scipy.stats import multivariate_normal

# Number of simulated subjects
N = 1000

# The continuous covariates are drawn jointly from one multivariate normal.
# Column order of the mean vector and covariance matrix:
# age, SBP, SCr, BMI, HbA1c.
mean_cont = [40, 124, 1, 27, 6]
cov_matrix = [
    [25,   5,    0.01,   2,    0.1],    # age
    [5,    121,  0.02,   4,    0.2],    # SBP
    [0.01, 0.02, 0.0004, 0.01, 0.001],  # SCr
    [2,    4,    0.01,   25,   0.2],    # BMI
    [0.1,  0.2,  0.001,  0.2,  0.64],   # HbA1c
]
# One row per subject, one column per continuous covariate.
cont_vars = multivariate_normal.rvs(mean_cont, cov_matrix, size=N)

# Categorical covariates are stored as integer codes (5 race levels,
# 4 education levels), each drawn independently with fixed probabilities.
race = np.random.choice(
    [0, 1, 2, 3, 4], size=N, p=[0.37, 0.23, 0.23, 0.13, 0.04]
)
education = np.random.choice(
    [0, 1, 2, 3], size=N, p=[0.16, 0.42, 0.22, 0.20]
)

# Binary 0/1 indicators; the second probability in each list is P(value == 1).
diabetes = np.random.choice([0, 1], size=N, p=[0.88, 0.12])
hypertension = np.random.choice([0, 1], size=N, p=[0.69, 0.31])
smoke = np.random.choice([0, 1], size=N, p=[0.43, 0.57])
male = np.random.choice([0, 1], size=N, p=[0.5, 0.5])  # assumed 50-50 split

# Per-subject hazard score built from the given hazard-ratio coefficients
def hazard_function(x):
    """Return the weighted sum of covariates used as a subject's hazard score.

    Parameters
    ----------
    x : sequence of 9 values
        Unpacked in this order: age, race, male, diabetes, hypertension,
        uacr, egfr, sbp, smoke. ``race`` must be usable as an index into a
        five-entry lookup table.

    Returns
    -------
    The sum of each covariate multiplied by its coefficient, plus the
    race-specific term.

    NOTE(review): the coefficients are described as hazard ratios but are
    combined additively on the raw covariate scale rather than
    multiplicatively -- confirm this is the intended model.
    """
    age, race, male, diabetes, hypertension, uacr, egfr, sbp, smoke = x

    # Race contributes a fixed term looked up by its integer code.
    race_terms = (1, 3.2, 4, 0.7, 1.1)

    contributions = (
        0.5 * age,
        race_terms[race],
        1.2 * male,
        5.2 * diabetes,
        1.0 * hypertension,
        4.0 * uacr,
        2.7 * egfr,
        2.3 * sbp,
        1.8 * smoke,
    )

    # Accumulate strictly left to right so floating-point rounding matches a
    # plain chained addition of the terms.
    score = contributions[0]
    for term in contributions[1:]:
        score = score + term
    return score

# Draw one exponential event time (kidney failure) per subject. The scale of
# each draw is 30 divided by that subject's hazard_function value.
# NOTE(review): columns 2 and 3 of cont_vars (labelled SCr and BMI below) are
# passed into hazard_function's uacr and egfr slots -- confirm this mapping.
time_to_failure = np.zeros(N)
for subject in range(N):
    row = cont_vars[subject]
    covariates = (
        row[0],                 # age
        race[subject],
        male[subject],
        diabetes[subject],
        hypertension[subject],
        row[2],                 # goes to the uacr slot
        row[3],                 # goes to the egfr slot
        row[1],                 # SBP
        smoke[subject],
    )
    scale = 30 / hazard_function(covariates)
    time_to_failure[subject] = np.random.exponential(scale)

# Status is 1.0 where the drawn time is below 30 and 0.0 otherwise; the time
# itself is stored as drawn (it is not capped at 30).
status = (time_to_failure < 30).astype(float)

# Assemble every variable into a single table. column_stack coerces all
# columns to float, so the coded categoricals are cast back to int before
# being replaced by their labels.
columns = [
    'age', 'SBP', 'SCr', 'BMI', 'HbA1c',
    'Diabetes', 'Hypertension', 'Smoke',
    'Race', 'Education', 'Male',
    'Time_to_Kidney_Failure', 'Status',
]
data = np.column_stack([
    cont_vars,
    diabetes, hypertension, smoke,
    race, education, male,
    time_to_failure, status,
])
df = pd.DataFrame(data, columns=columns)

race_labels = {0: 'White', 1: 'Black', 2: 'Hispanic', 3: 'Asian', 4: 'Other'}
education_labels = {0: 'K-8', 1: 'High School', 2: 'Some college', 3: 'College'}
df['Race'] = df['Race'].astype(int).map(race_labels)
df['Education'] = df['Education'].astype(int).map(education_labels)

# Write the dataset to disk without the index column
csv_file = 'simulated_data.csv'
df.to_csv(csv_file, index=False)
print(f"Saved dataset to {csv_file}")

# Summaries of the simulated outcome
print(df['Time_to_Kidney_Failure'].describe())
print(df['Status'].value_counts())

Saved dataset to simulated_data.csv
count    1000.000000
mean        0.076490
std         0.074920
min         0.000031
25%         0.023223
50%         0.053088
75%         0.105863
max         0.567201
Name: Time_to_Kidney_Failure, dtype: float64
Status
1.0    1000
Name: count, dtype: int64
