In [5]:
import pandas as pd
import numpy as np

In [13]:

# Build a reproducible, simulated demographics table and export it in three
# delimited-text formats (comma, tab, and space separated).
num_records = 1000
np.random.seed(42)

# Record identifiers 'sub-1' ... 'sub-<num_records>'. They are built with Python
# string formatting because adding a str to a NumPy string array
# ('sub-' + ids.astype(str)) raises UFuncTypeError on NumPy releases before 2.0.
record_ids = np.array([f'sub-{i}' for i in range(1, num_records + 1)])

# Ages are right-skewed: a chi-square(df=2) draw truncated to an integer,
# shifted so the minimum is 18, then clipped to the 18-90 range.
ages = np.random.chisquare(df=2, size=num_records).astype(int) + 18
ages = np.clip(ages, 18, 90)

# Sex is drawn uniformly from the two labels.
sexes = np.random.choice(['M', 'F'], size=num_records)

# Education level is drawn with the explicit probabilities given in `p`
# (same order as the label list).
edu_levels = np.random.choice(
    ['highschool', 'bachelor', 'master', 'phd'],
    size=num_records,
    p=[0.4, 0.35, 0.2, 0.05]
)

df = pd.DataFrame({
    'record_id': record_ids,
    'age': ages,
    'sex': sexes,
    'education_level': edu_levels
})

# Save to csv, tsv, and txt (space-delimited); the index is not written.
df.to_csv('demo_data.csv', index=False)
df.to_csv('demo_data.tsv', sep='\t', index=False)
df.to_csv('demo_data.txt', sep=' ', index=False)

In [12]:
# Simulated neuropsychological scores: one row per record in `record_ids`.
# Each score is a normal(mean=100, sd=15) draw truncated to an integer.
np.random.seed(42)

# The comprehension draws the columns in the listed order (memory, attention,
# executive function), so the seeded random stream is consumed the same way
# on every run.
np_df = pd.DataFrame({
    'record_id': record_ids,
    **{
        score_name: np.random.normal(100, 15, num_records).astype(int)
        for score_name in (
            'memory_score',
            'attention_score',
            'executive_function_score',
        )
    },
})

# Write the table without the index column.
np_df.to_csv('neuropsych_data.csv', index=False)

In [15]:
# Simulated imaging measures for a random half of the records
# (record IDs are sampled without replacement).
np.random.seed(42)
half_cohort = num_records // 2
imaging_record_ids = np.random.choice(record_ids, size=half_cohort, replace=False)

# Draws are made in column order (hippocampus, amygdala, cortical thickness) so
# the seeded random stream is consumed the same way on every run.
hippocampus_draws = np.random.normal(loc=3500, scale=300, size=half_cohort)
amygdala_draws = np.random.normal(loc=1500, scale=200, size=half_cohort)
thickness_draws = np.random.normal(loc=2.5, scale=0.3, size=half_cohort)

# The two volume columns are truncated to integers; thickness is rounded to
# two decimals.
imaging_df = pd.DataFrame({
    'record_id': imaging_record_ids,
    'hippocampus_volume': hippocampus_draws.astype(int),
    'amygdala_volume': amygdala_draws.astype(int),
    'cortical_thickness': thickness_draws.round(2)
})

# Write the table without the index column.
imaging_df.to_csv('imaging_data.csv', index=False)

In [18]:
# Simulated biomarker (cytokine) data for a random half of the records.
# This cell uses seed 32, whereas the other cells in this file use seed 42.
np.random.seed(32)
biomarker_record_ids = np.random.choice(record_ids, size=num_records // 2, replace=False)
n_biomarker = len(biomarker_record_ids)

# Keyword arguments are evaluated left to right, so the three cytokine columns
# are drawn in the order a, b, c; every value is rounded to two decimals.
biomarker_df = pd.DataFrame({'record_id': biomarker_record_ids}).assign(
    cytokine_a=np.random.normal(50, 10, n_biomarker).round(2),
    cytokine_b=np.random.normal(30, 5, n_biomarker).round(2),
    cytokine_c=np.random.normal(100, 20, n_biomarker).round(2),
)

# Write the table without the index column.
biomarker_df.to_csv('biomarker_data.csv', index=False)