# Tutorial for generating synthetic data
This tutorial uses a Synthea-generated OMOP dataset to demonstrate how to train CEHR-GPT and generate synthetic patient data.

In [None]:
# Install cehrgpt
!pip install cehrgpt --constraint constraints.txt

In [None]:
!pip install gdown

In [None]:
!gdown --fuzzy "https://drive.google.com/file/d/1k7-cZACaDNw8A1JRI37mfMAhEErxKaQJ/view?usp=share_link"

In [None]:
!mkdir omop_synthea
!mkdir omop_synthea/cehrgpt
!mkdir omop_synthea/dataset_prepared
!mkdir omop_synthea/cehrgpt/syntheic_data

In [None]:
!tar -xaf omop_synthea.tar.gz -C omop_synthea

In [None]:
%env OMOP_DIR=omop_synthea
%env CEHR_GPT_DATA_DIR=omop_synthea
%env CEHR_GPT_MODEL_DIR=omop_synthea/cehrgpt
%env SYNTHETIC_DATA_OUTPUT_DIR=omop_synthea/cehrgpt/syntheic_data

### Step 1: Generate training data

In [None]:
import subprocess
import pyspark

# Get paths
python_path = subprocess.check_output(['which', 'python']).decode().strip()
spark_home = pyspark.__file__.rsplit('/', 1)[0]

# Set environment variables using magic commands
%env SPARK_HOME=$spark_home
%env PYSPARK_PYTHON=$python_path  
%env PYSPARK_DRIVER_PYTHON=$python_path
%env SPARK_WORKER_INSTANCES=1
%env SPARK_WORKER_CORES=16
%env SPARK_EXECUTOR_CORES=2
%env SPARK_DRIVER_MEMORY=12g
%env SPARK_EXECUTOR_MEMORY=12g
%env SPARK_MASTER=local[64]

# For paths, you'll still need to use os.environ for concatenation
import os
current_pythonpath = os.environ.get('PYTHONPATH', '')
current_path = os.environ.get('PATH', '')
os.environ['PYTHONPATH'] = f"{spark_home}/python:{current_pythonpath}"
os.environ['PATH'] = f"{spark_home}/bin:{current_path}"

In [None]:
# Let's see what the script receives as arguments
!sh scripts/create_cehrgpt_pretraining_data.sh --input_folder {os.environ['OMOP_DIR']} --output_folder {os.environ['CEHR_GPT_DATA_DIR']} --start_date 1985-01-01

### Step 2: Train CEHR-GPT

In [None]:
import os
import subprocess
import sys

model_dir = os.environ['CEHR_GPT_MODEL_DIR']
data_dir = os.environ['CEHR_GPT_DATA_DIR']

# The model, tokenizer and output arguments all point at the same folder.
cmd = [
    'python', '-u', '-m', 'cehrgpt.runners.hf_cehrgpt_pretrain_runner',
    '--model_name_or_path', model_dir,
    '--tokenizer_name_or_path', model_dir,
    '--output_dir', model_dir,
    '--data_folder', f"{data_dir}/patient_sequence/train",
    '--dataset_prepared_path', f"{data_dir}/dataset_prepared",
    '--do_train', 'true',
    '--seed', '42',
    '--dataloader_num_workers', '16',
    '--dataloader_prefetch_factor', '8',
    '--hidden_size', '768',
    '--num_hidden_layers', '12',
    '--max_position_embeddings', '1024',
    '--evaluation_strategy', 'epoch',
    '--save_strategy', 'epoch',
    '--sample_packing',
    '--max_tokens_per_batch', '16384',
    '--warmup_ratio', '0.01',
    '--weight_decay', '0.01',
    '--num_train_epochs', '50',
    '--learning_rate', '0.0002',
    '--use_early_stopping',
    '--early_stopping_threshold', '0.001'
]

# Stream the combined stdout/stderr in real time. The context manager closes
# the pipe and reaps the child even if the cell is interrupted.
with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                      universal_newlines=True, bufsize=1) as process:
    for line in process.stdout:
        sys.stdout.write(line)
        sys.stdout.flush()
    return_code = process.wait()

print(f"\nCommand finished with return code: {return_code}")

# Fail the cell on a non-zero exit so "Run All" does not continue into the
# generation step after a failed training run.
if return_code != 0:
    raise subprocess.CalledProcessError(return_code, cmd)

### Step 3: Generate synthetic sequences

In [None]:
import subprocess
import os
import sys

%env TRANSFORMERS_VERBOSITY=info
%env CUDA_VISIBLE_DEVICES="0"

cmd = [
    'python', '-u', '-m', 'cehrgpt.generation.generate_batch_hf_gpt_sequence',
    '--model_folder', os.environ['CEHR_GPT_MODEL_DIR'],
    '--tokenizer_folder', os.environ['CEHR_GPT_MODEL_DIR'],
    '--output_folder', os.environ['SYNTHETIC_DATA_OUTPUT_DIR'],
    '--num_of_patients', '128',
    '--batch_size', '16',
    '--buffer_size', '128',
    '--context_window', '1024',
    '--sampling_strategy', 'TopPStrategy',
    '--top_p', '1.0',
    '--temperature', '1.0',
    '--repetition_penalty', '1.0',
    '--epsilon_cutoff', '0.00',
    '--demographic_data_path', f"{os.environ['CEHR_GPT_DATA_DIR']}/patient_sequence/train"
]

# Stream output in real-time
process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, 
                          universal_newlines=True, bufsize=1)

# Print output line by line as it comes
for line in process.stdout:
    print(line, end='')
    sys.stdout.flush()

# Wait for process to complete and get return code
return_code = process.wait()
print(f"\nCommand finished with return code: {return_code}")

### Step 4: Convert to OMOP Format

In [None]:
import os
import subprocess
import sys

# NOTE(review): the "top_p10000" sub-folder is presumably the folder the
# generation step creates for --top_p 1.0; confirm against the actual output.
synthetic_root = f"{os.environ['SYNTHETIC_DATA_OUTPUT_DIR']}/top_p10000"

# Set up the command
cmd = [
    'sh', 'scripts/omop_pipeline.sh',
    f"--patient-sequence-folder={synthetic_root}/generated_sequences/",
    f"--omop-folder={synthetic_root}/restored_omop/",
    f"--source-omop-folder={os.environ['OMOP_DIR']}",
    '--cpu-cores=10'
]

print("Running command:")
print(' '.join(cmd))
print()

# Stream the combined stdout/stderr in real time. The context manager closes
# the pipe and reaps the child even if the cell is interrupted.
with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                      universal_newlines=True, bufsize=1) as process:
    for line in process.stdout:
        sys.stdout.write(line)
        sys.stdout.flush()
    return_code = process.wait()

print(f"\nPipeline finished with return code: {return_code}")

# Surface a failed pipeline as an error instead of only printing the code.
if return_code != 0:
    raise subprocess.CalledProcessError(return_code, cmd)