# Cluster Buster Pipeline

This is a pipeline that accomplishes the following:
1. creates training, validation, testing, and no-call data from snp metrics for the neural network (create_data.sh runs create_data.py)
2. finds the optimal model structure and trains neural network model (create_model.sh runs create_model.py)
3. renders genotype predictions on snp metrics datasets (predictions.sh runs predictions.py)
4. creates figures to visualize genotypes of each snp (plot_snp_figures.sh runs plot_snp_figures.py)

Set the variables in the first code cell, then run the cells that follow in order.

## Set Variables

In [None]:
# --- Conda setup: passed as the first three arguments to every src/*.sh script below ---
conda_environment_name = "focalloss"
# "$USER" is not expanded by Python; presumably the shell that runs the command resolves it.
conda_source = "/data/$USER/conda/etc/profile.d/conda.sh"
mamba_source = "/data/$USER/conda/etc/profile.d/mamba.sh"

# --- Data-creation step (arguments to src/create_data.sh) ---
snp_list = "dummy/snpid_list.txt"  # also passed to src/plot_snp_figures.sh
parquet_list = "dummy/dummy_pq_list.txt"
# Dataset paths; these are also the inputs to the training and prediction steps.
training_data = "dummy/dummy_train.csv"
validation_data = "dummy/dummy_val.csv"
test_data = "dummy/dummy_holdout.csv"
nocalls_data = "dummy/dummy_nocalls.csv"
# Split fractions handed to create_data.sh.
# NOTE(review): the remainder (0.10 here) presumably becomes the test split — confirm in create_data.py.
training_proportion = 0.80
validation_proportion = 0.10  # see note above
snp_map = "dummy/dummy_snp_map.csv"  # also passed to src/predictions.sh

# --- Model step (arguments to src/create_model.sh) ---
# The prediction cells address the model as "{model_directory}/{model_name}".
model_directory = "dummy"
tuner_project_name = "dummy_tuner"
model_name = "dummy_model.keras"

# --- Prediction step: last argument to src/predictions.sh, one file per dataset ---
# These files are then passed to src/plot_snp_figures.sh.
training_predictions = "dummy/dummy_train_predictions.csv"
validation_predictions = "dummy/dummy_val_predictions.csv"
test_predictions = "dummy/dummy_test_predictions.csv"
nocalls_predictions = "dummy/dummy_nc_predictions.csv"

# --- Plotting step (argument to src/plot_snp_figures.sh) ---
figures_directory = "dummy/dummy_figs"

Set `run_on_hpc` to `True` to submit each step as an `sbatch` job. Set it to `False` to run the commands directly in the Jupyter notebook.

In [None]:
# Read by every run cell below: True -> submit the step with sbatch, False -> run it inline.
run_on_hpc = False

## Gather and Clean Data

In [None]:
# Command line for src/create_data.sh. Arguments are positional, so the
# order below must match what the script expects. Every piece ends with a
# space so the implicitly concatenated literals form one space-separated
# command string.
create_data_command = (
    "bash src/create_data.sh "
    f"{conda_source} "
    f"{mamba_source} "
    f"{conda_environment_name} "
    f"{snp_list} "
    f"{parquet_list} "
    f"{training_proportion} "
    f"{validation_proportion} "
    f"{training_data} "
    f"{validation_data} "
    # The held-out split path is named test_data in the settings cell; the
    # same name is used by the test-set prediction cell.
    f"{test_data} "
    f"{nocalls_data} "
    f"{snp_map} "
)

In [None]:
# Run the data-creation step. With run_on_hpc set, the command is wrapped in an
# sbatch submission (2 CPUs per task, 20g memory, 2:00:00 time limit);
# otherwise it is executed directly through IPython's "!" shell escape.
# "$name" and "{name}" are filled in by IPython from the notebook's Python variables.
if run_on_hpc:
    !sbatch --cpus-per-task=2 --mem=20g --time=2:00:00 --wrap="$create_data_command"
else:
    !{create_data_command}

## Create, Train, and Save Neural Network

In [None]:
# Command line for src/create_model.sh: the script name, then each
# positional argument formatted and followed by a single space, in the
# order listed.
train_model_command = "bash src/create_model.sh " + "".join(
    f"{argument} "
    for argument in (
        conda_source,
        mamba_source,
        conda_environment_name,
        model_directory,
        model_name,
        tuner_project_name,
        training_data,
        validation_data,
    )
)

In [None]:
# Run the model search/training step. With run_on_hpc set, the command is
# submitted with sbatch to the "gpu" partition: 4 CPUs per task, 40g memory,
# two v100x GPUs, and a time limit of "1-0" (days-hours in Slurm's time format).
# NOTE(review): "lscratch:200" is presumably 200 GB of node-local scratch — confirm for your cluster.
# Without run_on_hpc, the command runs directly through IPython's "!" shell escape.
if run_on_hpc:
    !sbatch --mem=40g --cpus-per-task=4 --partition=gpu --gres=gpu:v100x:2,lscratch:200 --time=1-0 --wrap="$train_model_command"
else:
    !{train_model_command}

## Render Predictions

### on training data

In [None]:
# Command line for src/predictions.sh on the training set: the script name,
# then each positional argument formatted and followed by a single space.
training_predictions_command = "bash src/predictions.sh " + "".join(
    f"{argument} "
    for argument in (
        conda_source,
        mamba_source,
        conda_environment_name,
        f"{model_directory}/{model_name}",  # path to the saved model file
        snp_map,
        training_data,
        training_predictions,
    )
)

In [None]:
# Run predictions on the training set. With run_on_hpc set, the command is
# submitted with sbatch (2 CPUs per task, 20g memory, 2:00:00 time limit);
# otherwise it is executed directly through IPython's "!" shell escape.
if run_on_hpc:
    !sbatch --cpus-per-task=2 --mem=20g --time=2:00:00 --wrap="$training_predictions_command"
else:
    !{training_predictions_command}

### on validation data

In [None]:
# Command line for src/predictions.sh on the validation set: the script name,
# then each positional argument formatted and followed by a single space.
validation_predictions_command = "bash src/predictions.sh " + "".join(
    f"{argument} "
    for argument in (
        conda_source,
        mamba_source,
        conda_environment_name,
        f"{model_directory}/{model_name}",  # path to the saved model file
        snp_map,
        validation_data,
        validation_predictions,
    )
)

In [None]:
# Run predictions on the validation set. With run_on_hpc set, the command is
# submitted with sbatch (2 CPUs per task, 20g memory, 2:00:00 time limit);
# otherwise it is executed directly through IPython's "!" shell escape.
if run_on_hpc:
    !sbatch --cpus-per-task=2 --mem=20g --time=2:00:00 --wrap="$validation_predictions_command"
else:
    !{validation_predictions_command}

### on test data

In [None]:
# Command line for src/predictions.sh on the test set: the script name,
# then each positional argument formatted and followed by a single space.
test_predictions_command = "bash src/predictions.sh " + "".join(
    f"{argument} "
    for argument in (
        conda_source,
        mamba_source,
        conda_environment_name,
        f"{model_directory}/{model_name}",  # path to the saved model file
        snp_map,
        test_data,
        test_predictions,
    )
)

In [None]:
# Run predictions on the test set. With run_on_hpc set, the command is
# submitted with sbatch (2 CPUs per task, 20g memory, 2:00:00 time limit);
# otherwise it is executed directly through IPython's "!" shell escape.
if run_on_hpc:
    !sbatch --cpus-per-task=2 --mem=20g --time=2:00:00 --wrap="$test_predictions_command"
else:
    !{test_predictions_command}

### on no-calls

In [None]:
# Command line for src/predictions.sh on the no-call set: the script name,
# then each positional argument formatted and followed by a single space.
nocalls_predictions_command = "bash src/predictions.sh " + "".join(
    f"{argument} "
    for argument in (
        conda_source,
        mamba_source,
        conda_environment_name,
        f"{model_directory}/{model_name}",  # path to the saved model file
        snp_map,
        nocalls_data,
        nocalls_predictions,
    )
)

In [None]:
# Run predictions on the no-call set. With run_on_hpc set, the command is
# submitted with sbatch (2 CPUs per task, 20g memory, 2:00:00 time limit);
# otherwise it is executed directly through IPython's "!" shell escape.
if run_on_hpc:
    !sbatch --cpus-per-task=2 --mem=20g --time=2:00:00 --wrap="$nocalls_predictions_command"
else:
    !{nocalls_predictions_command}

## Plot SNPs from Training, Validation, Testing, and No Call Sets

In [None]:
# Command line for src/plot_snp_figures.sh: the script name, each positional
# argument formatted and followed by a single space, then the
# --plot_predicted flag with no trailing space.
plot_snps_command = (
    "bash src/plot_snp_figures.sh "
    + "".join(
        f"{argument} "
        for argument in (
            conda_source,
            mamba_source,
            conda_environment_name,
            snp_list,
            # Prediction files, in the order the original command lists them.
            nocalls_predictions,
            training_predictions,
            validation_predictions,
            test_predictions,
            figures_directory,
        )
    )
    + "--plot_predicted"
)


In [None]:
# Run the SNP plotting step. With run_on_hpc set, the command is submitted
# with sbatch (2 CPUs per task, 20g memory, 2:00:00 time limit); otherwise
# it is executed directly through IPython's "!" shell escape.
if run_on_hpc:
    !sbatch --cpus-per-task=2 --mem=20g --time=2:00:00 --wrap="$plot_snps_command"
else:
    !{plot_snps_command}