---
title: Rerunning Enformer pipeline on missing individuals, intervals
date: 9/2/2023
author: Sabrina Mi
---

## Split Individuals by Number of Missing Predictions

In [23]:
import os
import pandas as pd

In [266]:
predictions_dir = "/home/s1mi/Br_predictions/predictions_folder/personalized_Br_selected_genes/predictions_2023-09-02/enformer_predictions"

# Number of prediction files an individual needs in haplotype0 to count as complete.
N_EXPECTED_INTERVALS = 868

# Group the individuals whose run is incomplete by how many prediction files they
# already have: {number of files in haplotype0: [individual, ...]}.
# Despite its name, this dict only holds individuals with fewer than
# N_EXPECTED_INTERVALS predictions.
successful_predictions = {}

# Sort so that group order (and therefore the index used in the metadata/config
# file names written later) does not depend on the arbitrary order of os.listdir.
for individual in sorted(os.listdir(predictions_dir)):
    individual_dir = os.path.join(predictions_dir, individual)
    if not os.path.isdir(individual_dir):
        # A stray file in the predictions folder is not an individual; listing
        # "<file>/haplotype0" would raise NotADirectoryError.
        continue
    n_intervals = len(os.listdir(os.path.join(individual_dir, "haplotype0")))
    if n_intervals < N_EXPECTED_INTERVALS:
        successful_predictions.setdefault(n_intervals, []).append(individual)

for n_intervals, individuals in successful_predictions.items():
    print("n individuals with", n_intervals, "predictions:", len(individuals))



In [257]:
# Load the full list of intervals, one per line, without line terminators.
with open("metadata/intervals.txt", "r") as intervals_file:
    intervals_text = intervals_file.read()
all_intervals = intervals_text.splitlines()

## Write Metadata

In [258]:
# For each group of individuals (grouped by how many predictions they already have),
# write the list of individuals and the list of intervals that still need predictions.
for index, n_intervals in enumerate(successful_predictions.keys()):
    individuals = successful_predictions[n_intervals]
    with open(f"metadata/individuals{index}.txt", "w") as f:
        f.write("\n".join(individuals))

    # Work out the missing intervals for every individual in the group rather than
    # only the first one: having the same *number* of predictions does not guarantee
    # that the same intervals are missing.
    missing_by_individual = {}
    for individual in individuals:
        files = os.listdir(os.path.join(predictions_dir, individual, "haplotype0"))
        # A set keeps the membership test below constant-time per interval.
        intervals = {file.replace("_predictions.h5", "") for file in files}
        missing_by_individual[individual] = {
            interval for interval in all_intervals if interval not in intervals
        }

    # Rerun every interval that is missing for at least one individual in the group,
    # keeping the order of intervals.txt.
    group_missing = set().union(*missing_by_individual.values())
    missing_intervals = [interval for interval in all_intervals if interval in group_missing]

    # Surface a violated "same count => same missing intervals" assumption instead of
    # silently skipping intervals for some individuals.
    inconsistent = [
        individual
        for individual, missing in missing_by_individual.items()
        if missing != group_missing
    ]
    if inconsistent:
        print(
            f"WARNING: group {index} ({n_intervals} predictions): "
            f"{len(inconsistent)} of {len(individuals)} individuals are missing a "
            f"different set of intervals; writing the union "
            f"({len(missing_intervals)} intervals)."
        )

    with open(f"metadata/intervals{index}.txt", "w") as f:
        f.write("\n".join(missing_intervals))

## Write Config Files

In [259]:
import json

# Load the template config once; the same dict is updated in place for each
# group and dumped to that group's own config file.
with open("personalized_config.json", "r") as template_file:
    parameters = json.load(template_file)

for index, (n_intervals, group_individuals) in enumerate(successful_predictions.items()):
    # Key order matches the order in which the entries are added to the config.
    parameters.update({
        "individuals": f"/home/s1mi/Github/deep-learning-in-genomics/posts/2023-08-31-Br-personalized-prediction-on-more-genes/metadata/individuals{index}.txt",
        "interval_list_file": f"/home/s1mi/Github/deep-learning-in-genomics/posts/2023-08-31-Br-personalized-prediction-on-more-genes/metadata/intervals{index}.txt",
        "n_individuals": len(group_individuals),
        "batch_individuals": 5,
        # 868 is the threshold used when grouping; the remainder is what is left to run.
        "n_regions": 868 - n_intervals,
        "batch_regions": 5,
        "date": "2023-09-02",
    })
    parameters["parsl_parameters"]["walltime"] = "00:30:00"

    with open(f"personalized_config{index}.json", "w") as config_file:
        json.dump(parameters, config_file, indent=2)


## Run Pipeline

```
# Activate the conda environment used to run enformer_predict.py.
module load conda
conda activate /lus/grand/projects/TFXcan/imlab/shared/software/conda_envs/enformer-predict-tools

# Start one background run per config file (personalized_config0.json .. personalized_config2.json),
# sleeping 20 seconds after each launch.
for i in 0 1 2; do
    python /home/s1mi/Github/enformer_epigenome_pipeline/enformer_predict.py --parameters /home/s1mi/Github/deep-learning-in-genomics/posts/2023-08-31-Br-personalized-prediction-on-more-genes/personalized_config${i}.json &
    sleep 20
done
```
