---
title: Preparing pipeline inputs for Enformer predictions centered at canonical TSS"
date: 8/8/23
author: Sabrina Mi
---

### Write Gene Intervals

In [2]:
import pandas as pd

model_genes = pd.read_csv("highestR2genes.csv", header=0)
model_genes.head()

Unnamed: 0,gene,genename,pred.perf.R2,n.snps.in.model,pred.perf.pval,cor,pred.perf.qval
0,ENSRNOG00000001342,Wdr66,0.844361,3,7.502608e-93,0.918891,1.243114e-89
1,ENSRNOG00000020624,Acadsb,0.829218,3,2.602032e-88,0.910614,2.155665e-85
2,ENSRNOG00000060523,AABR07044362.6,0.820292,8,8.068512e-86,0.9057,4.456263e-83
3,ENSRNOG00000058006,Sncg,0.816705,6,7.472729999999999e-85,0.903717,3.095408e-82
4,ENSRNOG00000060045,Pi4ka,0.815282,3,1.78446e-84,0.90293,5.913374000000001e-82


In [3]:
# gene annotation
annot_df = pd.read_csv('/home/s1mi/enformer_rat_data/rn7.gene.txt', sep ='\t',  index_col='geneId')
annot_df.head(3)


  annot_df = pd.read_csv('/home/s1mi/enformer_rat_data/rn7.gene.txt', sep ='\t',  index_col='geneId')


Unnamed: 0_level_0,geneSymbol,chromosome,start,end,strand,tss,description,hasEqtl,expr_BLA,expr_Brain,...,sqtl_BLA,sqtl_Brain,sqtl_Eye,sqtl_IL,sqtl_LHb,sqtl_NAcc,sqtl_NAcc2,sqtl_OFC,sqtl_PL,sqtl_PL2
geneId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSRNOG00000066169,ENSRNOG00000066169,1,36112690,36122387,-,36122387,,True,True,True,...,False,False,False,False,False,False,False,False,False,False
ENSRNOG00000070168,Olr56,1,157231467,157232417,+,157231467,olfactory receptor family 51 subfamily F membe...,True,False,True,...,False,False,False,False,False,False,False,False,False,False
ENSRNOG00000070901,Irgq,1,80123925,80131881,+,80123925,immunity-related GTPase Q,True,True,True,...,False,False,False,False,False,False,False,False,False,False


In [33]:
def write_intervals(gene_list, file):
    with open(file, 'a') as f:
        for gene in gene_list:
            gene_annot = annot_df.loc[gene]
            tss = gene_annot['tss']
            start = tss - 57344
            end = tss + 57344
            interval = f"chr{gene_annot['chromosome']}_{start}_{end}"
            f.write(interval + '\n')

In [4]:
gene_list = [gene for gene in model_genes['gene'][:10] if gene in annot_df.index]
with open("gene_list.txt", "w") as f:
    f.write("\n".join(gene_list))
write_intervals(gene_list, "metadata/intervals.txt")


### Write Individuals List

In [38]:
!bcftools query -l /home/s1mi/enformer_rat_data/Brain.rn7.vcf.gz > metadata/individuals.txt

I added these parameters to the config file: `/home/s1mi/Github/deep-learning-in-genomics/posts/running-enformer-on-rat-genes-at-TSS/run_on_polaris_personalized.json`

### Run Enformer

```
module load conda

conda activate /lus/grand/projects/TFXcan/imlab/shared/software/conda_envs/enformer-predict-tools

cd /home/s1mi/Github/shared_pipelines/enformer_pipeline
python3 scripts/enformer_predict.py --parameters /home/s1mi/Github/deep-learning-in-genomics/posts/running-enformer-on-rat-genes-at-TSS/run_on_polaris_personalized.json

```

### Concatenate Predictions

In [41]:
import os
import h5py

def concatenate_predictions(predictions_dir, output_path):
    with h5py.File(output_path, "w") as output_file:
        individuals = os.listdir(predictions_dir)
        for individual in individuals:
            group = output_file.create_group(individual)

            dir = os.path.join(predictions_dir, individual, "haplotype1")
            filenames = os.listdir(dir)

            for file in filenames:
                interval = file.replace("_predictions.h5", "")
                with h5py.File(os.path.join(dir, file), "r") as input_file:
                    group.create_dataset(interval, data=input_file[interval][()], shape=(896,5313))

In [42]:

predictions_dir="/home/s1mi/Github/deep-learning-in-genomics/posts/running-enformer-on-rat-genes-at-TSS/predictions_folder/personalized_Br_genes/predictions_2023-08-08/enformer_predictions"
concatenate_predictions(predictions_dir, "Br_2023-08-08_predictions.h5")