---
title: Running Enformer pipeline on 3 well-predicted genes
author: Sabrina Mi
date: 7/26/23

---



### Select Well-Predicted Genes

We used the R² values from [ElasticNet training](https://sabrina-mi.github.io/PTRS-Rat-Analysis/Br_EN_Validation) to select well-predicted genes.

![Well-Predicted Genes](WellPredGenes.png)

### Collect Gene Intervals

In [1]:
import pandas as pd
# Load the tab-separated rn7 gene annotation table, indexed by its 'geneId' column.
gene_annot = pd.read_csv('/home/s1mi/enformer_rat_data/annotation/rn7.gene.txt', sep ='\t',  index_col='geneId')
# Gene IDs whose intervals are written out below.
# NOTE(review): the document title mentions 3 genes, but four IDs are listed here — confirm.
gene_list = ['ENSRNOG00000001342', 'ENSRNOG00000020624', 'ENSRNOG00000058006', 'ENSRNOG00000060045']

def write_intervals(gene_list, file):
    """Append one ``chr<chromosome>_<start>_<end>`` line per gene to *file*.

    Each gene ID is echoed to stdout, then looked up in the module-level
    ``gene_annot`` table (by index label) to obtain its ``chromosome``,
    ``start`` and ``end`` fields.

    Args:
        gene_list: Iterable of gene IDs present in the ``gene_annot`` index.
        file: Path of the text file to append to; it is created if missing
            and existing content is kept.

    Raises:
        KeyError: If a gene ID is not found in ``gene_annot``.
    """
    with open(file, 'a') as handle:
        for gene_id in gene_list:
            print(gene_id)
            record = gene_annot.loc[gene_id]
            handle.write(
                f"chr{record['chromosome']}_{record['start']}_{record['end']}" + '\n'
            )

# Append the selected genes' intervals to metadata/intervals.txt.
# The file is opened in append mode, so re-running this cell adds the same lines again.
write_intervals(gene_list, "metadata/intervals.txt")

    

ENSRNOG00000001342
ENSRNOG00000020624
ENSRNOG00000058006
ENSRNOG00000060045


  gene_annot = pd.read_csv('/home/s1mi/enformer_rat_data/annotation/rn7.gene.txt', sep ='\t',  index_col='geneId')


### Concatenate h5 outputs

In [11]:
# Root directory of the prediction outputs; its entries are treated as individuals
# by concatenate_predictions.
predictions_dir="/home/s1mi/enformer_rat_pipeline/predictions_folder/personalized_enformer_minimal_some_regions/predictions_2023-07-26/enformer_predictions"
import os
import h5py


In [12]:
def concatenate_predictions(predictions_dir, output_path):
    """Merge per-individual prediction files into a single HDF5 file.

    For every entry of *predictions_dir* (one per individual) a group with
    the same name is created in the output file. Each file found in that
    individual's ``haplotype1`` subdirectory is opened, the dataset named
    after the file (with the ``_predictions.h5`` suffix removed) is read in
    full, and it is stored in the group under that same name.

    Args:
        predictions_dir: Directory containing one subdirectory per individual.
        output_path: Path of the HDF5 file to write; opened in ``"w"`` mode,
            so any existing file at that path is replaced.
    """
    with h5py.File(output_path, "w") as merged:
        for individual in os.listdir(predictions_dir):
            individual_group = merged.create_group(individual)
            # Only the first haplotype's predictions are collected.
            haplotype_dir = os.path.join(predictions_dir, individual, "haplotype1")

            for filename in os.listdir(haplotype_dir):
                interval = filename.replace("_predictions.h5", "")
                source_path = os.path.join(haplotype_dir, filename)
                with h5py.File(source_path, "r") as source:
                    values = source[interval][()]
                    individual_group.create_dataset(interval, data=values, shape=(896,5313))
            


    

In [10]:
# Combine every individual's prediction files into one HDF5 file.
concatenate_predictions(predictions_dir, "/home/s1mi/enformer_rat_pipeline/Br_enformer_predictions.h5")

### Calculate predicted gene expression

In [None]:
def calculate_gene_expression(matrix, target_interval, tss):
    """Sum the track-4980 predictions over the three bins around a TSS.

    The interval is resized to ``SEQUENCE_LENGTH`` and the prediction bins
    are taken to start ``(768 + 320)`` bins of 128 bp after the start of the
    resized window. The bin reached by stepping in 128 bp increments up to
    *tss* is located, and the predictions of that bin and of its two
    neighbours are added together.

    Args:
        matrix: 2-D array indexed as ``[bin, track]``; only column 4980 is
            read. (Presumably an Enformer output whose column 4980 is a CAGE
            track — TODO confirm.)
        target_interval: Object exposing ``resize(width)`` that returns an
            interval with a ``start`` attribute.
        tss: Transcription start site position, in the same coordinate
            system as ``target_interval``.

    Returns:
        The sum of the three bin predictions, or ``None`` when the TSS bin
        or either neighbour falls outside the predicted bins.
    """
    # NOTE(review): SEQUENCE_LENGTH is not defined in the visible part of
    # this file; it must be in scope when this function is called.
    window_coords = target_interval.resize(SEQUENCE_LENGTH)
    low_range = window_coords.start
    cage_predictions = matrix[:, 4980]

    # Walk forward one 128 bp bin at a time until the bin start reaches the TSS.
    bin_start = low_range + ((768 + 320) * 128)
    count = -1
    while bin_start < tss:
        bin_start += 128
        count += 1

    # Require both neighbours to exist: without the lower bound, count - 1
    # would become a negative index and silently wrap to the last bins.
    if count < 1 or count >= len(cage_predictions) - 1:
        return None
    return cage_predictions[count - 1] + cage_predictions[count] + cage_predictions[count + 1]
