---
title: Running Enformer over reference genome
description: We use all intervals along chromosomes 5 through 7 and 8 through 11
author: Sabrina Mi
date: 7/21/23
format:
  html:
    code-fold: true
---

### Collect all intervals

In [1]:
#| code-fold: show
def write_intervals(chromosome, length):
    """Write the overlapping tile intervals of one chromosome to a text file.

    Intervals are 114688 bp wide and start every 57344 bp, so each interval
    overlaps the second half of the previous one. Every interval that fits
    inside ``length`` is written, followed by one final interval whose end
    lies past ``length`` so that the tail of the chromosome is still covered.

    The output file ``metadata/chr{chromosome}_intervals.txt`` holds one
    double-quoted ``"chr{chromosome}_{start}_{end}"`` entry per line, with no
    newline after the last entry. The ``metadata`` directory must already
    exist.

    Args:
        chromosome: Chromosome label used in the file name and in each entry.
        length: Chromosome length in base pairs.
    """
    start = 0
    increment = 57344
    end = 114688
    # Write mode, not append: the last entry has no trailing newline, so
    # appending on a re-run would glue the first new entry onto the old last
    # line and duplicate every interval in the file.
    with open(f"metadata/chr{chromosome}_intervals.txt", 'w') as f:
        while end <= length:
            f.write(f'"chr{chromosome}_{start}_{end}"\n')
            start += increment
            end += increment
        # Final interval: extends beyond the chromosome end.
        f.write(f'"chr{chromosome}_{start}_{end}"')


In [2]:
#| code-fold: show
# (chromosome, length in bp) pairs; one interval file is written per active pair.
for chromosome, length in (
    (5, 181538259),
    (6, 170805979),
    (7, 159345973),
    # (8, 145138636),
    # (9, 138394717),
    # (10, 133797422),
    # (11, 135086622),
):
    write_intervals(chromosome=chromosome, length=length)

## Concatenate Intervals

The functions used to concatenate the Enformer predictions are defined below (expand the folded code cells to view them):

In [1]:
import os
import h5py
import numpy as np
import parsl
from parsl.app.app import python_app
from parsl.configs.local_threads import config
# Activate the local-threads Parsl configuration imported above so that
# @python_app functions can be launched from this session.
parsl.load(config)

# Tile geometry in base pairs.
L = 114688    # width of one tile (interval)
L2 = L // 2   # half a tile; also the distance between consecutive tile starts
L4 = L // 4   # quarter of a tile

# Shape of one tile's prediction array.
# NOTE(review): num_tracks is not referenced below; concatenate_predictions
# uses the literal 5313 for the dataset width -- presumably the same quantity.
num_tracks = 5313
# 896 rows * 128 bp = 114688 bp = L, consistent with the `// 128` used when
# counting bins in concatenate_predictions.
num_bins = 896

# Quarter of a tile measured in rows (bins) rather than base pairs.
Q = num_bins // 4

# Chromosome label (string) -> chromosome length in base pairs.
chr_lengths = {"5": 181538259, "6": 170805979, "7":159345973}

In [6]:

def get_chr_ints_list(chr_num, chr_len):
    """Return the names of the overlapping tiles that cover a chromosome.

    Tiles are ``L`` bp wide and start every ``L2`` bp. There is one tile per
    whole ``L2`` step in ``chr_len``, plus one more when the leftover bases
    exceed ``L4``.

    Args:
        chr_num: Chromosome label inserted into each tile name.
        chr_len: Chromosome length in base pairs.

    Returns:
        A list of ``'chr{chr_num}_{start}_{end}'`` strings ordered by start.
    """
    tile_count, leftover = divmod(chr_len, L2)
    # A leftover longer than a quarter tile needs one more tile to cover it.
    if leftover > L4:
        tile_count += 1

    return [
        f'chr{chr_num}_{L2 * idx}_{L2 * (idx + 2)}'
        for idx in range(tile_count)
    ]

@python_app
def concatenate_predictions(predictions_folder, concat_result_file, chr_num, chr_len):
    """Stitch the per-tile prediction files of one chromosome into one HDF5 dataset.

    Each tile listed by ``get_chr_ints_list`` is expected at
    ``{predictions_folder}/{interval}_predictions.h5`` and to contain a dataset
    named after the interval. Because consecutive tiles overlap by half, only
    part of each tile is copied into the destination:

    * first tile  -> its first three quarters (rows ``[0, 3Q)``),
    * middle tile ``i`` -> its middle half (rows ``[Q, 3Q)``), written to
      destination rows ``[Q*(1+2i), Q*(3+2i))``,
    * last tile   -> ``num_last_bins`` rows starting at its row ``Q``.

    When a tile file is missing, the rows it would have supplied are taken from
    the overlapping quarter of the neighbouring tile(s) where those exist and
    are NaN otherwise.

    Args:
        predictions_folder: Directory holding the ``*_predictions.h5`` tile files.
        concat_result_file: Path of the HDF5 file to create (overwritten if present).
        chr_num: Chromosome label; the destination dataset is named ``chr{chr_num}``.
        chr_len: Chromosome length in base pairs.

    Raises:
        ValueError: If the chromosome is covered by fewer than two tiles, which
            the first/middle/last layout above cannot represent.
    """
    intervals = get_chr_ints_list(chr_num, chr_len)
    if len(intervals) < 2:
        raise ValueError(
            f"chr{chr_num} (length {chr_len}) yields {len(intervals)} tile(s); "
            "at least two are required to concatenate predictions"
        )
    dest_data_name = f'chr{chr_num}'

    def read_rows(interval, rows):
        """Return the ``rows`` slice of a tile, or None when its file is missing."""
        tile_path = f'{predictions_folder}/{interval}_predictions.h5'
        if not os.path.exists(tile_path):
            return None
        with h5py.File(tile_path, "r") as f:
            # Load the whole tile, then slice in NumPy.
            return f[interval][()][rows]

    def rows_or_nan(block, count):
        """Return ``block``, or ``count`` rows of NaN when ``block`` is None."""
        if block is None:
            return np.full((count, num_tracks), np.nan)
        return block

    # Rows contributed by the last tile: from a quarter tile past its start up
    # to the chromosome end, at 128 bp per row.
    last_interval_start = int(intervals[-1].split('_')[1])
    num_last_bins = ((chr_len - (last_interval_start + L4)) // 128) + 1
    data_length = (Q*3) + ((len(intervals)-2)*Q*2) + num_last_bins

    with h5py.File(concat_result_file, "w") as dest_file:
        # Chunk height is capped at data_length because h5py rejects a chunk
        # shape larger than the dataset shape.
        # try removing chunks
        dest = dest_file.create_dataset(
            name=dest_data_name,
            fillvalue=np.nan,
            shape=(data_length, num_tracks),
            chunks=(min(1000, data_length), num_tracks),
        )

        # first tile
        data_to_concat = read_rows(intervals[0], slice(None, Q*3))
        if data_to_concat is None:
            # First two quarters are only covered by the first tile -> NaN.
            # The third quarter overlaps the first quarter of the second tile.
            third_quarter = read_rows(intervals[1], slice(None, Q))
            data_to_concat = np.concatenate(
                (rows_or_nan(None, Q*2), rows_or_nan(third_quarter, Q)), axis=0
            )
        dest[:Q*3] = data_to_concat

        # middle tiles
        for i in range(1, len(intervals)-1):
            data_to_concat = read_rows(intervals[i], slice(Q, Q*3))
            if data_to_concat is None:
                # Second quarter of this tile == last quarter of the previous
                # tile; third quarter == first quarter of the next tile.
                from_prev = read_rows(intervals[i-1], slice(-Q, None))
                from_next = read_rows(intervals[i+1], slice(None, Q))
                data_to_concat = np.concatenate(
                    (rows_or_nan(from_prev, Q), rows_or_nan(from_next, Q)), axis=0
                )
            dest[Q*(1+2*i):Q*(3+2*i)] = data_to_concat

        # last tile
        data_to_concat = read_rows(intervals[-1], slice(Q, Q+num_last_bins))
        if data_to_concat is None:
            if num_last_bins > Q:
                # The previous tile covers only the first Q of the needed rows;
                # the remainder is NaN.
                from_prev = read_rows(intervals[-2], slice(-Q, None))
                if from_prev is None:
                    data_to_concat = rows_or_nan(None, num_last_bins)
                else:
                    data_to_concat = np.concatenate(
                        (from_prev, rows_or_nan(None, num_last_bins-Q)), axis=0
                    )
            else:
                # The previous tile's last quarter covers every needed row.
                from_prev = read_rows(intervals[-2], slice(Q*3, (Q*3)+num_last_bins))
                data_to_concat = rows_or_nan(from_prev, num_last_bins)
        dest[-num_last_bins:] = data_to_concat

In [None]:
#| code-fold: show
# Launch one concatenation app per chromosome. Each call returns a future
# immediately, so the chromosomes are processed concurrently.
app_futures = []
# `chrom` rather than `chr`, so the builtin chr() is not shadowed for the rest
# of the session.
for chrom, chrom_len in chr_lengths.items():
    predictions_folder = f"/grand/TFXcan/imlab/users/sabrina/reftile_project/predictions_folder/chr{chrom}_reference_overlapping_regions/predictions_2023-08-26/enformer_predictions/chr{chrom}_reference/haplotype0"
    output = f"/grand/TFXcan/imlab/users/lvairus/reftile_project/enformer-reference-epigenome/chr{chrom}_cat.h5"
    app_futures.append(concatenate_predictions(predictions_folder, output, chrom, chrom_len))

# Block until every app has finished; .result() re-raises any exception that
# occurred inside an app.
exec_futures = [q.result() for q in app_futures]
