---
title: Concatenating HS Founder Epigenomes
author: Sabrina Mi
date: 12/6/2023
---

## Initialize Gene-Interval Mapping

In [1]:
import pandas as pd
import numpy as np
import h5py
import os

In [2]:

# Gene names, one per line.
with open("/home/s1mi/Github/deep-learning-in-genomics/posts/2023-11-07-HS-founder-epigenomes/Br_expressed_genes.txt", "r") as f:
    gene_list = f.read().splitlines()

# Interval names, one per line; row i is paired with gene i below.
with open("/home/s1mi/Github/deep-learning-in-genomics/posts/2023-11-07-HS-founder-epigenomes/metadata/intervals.txt", "r") as f:
    intervals = f.read().splitlines()

# One row per gene. The chromosome label is the text before the first "_" in
# the interval name with its first three characters dropped
# (presumably a "chr" prefix -- confirm against intervals.txt).
mapping = pd.DataFrame({"gene": gene_list, "interval": intervals}).assign(
    chromosome=lambda df: df["interval"].str.split("_").str[0].str[3:]
)


## Concatenate by Chromosome

In [3]:
# Founder labels; each is used as a sub-directory name under predictions_dir.
founders = ['ACI', 'BN', 'BUF', 'F344', 'M520', 'MR', 'WKY', 'WN']

# Where the per-founder prediction files are read from, and where the
# concatenated per-chromosome files are written.
predictions_dir = "/eagle/AIHPC4Edu/sabrina/Br_predictions/predictions_folder/HS_founders_genome_wide/predictions_2023-11-27/enformer_predictions"
output_dir = "/eagle/AIHPC4Edu/sabrina/Br_predictions/HS_founder_epigenomes"

# chromosome label -> list of gene names on that chromosome.
gene_dict = {
    chromosome: list(genes)
    for chromosome, genes in mapping.groupby('chromosome')['gene']
}

In [4]:
def concatenate_predictions(N, completed_genes):
    """Stack per-founder predictions for every gene on chromosome ``N``.

    For each gene in ``gene_dict[N]`` that is not already listed in
    ``completed_genes``, the ``human`` and ``mouse`` datasets are read from
    every founder's ``haplotype0`` prediction file and written, stacked, as
    one dataset per gene into ``<output_dir>/human/chr{N}_genes.h5`` and
    ``<output_dir>/mouse/chr{N}_genes.h5``. The first axis of each stored
    array follows the order of ``founders``.

    Args:
        N: Chromosome label, matching the keys of ``gene_dict``.
        completed_genes: List of gene names to skip. Every gene handled by
            this call is appended to it in place, so the same list can be
            passed again to resume.

    Raises:
        KeyError: If ``N`` is not a key of ``gene_dict``.
        ValueError: If a gene does not map to exactly one row of ``mapping``
            (raised by ``Series.item()``).
    """
    # Set mirror of the caller's list for O(1) membership tests; the list
    # itself is still appended to so the caller sees the progress.
    done = set(completed_genes)
    for gene in gene_dict[N]:
        if gene in done:
            continue
        interval = mapping[mapping["gene"] == gene]['interval'].item()
        predictions = {"human": [], "mouse": []}
        for individual in founders:
            input_path = os.path.join(predictions_dir, individual, "haplotype0", f'{interval}_predictions.h5')
            # Named `source` rather than `input` to avoid shadowing the builtin.
            with h5py.File(input_path, "r") as source:
                for model in predictions:
                    predictions[model].append(source[model][:])
        for model, arrays in predictions.items():
            # The output file is opened and closed once per gene, as before.
            with h5py.File(os.path.join(output_dir, model, f"chr{N}_genes.h5"), "a") as output:
                # Assigning to a name that already exists raises in h5py.
                # Skipping it lets a run that stopped after the human write
                # but before the mouse write be resumed without crashing.
                if gene not in output:
                    output[gene] = np.array(arrays)
        completed_genes.append(gene)
        done.add(gene)

Single-file case: write one model's predictions at a time, keeping that chromosome's output file open while its genes are added.

In [4]:
def concatenate_model(N, model):
    """Write stacked founder predictions of one model for chromosome ``N``.

    Opens ``<output_dir>/<model>/chr{N}_genes.h5`` in append mode and, for
    each gene in ``gene_dict[N]`` that is not yet present in that file, reads
    the ``model`` dataset from every founder's ``haplotype0`` prediction file
    and stores the stacked result under the gene's name. The first axis of
    each stored array follows the order of ``founders``.

    Args:
        N: Chromosome label, matching the keys of ``gene_dict``.
        model: Name of the dataset to read from each prediction file; also
            the sub-directory of ``output_dir`` that is written to.
    """
    target = os.path.join(output_dir, model, f"chr{N}_genes.h5")
    with h5py.File(target, "a") as sink:
        for gene in gene_dict[N]:
            # Genes already stored in the file are left untouched.
            if gene not in sink:
                gene_rows = mapping[mapping["gene"] == gene]
                interval = gene_rows['interval'].item()
                per_founder = []
                for individual in founders:
                    source_path = os.path.join(predictions_dir, individual, "haplotype0", f'{interval}_predictions.h5')
                    with h5py.File(source_path, "r") as h5_in:
                        per_founder.append(h5_in[model][:])
                sink[gene] = np.array(per_founder)


In [6]:
# Collect the names of the datasets already stored in the human chr20 file.
with h5py.File(os.path.join(output_dir, "human", "chr20_genes.h5"), "r") as output:
    completed_genes = list(output)

In [8]:
# Build the mouse-model file for chromosomes 1 through 20, printing progress.
for N in range(1, 21):
    print("chromosome", N)
    concatenate_model(f"{N}", "mouse")

chromosome 1
chromosome 2
chromosome 3
chromosome 4
chromosome 5
chromosome 6
chromosome 7
chromosome 8
chromosome 9
chromosome 10
chromosome 11
chromosome 12
chromosome 13
chromosome 14
chromosome 15
chromosome 16
chromosome 17
chromosome 18
chromosome 19
chromosome 20
