---
title: "Scaling up EpigenomeXcan: execution time, parallelization"
author: Sabrina Mi
date: 1/3/2024
---


## Numpy Test

In [1]:
import numpy as np
import pandas as pd
from scipy import interpolate
import h5py
import os
import time
# Column labels applied to each sample's probability table when it is loaded
# into a DataFrame: 'POS' first, followed by eight further columns
# (presumably one per founder strain -- TODO confirm).
columns = ['POS', 'ACI', 'BN', 'BUF', 'F344', 'M520', 'MR', 'WKY', 'WN']

In [3]:
# Build the gene -> (chromosome, tss) table from two parallel text files:
# line i of the gene list corresponds to line i of the interval list.
with open("/home/s1mi/Github/deep-learning-in-genomics/posts/2023-11-07-HS-founder-epigenomes/Br_expressed_genes.txt", "r") as f:
    gene_list = f.read().splitlines()
with open("/home/s1mi/Github/deep-learning-in-genomics/posts/2023-11-07-HS-founder-epigenomes/metadata/intervals.txt", "r") as f:
    intervals = f.read().splitlines()
mapping = pd.DataFrame({"gene": gene_list, "interval": intervals})

# Interval strings are "_"-separated: field 0 is stored as the chromosome and
# field 1 as the tss (both still strings at this point).
split_intervals = mapping['interval'].str.split('_')
mapping['chromosome'] = split_intervals.str.get(0)
mapping['tss'] = split_intervals.str.get(1)

# Save every column except the raw interval string.
mapping[['gene', 'chromosome', 'tss']].to_csv("/eagle/AIHPC4Edu/sabrina/Br_predictions/HS_founder_epigenomes/gene_mapping.txt", index=False)


In [4]:
# Directory holding the per-chromosome "<chromosome>_probabilities.h5" inputs.
probabilities_dir = "/home/s1mi/Br_genotype_probabilities"
# Directory holding the per-chromosome "<chromosome>_genes.h5" reference matrices.
reference_dir = "/eagle/AIHPC4Edu/sabrina/Br_predictions/HS_founder_epigenomes/human"
# Root directory under which one output folder per sample is placed.
project_dir = "/eagle/AIHPC4Edu/sabrina/Br_predictions/Br_epigenomes"
# Sample identifiers, one per line of the text file.
with open("/eagle/AIHPC4Edu/sabrina/Br_predictions/HS_founder_epigenomes/Br_samples.txt", "r") as f:
    individuals = f.read().splitlines()
# Gene table (gene, chromosome, tss) reloaded from the CSV written earlier in
# this notebook; the code below does integer arithmetic on `tss`, so the column
# is presumably parsed as numeric by read_csv -- TODO confirm.
mapping = pd.read_csv("/eagle/AIHPC4Edu/sabrina/Br_predictions/HS_founder_epigenomes/gene_mapping.txt")


In [9]:
def write_h5(path, predictions):
    """Store every entry of ``predictions`` in a freshly created HDF5 file.

    Parameters
    ----------
    path : str
        Destination file; opened in "w" mode, so an existing file is replaced.
    predictions : dict
        Maps a name to the value stored under that name in the file.
    """
    with h5py.File(path, "w") as h5_out:
        for name in predictions:
            h5_out[name] = predictions[name]

In [6]:
def compute_epigenome(chr, gene, start, end, prob):
    """Weight one gene's reference matrix by position-interpolated probabilities.

    Parameters
    ----------
    chr : str
        Chromosome label; selects ``<chr>_genes.h5`` inside ``reference_dir``.
    gene : str
        Name of the dataset to read from that reference file.
    start, end : number
        First and last coordinate of the window; 896 evenly spaced positions
        are taken between them (both ends included).
    prob : pandas.DataFrame
        Table with a 'POS' column plus one column per entry of ``columns[1:]``.

    Returns
    -------
    numpy.ndarray
        One row per bin, obtained by multiplying each bin's interpolated
        probability vector with that bin's slice of the reference matrix.
    """
    bins = np.linspace(start, end, 896)

    # One linear interpolation per probability column, evaluated at the bins.
    positions = prob['POS']
    per_column = [np.interp(bins, positions, prob[name]) for name in columns[1:]]
    pr_matrix = np.array(per_column).T  # rows = bins, columns = probability columns

    reference_path = os.path.join(reference_dir, f"{chr}_genes.h5")
    with h5py.File(reference_path, "r") as reference_file:
        gene_matrix = reference_file[gene][:]

    # Swap the first two axes so that the bin axis leads
    # (assumes the stored dataset is 3-D with bins on axis 1 -- TODO confirm).
    ref_matrix = gene_matrix.transpose(1, 0, 2)

    # Batched (1 x k) @ (k x m) product per bin, then drop the singleton axis.
    product = pr_matrix[:, np.newaxis, :] @ ref_matrix
    return product.squeeze(axis=1)
    

def run_predictions(chr, prob, gene_annot):
    """Compute the predicted epigenome of every gene in ``gene_annot``.

    Parameters
    ----------
    chr : str
        Chromosome label forwarded to ``compute_epigenome``.
    prob : pandas.DataFrame
        Probability table forwarded to ``compute_epigenome``.
    gene_annot : pandas.DataFrame
        Must provide a 'gene' column and a numeric 'tss' column.

    Returns
    -------
    dict
        Maps each gene name to the array returned by ``compute_epigenome``.

    Notes
    -----
    Elapsed time is printed after every 100 genes. The count is the number of
    genes processed so far, not the DataFrame index label, so it stays correct
    for filtered frames whose labels do not start at 0.

    No bounds check is applied to the window: ``np.interp`` (used downstream)
    holds the first/last value constant for positions outside the range
    covered by ``prob['POS']``.
    """
    half_window = 57344  # window is tss +/- 57,344 positions (114,688 wide)
    predictions = {}
    start_time = time.perf_counter()
    gene_tss_pairs = zip(gene_annot['gene'], gene_annot['tss'])
    for n_done, (gene, tss) in enumerate(gene_tss_pairs, start=1):
        start = tss - half_window
        end = tss + half_window
        predictions[gene] = compute_epigenome(chr, gene, start, end, prob)
        if n_done % 100 == 0:
            elapsed = time.perf_counter() - start_time
            print(n_done, "iterations:", elapsed, "seconds")

    return predictions

In [7]:
# Run the NumPy implementation for one sample on chr1 and save the result.
chrom = 'chr1'
sample_id = '000789972A'
genes_df = mapping[mapping['chromosome'] == chrom]

# Create the sample folder up front so the write at the end of the long run
# cannot fail because the directory is missing.
output_dir = os.path.join(project_dir, sample_id)
os.makedirs(output_dir, exist_ok=True)

# Copy the sample's table into memory, then release the HDF5 file before the
# long-running computation starts.
with h5py.File(os.path.join(probabilities_dir, f"{chrom}_probabilities.h5"), "r") as prob_file:
    prob = pd.DataFrame(prob_file[sample_id][:], columns=columns)

predictions = run_predictions(chrom, prob, genes_df)

# The file name is built from `chrom`: at cell scope `chr` is Python's built-in
# function, so formatting it would yield "<built-in function chr>_genes.h5".
write_h5(os.path.join(output_dir, f"{chrom}_genes.h5"), predictions)

100 iterations: 30.23924341239035 seconds
200 iterations: 58.741962409578264 seconds
300 iterations: 86.70877797622234 seconds
400 iterations: 114.42556379362941 seconds
500 iterations: 142.98598795291036 seconds
600 iterations: 171.59385333769023 seconds
700 iterations: 199.60792813822627 seconds
800 iterations: 227.5820687506348 seconds
900 iterations: 254.93660536315292 seconds
1000 iterations: 282.0493865776807 seconds
1100 iterations: 309.8969806600362 seconds
1200 iterations: 337.443421902135 seconds
1300 iterations: 364.4411320472136 seconds
1400 iterations: 391.4582523852587 seconds
1500 iterations: 418.4635310182348 seconds
1600 iterations: 467.45072995033115 seconds
1700 iterations: 504.66587734408677 seconds
1800 iterations: 541.083711117506 seconds
1900 iterations: 574.226130806841 seconds
2000 iterations: 608.9806590257213 seconds
2100 iterations: 645.2934711230919 seconds
2200 iterations: 675.4267345676199 seconds
2300 iterations: 704.1126300338656 seconds
2400 iterations

In [10]:
# Save the chr1 predictions for sample 000789972A.
# The chromosome is spelled out in the file name: `chr` is not a variable at
# cell scope, so f"{chr}_genes.h5" would format Python's built-in `chr`
# function and produce "<built-in function chr>_genes.h5".
output_dir = os.path.join(project_dir, '000789972A')
write_h5(os.path.join(output_dir, "chr1_genes.h5"), predictions)


## Scipy Test

This test uses SciPy's `interp1d` function to interpolate all 8 columns simultaneously. Generating the interpolation function costs about 20 seconds of runtime.

In [17]:
def compute_epigenome(gene, start, end, interp):
    """Weight one chr1 gene's reference matrix by interpolated probabilities.

    Parameters
    ----------
    gene : str
        Name of the dataset to read from ``chr1_genes.h5`` in ``reference_dir``.
    start, end : number
        First and last coordinate of the window; 896 evenly spaced positions
        are taken between them (both ends included).
    interp : callable
        Called with the array of bin positions; its result is used as the
        per-bin probability matrix (rows = bins).

    Returns
    -------
    numpy.ndarray
        One row per bin: that bin's probability vector multiplied with that
        bin's slice of the reference matrix.
    """
    bin_positions = np.linspace(start, end, 896)
    pr_matrix = interp(bin_positions)

    reference_path = os.path.join(reference_dir, "chr1_genes.h5")
    with h5py.File(reference_path, "r") as reference_file:
        gene_matrix = reference_file[gene][:]

    # Swap the first two axes so that the bin axis leads
    # (assumes the stored dataset is 3-D with bins on axis 1 -- TODO confirm).
    ref_matrix = gene_matrix.transpose(1, 0, 2)

    # Batched (1 x k) @ (k x m) product per bin, then drop the singleton axis.
    product = pr_matrix[:, np.newaxis, :] @ ref_matrix
    return product.squeeze(axis=1)
def run_predictions(prob, gene_annot):
    """Compute predicted epigenomes using a single shared ``interp1d`` object.

    Parameters
    ----------
    prob : pandas.DataFrame
        Table with a 'POS' column plus the probability columns; all non-'POS'
        columns are interpolated together along the row axis.
    gene_annot : pandas.DataFrame
        Must provide a 'gene' column and a numeric 'tss' column.

    Returns
    -------
    dict
        Maps each gene name to the array returned by ``compute_epigenome``.
        Genes whose window is not fully inside the range spanned by the first
        and last row of ``prob`` are skipped.

    Notes
    -----
    Elapsed time is printed after every 200 genes visited. The count is the
    loop position, not the DataFrame index label, and the message is emitted
    even when the 200th gene itself is skipped.
    """
    half_window = 57344  # window is tss +/- 57,344 positions (114,688 wide)
    predictions = {}
    interp = interpolate.interp1d(
        prob['POS'].to_numpy(),
        prob.drop(columns=['POS']).to_numpy(),
        axis=0,
    )
    # Range limits taken from the first column of the first and last row
    # (assumes that column is 'POS' and is sorted ascending -- TODO confirm).
    # Hoisted out of the loop because they never change.
    lowest_pos = prob.iloc[0, 0]
    highest_pos = prob.iloc[-1, 0]

    start_time = time.perf_counter()
    gene_tss_pairs = zip(gene_annot['gene'], gene_annot['tss'])
    for n_seen, (gene, tss) in enumerate(gene_tss_pairs, start=1):
        start = tss - half_window
        end = tss + half_window
        # interp1d raises on out-of-range input by default, so only windows
        # that lie entirely inside the covered range are evaluated.
        if start >= lowest_pos and end <= highest_pos:
            predictions[gene] = compute_epigenome(gene, start, end, interp)
        if n_seen % 200 == 0:
            elapsed = time.perf_counter() - start_time
            print(n_seen, "iterations:", elapsed, "seconds")
    return predictions

In [18]:
# Time the interp1d-based run_predictions on chr1 for sample 000789972A.
# The handle is named `prob_file` so the built-in `input` is not rebound in the
# notebook's namespace, and the file is released before the long run starts
# (the sample's table has already been copied into memory by then).
with h5py.File(os.path.join(probabilities_dir, "chr1_probabilities.h5"), "r") as prob_file:
    prob = pd.DataFrame(prob_file['000789972A'][:], columns=columns)
predictions = run_predictions(prob, genes_df)
# Writing the result is left disabled here, as in the original timing run:
# write_h5(os.path.join(output_dir, "chr1_genes.h5"), predictions)


200 iterations: 55.26515249814838 seconds
400 iterations: 105.58576967194676 seconds
600 iterations: 136.04839484021068 seconds
800 iterations: 193.6581759629771 seconds
1000 iterations: 254.47129005938768 seconds
1200 iterations: 305.2362718908116 seconds
1400 iterations: 355.68491036072373 seconds
1600 iterations: 417.03871417138726 seconds
1800 iterations: 469.593163177371 seconds
2000 iterations: 522.8375322008505 seconds
2200 iterations: 576.9442004561424 seconds
2400 iterations: 629.3138183737174 seconds
2600 iterations: 687.7609961153939 seconds
2800 iterations: 739.739781155251 seconds
3000 iterations: 797.2053201990202 seconds


## Tensorflow Test

Neither the TensorFlow nor the PyTorch library offers interpolation suited to my purpose, so the only step where GPU computation can be incorporated is the final matrix multiplication. However, transferring the matrices from CPU to GPU memory adds overhead, so we need to test whether using the GPU for the matrix multiplication is fast enough.

In [None]:
import tensorflow as tf

In [18]:
def compute_epigenome(gene, start, end, prob, chrom="chr1"):
    """Weight one gene's reference matrix by position-interpolated probabilities.

    Parameters
    ----------
    gene : str
        Name of the dataset to read from the reference file.
    start, end : number
        First and last coordinate of the window; 896 evenly spaced positions
        are taken between them (both ends included).
    prob : pandas.DataFrame
        Table with a 'POS' column plus one column per entry of ``columns[1:]``.
    chrom : str, optional
        Chromosome label selecting ``<chrom>_genes.h5`` inside
        ``reference_dir``. Defaults to "chr1". This function has no ``chr``
        binding of its own, so formatting ``chr`` into the path would insert
        Python's built-in function instead of a chromosome name.

    Returns
    -------
    numpy.ndarray
        One row per bin: that bin's probability vector multiplied with that
        bin's slice of the reference matrix.
    """
    # The bins are built with NumPy: they are consumed by np.interp on the CPU,
    # and tf.linspace is documented for floating-point endpoints whereas the
    # window coordinates here are integers.
    bins = np.linspace(start, end, 896)
    scaled_prob = [np.interp(bins, prob['POS'], prob[column]) for column in columns[1:]]
    pr_matrix = np.transpose(np.array(scaled_prob))  # rows = bins
    with h5py.File(os.path.join(reference_dir, f"{chrom}_genes.h5"), "r") as hf:
        matrix = hf[gene][:]
    # Swap the first two axes so that the bin axis leads
    # (assumes the stored dataset is 3-D with bins on axis 1 -- TODO confirm).
    ref_matrix = np.transpose(matrix, axes=(1, 0, 2))
    # NOTE(review): this product still runs on the CPU with NumPy; it is the
    # step this section proposes to move to the GPU.
    output = np.squeeze(pr_matrix[:, np.newaxis, :] @ ref_matrix, 1)
    return output
    

def run_predictions(prob, gene_annot):
    """Compute the predicted epigenome of every gene in ``gene_annot``.

    Parameters
    ----------
    prob : pandas.DataFrame
        Probability table forwarded unchanged to ``compute_epigenome``.
    gene_annot : pandas.DataFrame
        Must provide a 'gene' column and a numeric 'tss' column.

    Returns
    -------
    dict
        Maps each gene name to the array returned by ``compute_epigenome`` for
        the window tss - 57344 .. tss + 57344.
    """
    half_window = 57344
    results = {}
    for _, record in gene_annot.iterrows():
        name = record['gene']
        center = record['tss']
        window_start = center - half_window
        window_end = center + half_window
        results[name] = compute_epigenome(name, window_start, window_end, prob)
    return results