---
title: "Scaling up EpigenomeXcan: execution time, parallelization"
author: Sabrina Mi
date: 1/3/2024
---


## Setup

In [None]:
import numpy as np
import pandas as pd
from scipy.interpolate import interp1d
import h5py
import os
import time
# Column labels applied positionally to each sample's genotype-probability
# table: 'POS' first, then eight further labels (presumably the founder strain
# names -- confirm against the probabilities files).
columns = [
    'POS',
    'ACI', 'BN', 'BUF', 'F344',
    'M520', 'MR', 'WKY', 'WN',
]

In [2]:
# Build the gene lookup table from two parallel text files (one gene per line,
# one interval per line) and save it as CSV for the later cells.
with open("/home/s1mi/Github/deep-learning-in-genomics/posts/2023-11-07-HS-founder-epigenomes/Br_expressed_genes.txt", "r") as genes_file:
    gene_list = genes_file.read().splitlines()
with open("/home/s1mi/Github/deep-learning-in-genomics/posts/2023-11-07-HS-founder-epigenomes/metadata/intervals.txt", "r") as intervals_file:
    intervals = intervals_file.read().splitlines()

mapping = pd.DataFrame({"gene": gene_list, "interval": intervals})

# Interval strings are underscore-delimited: field 0 is stored as the
# chromosome and field 1 as the TSS (both kept as strings here).
split_intervals = mapping['interval'].str.split('_')
mapping = mapping.assign(
    chromosome=split_intervals.str[0],
    tss=split_intervals.str[1],
)

# The raw interval column is left out of the saved table.
mapping.drop(columns=['interval']).to_csv(
    "/eagle/AIHPC4Edu/sabrina/Br_predictions/HS_founder_epigenomes/gene_mapping.txt",
    index=False,
)


In [3]:
# Input and output locations used by every test below.
probabilities_dir = "/home/s1mi/Br_genotype_probabilities"
reference_dir = "/eagle/AIHPC4Edu/sabrina/Br_predictions/HS_founder_epigenomes/human"
project_dir = "/eagle/AIHPC4Edu/sabrina/Br_predictions/Br_epigenomes"

# One sample ID per line.
with open("/eagle/AIHPC4Edu/sabrina/Br_predictions/HS_founder_epigenomes/Br_samples.txt", "r") as samples_file:
    individuals = samples_file.read().splitlines()

# Reload the saved gene table and keep only the chr1 rows; the filtered frame
# keeps the row labels of the full table.
mapping = pd.read_csv("/eagle/AIHPC4Edu/sabrina/Br_predictions/HS_founder_epigenomes/gene_mapping.txt")
genes_df = mapping.loc[mapping['chromosome'].eq('chr1')]


In [4]:
def write_h5(path, predictions):
    """Store every entry of ``predictions`` in a new HDF5 file.

    The file at ``path`` is opened in write mode ("w"), so an existing file
    at that location is replaced.

    Args:
        path: Destination of the HDF5 file.
        predictions: Mapping from item name to the value stored under it.
    """
    with h5py.File(path, "w") as h5_out:
        for name in predictions:
            h5_out[name] = predictions[name]

## Numpy Test

In [5]:
def compute_epigenome(chr, gene, start, end, prob):
    """Weight one gene's reference array by interpolated probabilities (numpy).

    Each column named in ``columns[1:]`` is interpolated with ``np.interp`` at
    896 evenly spaced positions from ``start`` to ``end``; the resulting rows
    are matrix-multiplied with the dataset ``gene`` read from
    ``<reference_dir>/<chr>_genes.h5``.

    Args:
        chr: Chromosome label used to build the reference file name.
        gene: Name of the dataset to read from the reference file.
        start: First position of the window.
        end: Last position of the window (included by ``np.linspace``).
        prob: Table with a 'POS' column and the columns in ``columns[1:]``.

    Returns:
        The batched matrix product with its singleton axis 1 removed.
    """
    positions = np.linspace(start, end, 896)
    per_column = [np.interp(positions, prob['POS'], prob[name]) for name in columns[1:]]
    # One row per position, one column per entry of columns[1:].
    pr_matrix = np.array(per_column).T

    reference_path = os.path.join(reference_dir, f"{chr}_genes.h5")
    with h5py.File(reference_path, "r") as hf:
        matrix = hf[gene][:]

    # Swap the first two axes so the leading axis lines up with the positions.
    # assumes the stored dataset is (n_columns, 896, n_tracks) -- TODO confirm
    ref_matrix = matrix.transpose(1, 0, 2)
    product = np.matmul(pr_matrix[:, np.newaxis, :], ref_matrix)
    return np.squeeze(product, axis=1)
    

def run_predictions(chr, prob, gene_annot):
    """Compute a prediction for every gene in ``gene_annot`` (numpy variant).

    For each row, a window of 57344 positions on either side of the row's
    'tss' is passed to ``compute_epigenome``. No bounds check against ``prob``
    is applied in this variant, so every row yields an entry.

    Progress is printed after every 200 rows. The counter is the number of
    rows visited, not the row's index label, so it stays correct when
    ``gene_annot`` is a filtered frame whose labels do not start at 0.

    Args:
        chr: Chromosome label, forwarded to ``compute_epigenome``.
        prob: Probability table, forwarded unchanged to ``compute_epigenome``.
        gene_annot: DataFrame with 'gene' and numeric 'tss' columns.

    Returns:
        dict mapping each gene name to the value returned by
        ``compute_epigenome``. A repeated gene name keeps the last result.
    """
    # 2 * 57344 = 114688 = 896 * 128; presumably the 896 bins of 128 bp used
    # by the reference predictions -- TODO confirm.
    half_window = 57344
    predictions = {}
    start_time = time.perf_counter()
    for n_done, (_, row) in enumerate(gene_annot.iterrows(), start=1):
        gene = row['gene']
        tss = row['tss']
        predictions[gene] = compute_epigenome(chr, gene, tss - half_window, tss + half_window, prob)
        if n_done % 200 == 0:
            print(n_done, "iterations:", time.perf_counter() - start_time, "seconds")

    return predictions

In [11]:
# Run the numpy implementation end to end for one sample and save the result.
sample_id = '0007899884'
chromosome = 'chr1'

# The [:] slice copies the sample's table into memory, so the probabilities
# file can be closed before the long-running prediction loop starts.
with h5py.File(os.path.join(probabilities_dir, f"{chromosome}_probabilities.h5"), "r") as prob_file:
    prob = pd.DataFrame(prob_file[sample_id][:], columns=columns)

predictions = run_predictions(chromosome, prob, genes_df)

output_dir = os.path.join(project_dir, sample_id)
os.makedirs(output_dir, exist_ok=True)
# Build the file name from the chromosome string. Formatting the bare name
# `chr` would insert the repr of the builtin function
# ("<built-in function chr>_genes.h5"), because no variable `chr` is defined.
write_h5(os.path.join(output_dir, f"{chromosome}_genes.h5"), predictions)

200 iterations: 43.178196704000584 seconds
400 iterations: 85.30787616000089 seconds
600 iterations: 127.37160777700046 seconds
800 iterations: 170.39981612300107 seconds
1000 iterations: 214.97213824600112 seconds
1200 iterations: 258.91626574399925 seconds
1400 iterations: 302.4928128120009 seconds
1600 iterations: 346.4003939719987 seconds
1800 iterations: 389.48382942399985 seconds
2000 iterations: 433.0845548489997 seconds
2200 iterations: 476.6690443709995 seconds
2400 iterations: 546.7537841170015 seconds
2600 iterations: 592.0861903549994 seconds
2800 iterations: 658.7331474230014 seconds
3000 iterations: 718.2645332190004 seconds


## Scipy Test 

This test uses scipy's `interp1d` function to interpolate all 8 columns simultaneously. Generating the interpolation function costs about 20 seconds of runtime.

In [6]:
def compute_epigenome(gene, start, end, interp):
    """Weight one gene's reference array by interpolated probabilities (scipy).

    ``interp`` is evaluated at 896 evenly spaced positions from ``start`` to
    ``end`` and the result is matrix-multiplied with the dataset ``gene`` read
    from ``<reference_dir>/chr1_genes.h5``.

    Args:
        gene: Name of the dataset to read from the reference file.
        start: First position of the window.
        end: Last position of the window (included by ``np.linspace``).
        interp: Callable returning a 2-D array (one row per position) for an
            array of positions.

    Returns:
        The batched matrix product with its singleton axis 1 removed.
    """
    sample_points = np.linspace(start, end, 896)
    weights = interp(sample_points)

    reference_path = os.path.join(reference_dir, f"chr1_genes.h5")
    with h5py.File(reference_path, "r") as reference_file:
        stored = reference_file[gene][:]

    # Swap the first two axes so the leading axis lines up with the positions.
    reordered = stored.transpose(1, 0, 2)
    product = np.matmul(weights[:, np.newaxis, :], reordered)
    return np.squeeze(product, axis=1)
def run_predictions(prob, gene_annot):
    """Compute a prediction for every in-range gene (scipy variant).

    One ``interp1d`` interpolator is built over all non-'POS' columns of
    ``prob`` and reused for every gene. A gene is processed only when its
    window of 57344 positions on either side of 'tss' lies inside the range
    of positions in ``prob``; other genes are skipped.

    Progress is printed after every 200 rows visited, whether or not the row
    was in range. The counter is the number of rows seen, not the row's index
    label, so it stays correct for filtered frames.

    Args:
        prob: DataFrame with a 'POS' column plus the value columns.
        gene_annot: DataFrame with 'gene' and numeric 'tss' columns.

    Returns:
        dict mapping each processed gene name to the value returned by
        ``compute_epigenome``.
    """
    # 2 * 57344 = 114688 = 896 * 128; presumably the 896 bins of 128 bp used
    # by the reference predictions -- TODO confirm.
    half_window = 57344
    positions = np.array(prob['POS'])
    interp = interp1d(positions, np.array(prob.drop(columns=['POS'])), axis=0)
    # interp1d sorts its x values and raises outside [min, max], so guard with
    # the true extremes rather than the first and last rows; computed once.
    lowest_pos = positions.min()
    highest_pos = positions.max()

    predictions = {}
    start_time = time.perf_counter()
    for n_seen, (_, row) in enumerate(gene_annot.iterrows(), start=1):
        gene = row['gene']
        tss = row['tss']
        start = tss - half_window
        end = tss + half_window
        if start >= lowest_pos and end <= highest_pos:
            predictions[gene] = compute_epigenome(gene, start, end, interp)
        if n_seen % 200 == 0:
            print(n_seen, "iterations:", time.perf_counter() - start_time, "seconds")
    return predictions

In [18]:
# Time the scipy implementation on one sample's chr1 probabilities. The
# predictions stay in memory; nothing is written to disk in this run.
chr1_probabilities_path = os.path.join(probabilities_dir, f"chr1_probabilities.h5")
with h5py.File(chr1_probabilities_path, "r") as input:
    sample_table = input['000789972A'][:]
    prob = pd.DataFrame(sample_table, columns=columns)
    predictions = run_predictions(prob, genes_df)


200 iterations: 55.26515249814838 seconds
400 iterations: 105.58576967194676 seconds
600 iterations: 136.04839484021068 seconds
800 iterations: 193.6581759629771 seconds
1000 iterations: 254.47129005938768 seconds
1200 iterations: 305.2362718908116 seconds
1400 iterations: 355.68491036072373 seconds
1600 iterations: 417.03871417138726 seconds
1800 iterations: 469.593163177371 seconds
2000 iterations: 522.8375322008505 seconds
2200 iterations: 576.9442004561424 seconds
2400 iterations: 629.3138183737174 seconds
2600 iterations: 687.7609961153939 seconds
2800 iterations: 739.739781155251 seconds
3000 iterations: 797.2053201990202 seconds


## Tensorflow Test (GPU)

Neither the tensorflow nor the torch library offers interpolation suited to my purpose, so I wrote my own interpolation function in order to keep all of the computational steps as tensorflow operations.

In [5]:
import tensorflow as tf
# Show which GPUs TensorFlow can see before running the GPU test.
gpu_devices = tf.config.list_physical_devices('GPU')
print(gpu_devices)

2024-01-10 21:51:15.886087: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:3', device_type='GPU')]


In [8]:
def interpolate(x, xi, y):
    """Linearly interpolate ``y`` (sampled at ``x``) at the query points ``xi``.

    Only TensorFlow operations are used. For every query point two
    neighbouring entries of ``x`` and ``y`` are gathered and blended with the
    weight ``(xi - x1) / (x2 - x1)``.

    NOTE(review): the neighbour lookup appears to rely on ``x`` and ``xi``
    both being sorted ascending, with every ``xi`` inside ``[x[0], x[-1]]``
    -- confirm against callers.

    Args:
        x: Sample positions (concatenated with ``xi`` along the last axis;
            presumably 1-D).
        xi: Query positions (presumably 1-D).
        y: Values gathered along their first axis with indices into ``x``.

    Returns:
        Tensor of interpolated values, one entry (or row) per query point.
    """
    # Indices that sort the concatenation of the sample and query positions.
    j   =   tf.argsort(tf.concat((x, xi), axis=-1))
    k   =   tf.range(len(j))
    # Scatter i into slot j[i]: q is the inverse permutation, i.e. q[n] is the
    # sorted rank of element n of the concatenation.
    q   =   tf.scatter_nd(j[:, tf.newaxis], k, k.shape)

    lxi =   len(xi)

    # Sorted rank of each query point minus its own index among the queries;
    # presumably the number of x entries sorted before it, used as the index of
    # its right-hand neighbour in x (assumes xi is ascending -- TODO confirm).
    r   =   q[-lxi:]-tf.range(0, lxi)
    # Queries equal to the last x value use q[-1] - lxi instead; presumably
    # this keeps r from pointing past the end of x -- TODO confirm.
    r   =   tf.where(xi == x[-1], q[-1:] - lxi, r)

    # Right-hand (index r) and left-hand (index r-1) neighbours of each query.
    x2  =   tf.gather(x, r)
    x1  =   tf.gather(x, r-1)
    y2  =   tf.gather(y, r)
    y1  =   tf.gather(y, r-1)

    # Fractional distance of each query between its two neighbours.
    u   =   (xi-x1)/(x2-x1)
    # When the ranks differ, add a trailing axis to u so it can broadcast
    # against the gathered values.
    if not tf.rank(u) == tf.rank(y1):
        u   =   tf.expand_dims(u, axis=-1)

    yi  =   (1.0-u)*y1 + u*y2

    # Where both neighbours hold the same value, return that value directly
    # rather than the blended result.
    return tf.where(y1 == y2, y1, yi)

In [7]:
def compute_epigenome(gene, start, end, prob):
    """Weight one gene's reference array by interpolated probabilities (TensorFlow).

    The non-'POS' columns of ``prob`` are interpolated with ``interpolate`` at
    896 evenly spaced positions from ``start`` to ``end``, cast to float32 and
    matrix-multiplied with the dataset ``gene`` read from
    ``<reference_dir>/chr1_genes.h5``.

    Args:
        gene: Name of the dataset to read from the reference file.
        start: First position of the window.
        end: Last position of the window.
        prob: DataFrame with a 'POS' column plus the value columns.

    Returns:
        Tensor holding the batched matrix product with axis 1 squeezed out.
    """
    positions = tf.constant(prob['POS'].values)
    values = tf.constant(prob.drop(columns=['POS']).values)
    bins = tf.linspace(start, end, 896)

    interpolated = interpolate(positions, bins, values)
    # Insert a singleton axis so each position contributes one row vector.
    pr_tensor = tf.expand_dims(tf.cast(interpolated, dtype=tf.float32), axis=1)

    reference_path = os.path.join(reference_dir, f"chr1_genes.h5")
    with h5py.File(reference_path, "r") as reference_file:
        matrix = reference_file[gene][:]

    # Swap the first two axes so the leading axis lines up with the positions.
    ref_tensor = tf.transpose(matrix, perm=[1, 0, 2])
    product = tf.matmul(pr_tensor, ref_tensor)
    return tf.squeeze(product, axis=1)
def run_predictions(prob, gene_annot):
    """Compute a prediction for every in-range gene (TensorFlow variant).

    A gene is processed only when its window of 57344 positions on either
    side of 'tss' lies between the first and last 'POS' values of ``prob``;
    other genes are skipped.

    Progress is printed after every 200 rows visited, whether or not the row
    was in range. The counter is the number of rows seen, not the row's index
    label, so it stays correct for filtered frames.

    Args:
        prob: DataFrame with a 'POS' column plus the value columns.
            # assumes rows are sorted by 'POS' -- TODO confirm
        gene_annot: DataFrame with 'gene' and numeric 'tss' columns.

    Returns:
        dict mapping each processed gene name to the value returned by
        ``compute_epigenome``.
    """
    # 2 * 57344 = 114688 = 896 * 128; presumably the 896 bins of 128 bp used
    # by the reference predictions -- TODO confirm.
    half_window = 57344
    # The covered range does not change between genes, so look it up once.
    first_pos = prob['POS'].iloc[0]
    last_pos = prob['POS'].iloc[-1]

    predictions = {}
    start_time = time.perf_counter()
    for n_seen, (_, row) in enumerate(gene_annot.iterrows(), start=1):
        gene = row['gene']
        tss = row['tss']
        start = tss - half_window
        end = tss + half_window
        if start >= first_pos and end <= last_pos:
            predictions[gene] = compute_epigenome(gene, start, end, prob)
        if n_seen % 200 == 0:
            print(n_seen, "iterations:", time.perf_counter() - start_time, "seconds")
    return predictions

In [9]:
# Run the TensorFlow implementation for one sample and save the result.
# NOTE(review): this cell used to read sample '000789FF64' but write into the
# '0007899884' directory; the output now follows the sample that is read.
# Confirm '000789FF64' is the intended sample.
sample_id = '000789FF64'
chromosome = 'chr1'

# The [:] slice copies the sample's table into memory, so the probabilities
# file can be closed before the prediction loop starts.
with h5py.File(os.path.join(probabilities_dir, f"{chromosome}_probabilities.h5"), "r") as prob_file:
    prob = pd.DataFrame(prob_file[sample_id][:], columns=columns)

predictions = run_predictions(prob, genes_df)

output_dir = os.path.join(project_dir, sample_id)
os.makedirs(output_dir, exist_ok=True)
# Build the file name from the chromosome string. Formatting the bare name
# `chr` would insert the repr of the builtin function, because no variable
# `chr` is defined.
write_h5(os.path.join(output_dir, f"{chromosome}_genes.h5"), predictions)


2024-01-10 21:51:44.193582: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38167 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:07:00.0, compute capability: 8.0
2024-01-10 21:51:44.197736: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 37815 MB memory:  -> device: 1, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:46:00.0, compute capability: 8.0
2024-01-10 21:51:44.204366: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 37755 MB memory:  -> device: 2, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:85:00.0, compute capability: 8.0
2024-01-10 21:51:44.205973: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:3 with 37755 MB memory:  -> device: 3, name: NVIDIA A100-SXM4-40GB, pci bu

In [25]:
def compute_epigenome(gene, start, end, interp):
    """Return one gene's reference array weighted by interpolated probabilities.

    Evaluates ``interp`` on 896 evenly spaced positions spanning ``start`` to
    ``end``, then multiplies each resulting row with the matching slice of the
    dataset ``gene`` stored in ``<reference_dir>/chr1_genes.h5``.

    Args:
        gene: Dataset name inside the reference file.
        start: Lower end of the window.
        end: Upper end of the window (included by ``np.linspace``).
        interp: Callable that maps an array of positions to a 2-D array with
            one row per position.

    Returns:
        Result of the batched matrix product after dropping singleton axis 1.
    """
    grid = np.linspace(start, end, 896)
    row_weights = interp(grid)[:, np.newaxis, :]

    h5_path = os.path.join(reference_dir, f"chr1_genes.h5")
    with h5py.File(h5_path, "r") as h5_in:
        gene_array = h5_in[gene][:]

    # Bring the second stored axis to the front so it pairs with the rows.
    per_position = np.transpose(gene_array, axes=(1, 0, 2))
    return np.squeeze(np.matmul(row_weights, per_position), axis=1)

# Spot check: recompute the first gene of genes_df with the scipy interpolator
# and print the result.
with h5py.File(os.path.join(probabilities_dir, f"chr1_probabilities.h5"), "r") as prob_file:
    prob = pd.DataFrame(prob_file['000789FF64'][:], columns=columns)

interp = interp1d(np.array(prob['POS']), np.array(prob.drop(columns=['POS'])), axis = 0)

# Only one gene is checked, so take the first row explicitly rather than
# breaking out of a full loop. An empty genes_df simply does nothing.
for _, row in genes_df.head(1).iterrows():
    gene = row['gene']
    tss = row['tss']
    start = tss - 57344
    end = tss + 57344
    if (start >= prob.iloc[0,0] and end <= prob.iloc[-1,0]):
        output = compute_epigenome(gene, start, end, interp)
        # NOTE(review): this overwrites the gene's entry in the `predictions`
        # dict left behind by the previous run cell -- confirm that is intended.
        predictions[gene] = output
        # Print only a freshly computed result, never a stale or undefined `output`.
        print(output)
    else:
        print(f"{gene}: window [{start}, {end}] is outside the positions covered by prob; skipped")


[[0.08767378 0.09323303 0.16627961 ... 0.00949302 0.03075832 0.03259552]
 [0.08988153 0.09385183 0.13803922 ... 0.00846795 0.02565644 0.02787809]
 [0.0828562  0.08802018 0.08945152 ... 0.00512026 0.01920758 0.01718544]
 ...
 [0.08135339 0.08957597 0.0973008  ... 0.00999105 0.03571409 0.02636748]
 [0.09193078 0.0944492  0.10272115 ... 0.00973209 0.03738421 0.02245922]
 [0.09127819 0.08846645 0.08769749 ... 0.01205442 0.0357308  0.02092708]]
