---
title: "Scaling up EpigenomeXcan: execution time, parallelization"
author: Sabrina Mi
date: 1/3/2024
---

**Takeaways**
 
* GPU prediction is slightly faster on the test run (3000 predictions); we will need to use TensorFlow for larger experiments across all 340 individuals.
* GPU memory is finicky: I had to pare down tensors to float16 and clear the memory storing epigenome matrices. In practice, I'll have to write outputs more frequently.

## Setup

In [2]:
import numpy as np
import pandas as pd
from scipy.interpolate import interp1d
import h5py
import os
import time
# Column labels for the genotype-probability table: the position column
# first, then eight further columns (presumably one per founder strain --
# confirm). Every column after 'POS' is interpolated onto the prediction bins.
columns = [
    'POS',
    'ACI', 'BN', 'BUF', 'F344',
    'M520', 'MR', 'WKY', 'WN',
]

In [3]:
# Build the gene -> (chromosome, tss) table from two line-aligned text files:
# one gene name per line and one "_"-separated interval string per line.
with open("/home/s1mi/Github/deep-learning-in-genomics/posts/2023-11-07-HS-founder-epigenomes/Br_expressed_genes.txt", "r") as genes_file:
    gene_list = genes_file.read().splitlines()
with open("/home/s1mi/Github/deep-learning-in-genomics/posts/2023-11-07-HS-founder-epigenomes/metadata/intervals.txt", "r") as intervals_file:
    intervals = intervals_file.read().splitlines()

mapping = pd.DataFrame({"gene": gene_list, "interval": intervals})

# The first "_"-separated field is stored as the chromosome, the second as the
# TSS. Both are still strings at this point.
split_intervals = mapping['interval'].str.split('_')
mapping['chromosome'] = split_intervals.str.get(0)
mapping['tss'] = split_intervals.str.get(1)

# Persist the table without the raw interval string.
gene_table = mapping.drop(columns=['interval'])
gene_table.to_csv("/eagle/AIHPC4Edu/sabrina/Br_predictions/HS_founder_epigenomes/gene_mapping.txt", index=False)


In [4]:
# Locations used by every benchmark below: per-chromosome genotype
# probabilities, per-chromosome reference matrices, and the output root.
probabilities_dir = "/home/s1mi/Br_genotype_probabilities"
reference_dir = "/eagle/AIHPC4Edu/sabrina/Br_predictions/HS_founder_epigenomes/human"
project_dir = "/eagle/AIHPC4Edu/sabrina/Br_predictions/Br_epigenomes"

# Read the list of individuals, one entry per line.
with open("/eagle/AIHPC4Edu/sabrina/Br_predictions/HS_founder_epigenomes/Br_samples.txt", "r") as samples_file:
    individuals = samples_file.read().splitlines()

# Reload the gene table from disk and keep only the chromosome 1 genes, which
# are the ones used in the timing runs.
mapping = pd.read_csv("/eagle/AIHPC4Edu/sabrina/Br_predictions/HS_founder_epigenomes/gene_mapping.txt")
genes_df = mapping.loc[mapping['chromosome'] == 'chr1']


In [4]:
def write_h5(path, predictions):
    """Write every entry of ``predictions`` into an HDF5 file at ``path``.

    The file is opened in ``"w"`` mode, so an existing file at ``path`` is
    replaced. Each key of the mapping becomes an object name in the file and
    the matching value is stored under that name.

    Args:
        path: Destination file path.
        predictions: Mapping of name -> array-like value.
    """
    with h5py.File(path, "w") as h5_out:
        for name in predictions:
            h5_out[name] = predictions[name]

## Numpy Test

In [5]:
def compute_epigenome(chr, gene, start, end, prob):
    """Weight a gene's reference matrix by interpolated genotype probabilities.

    Args:
        chr: Chromosome label; selects ``{chr}_genes.h5`` in ``reference_dir``.
        gene: Name of the dataset to read from that file.
        start: First position of the window.
        end: Last position of the window.
        prob: Table with a 'POS' column plus the columns in ``columns[1:]``.

    Returns:
        The batched product of the interpolated probabilities and the
        reference matrix, with the singleton middle axis removed.
    """
    # 896 evenly spaced positions covering [start, end], endpoints included.
    bins = np.linspace(start, end, 896)

    # Interpolate each non-position column onto the bins, then transpose so
    # that rows are bins and columns follow the order of `columns[1:]`.
    positions = prob['POS']
    per_column = [np.interp(bins, positions, prob[name]) for name in columns[1:]]
    pr_matrix = np.array(per_column).T

    reference_path = os.path.join(reference_dir, f"{chr}_genes.h5")
    with h5py.File(reference_path, "r") as hf:
        matrix = hf[gene][:]
    # Swap the first two axes so the leading axis lines up with the bins.
    # Assumes the stored dataset is (n_columns, 896, n_tracks) -- TODO confirm.
    ref_matrix = matrix.transpose(1, 0, 2)

    # Per-bin product of a (1, n_columns) row with that bin's reference slice.
    product = np.expand_dims(pr_matrix, 1) @ ref_matrix
    return product.squeeze(axis=1)
    

def run_predictions(chr, prob, gene_annot):
    """Run ``compute_epigenome`` for every gene in ``gene_annot`` and time it.

    Each gene is evaluated on the window ``[tss - 57344, tss + 57344]``.
    Elapsed time is printed after every 200 genes processed.

    Args:
        chr: Chromosome label, forwarded to ``compute_epigenome``.
        prob: Genotype-probability table, forwarded to ``compute_epigenome``.
        gene_annot: DataFrame with 'gene' and 'tss' columns.

    Returns:
        dict mapping gene name -> output of ``compute_epigenome``.
    """
    predictions = {}
    start_time = time.perf_counter()
    # Count processed rows explicitly: the labels yielded by iterrows() are
    # whatever index the (possibly filtered) frame carries, not a 0-based
    # position, so they cannot be relied on for progress reporting.
    for count, (_, row) in enumerate(gene_annot.iterrows(), start=1):
        gene = row['gene']
        tss = row['tss']
        start = tss - 57344
        end = tss + 57344
        # NOTE: no range check against the positions in `prob` is applied
        # here; np.interp holds the end values constant outside the sampled
        # range instead of raising.
        predictions[gene] = compute_epigenome(chr, gene, start, end, prob)
        if count % 200 == 0:
            elapsed = time.perf_counter() - start_time
            print(count, "iterations:", elapsed, "seconds")

    return predictions

In [11]:
# Benchmark the NumPy implementation on one individual for chromosome 1 and
# write the predictions to that individual's output directory.
chromosome = 'chr1'
individual = '0007899884'
with h5py.File(os.path.join(probabilities_dir, f"{chromosome}_probabilities.h5"), "r") as prob_file:
    output_dir = os.path.join(project_dir, individual)
    os.makedirs(output_dir, exist_ok=True)
    prob = pd.DataFrame(prob_file[individual][:], columns=columns)
    predictions = run_predictions(chromosome, prob, genes_df)
    # The file name must be built from the chromosome label. At module scope
    # the bare name `chr` is the builtin function, which would format as
    # "<built-in function chr>_genes.h5".
    write_h5(os.path.join(output_dir, f"{chromosome}_genes.h5"), predictions)
# print(f"{chr}:", len(group), "genes...", (end_time - start_time) // 60, "minutes")

200 iterations: 43.178196704000584 seconds
400 iterations: 85.30787616000089 seconds
600 iterations: 127.37160777700046 seconds
800 iterations: 170.39981612300107 seconds
1000 iterations: 214.97213824600112 seconds
1200 iterations: 258.91626574399925 seconds
1400 iterations: 302.4928128120009 seconds
1600 iterations: 346.4003939719987 seconds
1800 iterations: 389.48382942399985 seconds
2000 iterations: 433.0845548489997 seconds
2200 iterations: 476.6690443709995 seconds
2400 iterations: 546.7537841170015 seconds
2600 iterations: 592.0861903549994 seconds
2800 iterations: 658.7331474230014 seconds
3000 iterations: 718.2645332190004 seconds


## Scipy Test 

This test uses scipy's `interp1d` function to interpolate all 8 columns simultaneously. Generating the interpolation function costs about 20 seconds of runtime.

In [6]:
def compute_epigenome(gene, start, end, interp):
    """Weight a chr1 gene's reference matrix by interpolated probabilities.

    Args:
        gene: Name of the dataset to read from ``chr1_genes.h5``.
        start: First position of the window.
        end: Last position of the window.
        interp: Callable that maps an array of positions to the interpolated
            probability rows (one row per position).

    Returns:
        The batched product of the interpolated probabilities and the
        reference matrix, with the singleton middle axis removed.
    """
    # 896 evenly spaced positions covering [start, end], endpoints included.
    bins = np.linspace(start, end, 896)
    pr_matrix = interp(bins)

    reference_path = os.path.join(reference_dir, "chr1_genes.h5")
    with h5py.File(reference_path, "r") as hf:
        matrix = hf[gene][:]
    # Swap the first two axes so the leading axis lines up with the bins.
    ref_matrix = matrix.transpose(1, 0, 2)

    # Per-bin product of a single probability row with that bin's slice.
    product = np.expand_dims(pr_matrix, 1) @ ref_matrix
    return product.squeeze(axis=1)
def run_predictions(prob, gene_annot):
    """Run the SciPy-based ``compute_epigenome`` for each gene and time it.

    One ``interp1d`` interpolator covering every non-'POS' column is built up
    front and reused for all genes. Genes whose window
    ``[tss - 57344, tss + 57344]`` is not fully inside the positions sampled
    in ``prob`` are skipped. Elapsed time (measured from after the
    interpolator is built) is printed after every 200 rows visited.

    Args:
        prob: DataFrame whose first column is 'POS'; remaining columns are
            the values to interpolate.
        gene_annot: DataFrame with 'gene' and 'tss' columns.

    Returns:
        dict mapping gene name -> output of ``compute_epigenome`` for the
        genes that were not skipped.
    """
    predictions = {}
    interp = interp1d(np.array(prob['POS']), np.array(prob.drop(columns=['POS'])), axis=0)
    # The sampled range does not change inside the loop, so look it up once.
    first_pos = prob.iloc[0, 0]
    last_pos = prob.iloc[-1, 0]
    start_time = time.perf_counter()
    # Count rows explicitly: the labels yielded by iterrows() are whatever
    # index the (possibly filtered) frame carries, not a 0-based position.
    for count, (_, row) in enumerate(gene_annot.iterrows(), start=1):
        gene = row['gene']
        tss = row['tss']
        start = tss - 57344
        end = tss + 57344
        # With its default settings interp1d raises on out-of-range queries,
        # so windows that leave the sampled range are skipped.
        if start >= first_pos and end <= last_pos:
            predictions[gene] = compute_epigenome(gene, start, end, interp)
        # Report outside the range check so a skipped row cannot swallow a
        # progress line.
        if count % 200 == 0:
            elapsed = time.perf_counter() - start_time
            print(count, "iterations:", elapsed, "seconds")
    return predictions

In [18]:
# Benchmark the SciPy implementation on one individual for chromosome 1.
# The handle is not called `input`, which would shadow the builtin for the
# rest of the session.
with h5py.File(os.path.join(probabilities_dir, "chr1_probabilities.h5"), "r") as prob_file:
    prob = pd.DataFrame(prob_file['000789972A'][:], columns=columns)
    predictions = run_predictions(prob, genes_df)
    # Writing is disabled for this timing run. NOTE: `output_dir` is not set
    # in this cell; define it for this individual before re-enabling.
    # write_h5(os.path.join(output_dir, "chr1_genes.h5"), predictions)


200 iterations: 55.26515249814838 seconds
400 iterations: 105.58576967194676 seconds
600 iterations: 136.04839484021068 seconds
800 iterations: 193.6581759629771 seconds
1000 iterations: 254.47129005938768 seconds
1200 iterations: 305.2362718908116 seconds
1400 iterations: 355.68491036072373 seconds
1600 iterations: 417.03871417138726 seconds
1800 iterations: 469.593163177371 seconds
2000 iterations: 522.8375322008505 seconds
2200 iterations: 576.9442004561424 seconds
2400 iterations: 629.3138183737174 seconds
2600 iterations: 687.7609961153939 seconds
2800 iterations: 739.739781155251 seconds
3000 iterations: 797.2053201990202 seconds


## Tensorflow Test (GPU)

Neither the tensorflow nor the torch library offers interpolation for the purpose I need, so I wrote my own interpolation function in order to keep all of the computational steps as tensorflow operations.

In [6]:
import tensorflow as tf
# Show which GPUs TensorFlow can see before running the GPU benchmark.
gpu_devices = tf.config.list_physical_devices('GPU')
print(gpu_devices)

2024-01-18 00:52:06.613129: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:3', device_type='GPU')]


In [7]:
def interpolate(x, xi, y):
    """Linearly interpolate ``y`` (sampled at ``x``) at the query points ``xi``.

    Written entirely with TensorFlow ops. Each query is bracketed by two
    neighbouring entries of ``x`` and the matching entries of ``y`` are
    blended linearly.

    Assumes ``x`` and ``xi`` are 1-D and sorted ascending, and that the first
    axis of ``y`` matches ``x`` -- the rank arithmetic below relies on this;
    TODO confirm. No handling for queries below ``x[0]`` or above ``x[-1]``
    is visible here; the callers in this file range-check the window before
    calling.

    Args:
        x: Sample positions.
        xi: Query positions.
        y: Sample values; may carry one more axis than ``x``.

    Returns:
        Interpolated values, one entry (or row) per element of ``xi``.
    """
    # Sort samples and queries together; j holds the sorting permutation.
    j   =   tf.argsort(tf.concat((x, xi), axis=-1))
    k   =   tf.range(len(j))
    # Invert the permutation: q[m] is the sorted rank of combined element m.
    q   =   tf.scatter_nd(j[:, tf.newaxis], k, k.shape)

    lxi =   len(xi)

    # Rank of each query (the last lxi combined entries) minus its own order
    # among the queries. For ascending xi this is the number of samples
    # sorted ahead of it, i.e. the index of its right-hand neighbour in x.
    r   =   q[-lxi:]-tf.range(0, lxi)
    # Queries equal to the last sample get their index from the rank of the
    # final query instead (presumably to keep r inside x -- confirm).
    r   =   tf.where(xi == x[-1], q[-1:] - lxi, r)

    # Bracketing samples: (x1, y1) on the left, (x2, y2) on the right.
    x2  =   tf.gather(x, r)
    x1  =   tf.gather(x, r-1)
    y2  =   tf.gather(y, r)
    y1  =   tf.gather(y, r-1)

    # Fractional distance of each query between its two neighbours.
    u   =   (xi-x1)/(x2-x1)
    # Add a trailing axis so u broadcasts across the extra axis of y.
    if not tf.rank(u) == tf.rank(y1):
        u   =   tf.expand_dims(u, axis=-1)

    yi  =   (1.0-u)*y1 + u*y2

    # Where both neighbours carry the same value, return it directly rather
    # than the blended result.
    return tf.where(y1 == y2, y1, yi)

In [9]:
def compute_epigenome(gene, start, end, prob):
    """TensorFlow version: weight a chr1 gene's reference matrix by
    interpolated genotype probabilities, computing in float32.

    Args:
        gene: Name of the dataset to read from ``chr1_genes.h5``.
        start: First position of the window.
        end: Last position of the window.
        prob: DataFrame with a 'POS' column plus the probability columns.

    Returns:
        Tensor holding the batched product with the singleton axis removed.
    """
    # NOTE(review): the whole probability table is converted to tensors on
    # every call.
    positions = tf.constant(prob['POS'].values)
    probabilities = tf.constant(prob.drop(columns=['POS']).values)

    # 896 evenly spaced positions covering [start, end].
    bins = tf.linspace(start, end, 896)
    interpolated = interpolate(positions, bins, probabilities)
    pr_tensor = tf.expand_dims(tf.cast(interpolated, dtype=tf.float32), axis=1)

    reference_path = os.path.join(reference_dir, "chr1_genes.h5")
    with h5py.File(reference_path, "r") as hf:
        matrix = hf[gene][:]
    # Swap the first two axes so the leading axis lines up with the bins.
    ref_tensor = tf.transpose(matrix, perm=[1, 0, 2])

    product = tf.matmul(pr_tensor, ref_tensor)
    return tf.squeeze(product, axis=1)
def run_predictions(prob, gene_annot):
    """Run the TensorFlow ``compute_epigenome`` for each gene and time it.

    Genes whose window ``[tss - 57344, tss + 57344]`` is not fully inside the
    positions sampled in ``prob`` are skipped. Elapsed time is printed after
    every 200 rows visited. Every output stays referenced by the returned
    dict for the whole run.

    Args:
        prob: DataFrame whose first column is 'POS'; remaining columns are
            the probability values.
        gene_annot: DataFrame with 'gene' and 'tss' columns.

    Returns:
        dict mapping gene name -> output of ``compute_epigenome`` for the
        genes that were not skipped.
    """
    predictions = {}
    # The sampled range does not change inside the loop, so look it up once.
    first_pos = prob.iloc[0, 0]
    last_pos = prob.iloc[-1, 0]
    start_time = time.perf_counter()
    # Count rows explicitly: the labels yielded by iterrows() are whatever
    # index the (possibly filtered) frame carries, not a 0-based position.
    for count, (_, row) in enumerate(gene_annot.iterrows(), start=1):
        gene = row['gene']
        tss = row['tss']
        start = tss - 57344
        end = tss + 57344
        if start >= first_pos and end <= last_pos:
            predictions[gene] = compute_epigenome(gene, start, end, prob)
        # Report outside the range check so a skipped row cannot swallow a
        # progress line.
        if count % 200 == 0:
            elapsed = time.perf_counter() - start_time
            print(count, "iterations:", elapsed, "seconds")
    return predictions

In [10]:
# Benchmark the TensorFlow (float32) implementation on one individual.
# One `individual` value drives both the dataset that is read and the output
# directory, so results cannot end up under another individual's directory.
individual = '000789FF64'
with h5py.File(os.path.join(probabilities_dir, "chr1_probabilities.h5"), "r") as prob_file:
    output_dir = os.path.join(project_dir, individual)
    os.makedirs(output_dir, exist_ok=True)
    prob = pd.DataFrame(prob_file[individual][:], columns=columns)
    predictions = run_predictions(prob, genes_df)
    # Writing is disabled for this timing run.
    # write_h5(os.path.join(output_dir, "chr1_genes.h5"), predictions)


2024-01-18 00:35:52.561833: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38167 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:07:00.0, compute capability: 8.0
2024-01-18 00:35:52.563078: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 38167 MB memory:  -> device: 1, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:46:00.0, compute capability: 8.0
2024-01-18 00:35:52.564128: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 38167 MB memory:  -> device: 2, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:85:00.0, compute capability: 8.0
2024-01-18 00:35:52.565182: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:3 with 38167 MB memory:  -> device: 3, name: NVIDIA A100-SXM4-40GB, pci bu

200 iterations: 47.01733132800291 seconds
400 iterations: 88.90143480299957 seconds
600 iterations: 129.76665228099955 seconds
800 iterations: 172.2306377370005 seconds
1000 iterations: 216.09265120500277 seconds
1200 iterations: 259.2315747570028 seconds
1400 iterations: 302.6479036160017 seconds
1600 iterations: 350.47071519699966 seconds
1800 iterations: 395.40081796199956 seconds
2000 iterations: 436.0582850840001 seconds


2024-01-18 00:43:34.340681: W tensorflow/tsl/framework/bfc_allocator.cc:485] Allocator (GPU_0_bfc) ran out of memory trying to allocate 145.28MiB (rounded to 152334336)requested by op Transpose
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2024-01-18 00:43:34.340731: I tensorflow/tsl/framework/bfc_allocator.cc:1039] BFCAllocator dump for GPU_0_bfc
2024-01-18 00:43:34.340742: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (256): 	Total Chunks: 7, Chunks in use: 7. 1.8KiB allocated for chunks. 1.8KiB in use in bin. 33B client-requested in use in bin.
2024-01-18 00:43:34.340749: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (512): 	Total Chunks: 1, Chunks in use: 0. 512B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2024-01-18 00:43:34.340755: I tensorflow/tsl/framework/bfc_allocator.cc:1

ResourceExhaustedError: {{function_node __wrapped__Transpose_device_/job:localhost/replica:0/task:0/device:GPU:0}} OOM when allocating tensor with shape[896,8,5313] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:Transpose]

In [8]:
def compute_epigenome(gene, start, end, prob):
    """TensorFlow version with both matmul operands cast to float16.

    Args:
        gene: Name of the dataset to read from ``chr1_genes.h5``.
        start: First position of the window.
        end: Last position of the window.
        prob: DataFrame with a 'POS' column plus the probability columns.

    Returns:
        float16 tensor holding the batched product with the singleton axis
        removed.
    """
    # NOTE(review): the whole probability table is converted to tensors on
    # every call.
    positions = tf.constant(prob['POS'].values)
    probabilities = tf.constant(prob.drop(columns=['POS']).values)

    # 896 evenly spaced positions covering [start, end].
    bins = tf.linspace(start, end, 896)
    interpolated = interpolate(positions, bins, probabilities)
    pr_tensor = tf.expand_dims(interpolated, axis=1)

    reference_path = os.path.join(reference_dir, "chr1_genes.h5")
    with h5py.File(reference_path, "r") as hf:
        matrix = hf[gene][:]
    # Swap the first two axes so the leading axis lines up with the bins.
    ref_tensor = tf.transpose(matrix, perm=[1, 0, 2])

    # Halve the footprint of both operands before multiplying.
    pr_half = tf.cast(pr_tensor, dtype=tf.float16)
    ref_half = tf.cast(ref_tensor, dtype=tf.float16)
    return tf.squeeze(tf.matmul(pr_half, ref_half), axis=1)
def run_predictions(prob, gene_annot, flush_every=1000, on_flush=None):
    """Run the TensorFlow ``compute_epigenome`` for each gene, dropping the
    accumulated outputs periodically to free memory.

    Genes whose window ``[tss - 57344, tss + 57344]`` is not fully inside the
    positions sampled in ``prob`` are skipped. Elapsed time is printed after
    every 200 rows visited.

    WARNING: after every ``flush_every`` rows the accumulated dict is
    replaced by an empty one, so the return value holds only the genes
    processed since the last flush. Pass ``on_flush`` to save each batch
    before it is dropped.

    Args:
        prob: DataFrame whose first column is 'POS'; remaining columns are
            the probability values.
        gene_annot: DataFrame with 'gene' and 'tss' columns.
        flush_every: Number of rows between flushes; a falsy value disables
            flushing. Defaults to 1000.
        on_flush: Optional callable invoked with the current predictions
            dict immediately before it is dropped.

    Returns:
        dict mapping gene name -> output of ``compute_epigenome`` for the
        genes processed since the last flush.
    """
    predictions = {}
    # The sampled range does not change inside the loop, so look it up once.
    first_pos = prob.iloc[0, 0]
    last_pos = prob.iloc[-1, 0]
    start_time = time.perf_counter()
    # Count rows explicitly: the labels yielded by iterrows() are whatever
    # index the (possibly filtered) frame carries, not a 0-based position.
    for count, (_, row) in enumerate(gene_annot.iterrows(), start=1):
        gene = row['gene']
        tss = row['tss']
        start = tss - 57344
        end = tss + 57344
        if start >= first_pos and end <= last_pos:
            predictions[gene] = compute_epigenome(gene, start, end, prob)
        # Report and flush outside the range check so a skipped row cannot
        # swallow a progress line or postpone a flush.
        if count % 200 == 0:
            elapsed = time.perf_counter() - start_time
            print(count, "iterations:", elapsed, "seconds")
        if flush_every and count % flush_every == 0:
            if on_flush is not None:
                on_flush(predictions)
            # Drop the references so the stored outputs can be released.
            predictions = {}
    return predictions

In [9]:
# Benchmark the TensorFlow (float16) implementation on one individual.
# One `individual` value drives both the dataset that is read and the output
# directory, so results cannot end up under another individual's directory.
individual = '000789FF64'
with h5py.File(os.path.join(probabilities_dir, "chr1_probabilities.h5"), "r") as prob_file:
    output_dir = os.path.join(project_dir, individual)
    os.makedirs(output_dir, exist_ok=True)
    prob = pd.DataFrame(prob_file[individual][:], columns=columns)
    predictions = run_predictions(prob, genes_df)

2024-01-18 00:52:23.838247: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38167 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:07:00.0, compute capability: 8.0
2024-01-18 00:52:23.839500: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 38167 MB memory:  -> device: 1, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:46:00.0, compute capability: 8.0
2024-01-18 00:52:23.840547: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 38167 MB memory:  -> device: 2, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:85:00.0, compute capability: 8.0
2024-01-18 00:52:23.841603: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:3 with 38167 MB memory:  -> device: 3, name: NVIDIA A100-SXM4-40GB, pci bu

200 iterations: 47.26202517999991 seconds
400 iterations: 88.73585744500087 seconds
600 iterations: 132.8569780329999 seconds
800 iterations: 180.7470046210001 seconds
1000 iterations: 228.1653065870014 seconds
1200 iterations: 271.74169106200134 seconds
1400 iterations: 318.2701013930018 seconds
1600 iterations: 365.416816363002 seconds
1800 iterations: 408.23091432 seconds
2000 iterations: 456.0066568750008 seconds
2200 iterations: 500.70750081000006 seconds
2400 iterations: 544.8922489830002 seconds
2600 iterations: 592.1257908199987 seconds
2800 iterations: 637.4825293079994 seconds
3000 iterations: 684.2177478490012 seconds
