---
title: Comparing Uniform Haplotype Enformer gene expression prediction performance 
author: Sabrina Mi
date: 1/24/2024
---

## Setup

In [10]:
import pandas as pd
import numpy as np
import tensorflow as tf
import h5py
import os
import time
# Column labels applied to each individual's genotype-probability array when it
# is loaded into a DataFrame: 'POS' first, followed by eight probability
# columns (presumably one per HS founder strain -- TODO confirm). 'POS' is
# dropped before the remaining columns are averaged in compute_epigenome.
columns = ['POS', 'ACI', 'BN', 'BUF', 'F344', 'M520', 'MR', 'WKY', 'WN']

In [11]:
# Previously computed expression predictions; the row index is used as the set
# of individuals and the column labels as the set of genes.
pred_expr = pd.read_csv(
    "/home/s1mi/enformer_rat_data/output/Br_human_predicted_expression.csv",
    index_col=0,
)
individuals = pred_expr.index.tolist()

# Gene annotation table; keep only the genes that have a column in pred_expr.
gene_annot = pd.read_csv(
    "/eagle/AIHPC4Edu/sabrina/Br_predictions/HS_founder_epigenomes/gene_mapping.txt"
)
genes_df = gene_annot.loc[gene_annot['gene'].isin(pred_expr.columns)]

In [1]:
# Directory holding the per-chromosome "<chr>_probabilities.h5" genotype-probability files.
probabilities_dir = "/home/s1mi/Br_genotype_probabilities"
# Directory holding the per-chromosome "<chr>_genes.h5" reference matrices (one dataset per gene).
reference_dir = "/eagle/AIHPC4Edu/sabrina/Br_predictions/HS_founder_epigenomes/human"
# Directory where the per-chromosome "<chr>_selected_genes.csv" results are written.
output_dir = "/eagle/AIHPC4Edu/sabrina/Br_prediction_from_founders_v2"
# NOTE(review): not referenced by the code visible in this section -- confirm it is used elsewhere.
run_dir = "/home/s1mi/Github/deep-learning-in-genomics/posts/2024-01-10-epigenomexcan-vs-enformer-significant-genes"

In [21]:
def compute_epigenome(ref_matrix, start, end, prob):
    """Combine reference tracks, weighted by mean haplotype probability.

    Args:
        ref_matrix: Array-like reference data for one gene. Its first axis is
            contracted against the probability vector, so it presumably has
            one entry per non-'POS' column of ``prob`` -- TODO confirm shape.
        start: Unused; kept so existing callers keep working.
        end: Unused; kept so existing callers keep working.
        prob: DataFrame with a 'POS' column plus one probability column per
            haplotype.

    Returns:
        A float16 tensor: the tensordot (``axes=1``) of the column-wise mean
        probabilities with ``ref_matrix``.
    """
    # Average every probability column over all rows of `prob`; the position
    # column carries no probability and is excluded first.
    haplotype_weights = prob.drop(columns='POS').mean()
    weights = tf.cast(haplotype_weights, dtype=tf.float16)
    reference = tf.cast(ref_matrix, dtype=tf.float16)
    return tf.tensordot(weights, reference, axes=1)
def run_sample_predictions(prob, reference_file, gene_annot):
    """Predict one expression value per gene for a single individual.

    Args:
        prob: DataFrame of the individual's genotype probabilities (a 'POS'
            column plus one column per haplotype), passed through to
            ``compute_epigenome``.
        reference_file: Path to an HDF5 file containing one dataset per gene,
            keyed by gene identifier.
        gene_annot: DataFrame with at least 'gene' and 'tss' columns.

    Returns:
        A list with one value per row of ``gene_annot``, in row order: the
        average of ``output[446:450, 4980]`` of the combined epigenome.
    """
    predictions = []
    if len(gene_annot) == 0:
        # Nothing to predict; avoid touching the reference file at all.
        return predictions
    # Open the reference file once for all genes rather than once per gene.
    with h5py.File(reference_file, "r") as hf:
        for gene, tss in zip(gene_annot['gene'], gene_annot['tss']):
            # Window of +/- 57344 bp around the TSS (presumably half of the
            # Enformer output window -- confirm). compute_epigenome currently
            # ignores these bounds.
            start = tss - 57344
            end = tss + 57344
            ref_matrix = hf[gene][:]
            output = compute_epigenome(ref_matrix, start, end, prob)
            # Rows 446-449 of column 4980: presumably the central bins of a
            # single output track -- TODO confirm which track this is.
            predictions.append(np.average(output[446:450, 4980]))
    return predictions

### Compute Haplotype-Combination Gene Expression

In [22]:
# For every chromosome, compute the haplotype-combination expression of each
# selected gene for each individual and write the result to a per-chromosome
# CSV. If the CSV already exists, only individuals whose rows still contain
# missing values are recomputed.
# `chrom` / `prob_h5` are used as names so the `chr` and `input` builtins are
# not shadowed, and `pending_individuals` keeps the module-level `individuals`
# list intact.
for chrom, group in genes_df.groupby("chromosome"):
    start_time = time.perf_counter()
    reference_file = os.path.join(reference_dir, f"{chrom}_genes.h5")
    probabilities_file = os.path.join(probabilities_dir, f"{chrom}_probabilities.h5")
    output_file = os.path.join(output_dir, f"{chrom}_selected_genes.csv")
    if os.path.exists(output_file):
        # Resume: rows without any NaN are treated as already completed.
        haplo_expr = pd.read_csv(output_file, index_col=0)
        completed_individuals = haplo_expr.dropna().index
        pending_individuals = pred_expr.index.difference(completed_individuals)
    else:
        # Fresh start: an empty (all-NaN) table of individuals x genes.
        haplo_expr = pd.DataFrame(columns=group['gene'], index=pred_expr.index)
        pending_individuals = haplo_expr.index
    with h5py.File(probabilities_file, "r") as prob_h5:
        for individual in pending_individuals:
            prob = pd.DataFrame(prob_h5[individual][:], columns=columns)
            predictions = run_sample_predictions(prob, reference_file, group)
            haplo_expr.loc[individual] = predictions
    end_time = time.perf_counter()
    # Elapsed wall-clock time is end minus start (the reverse is negative).
    print(len(group), (end_time - start_time) / 60, "minutes")
    haplo_expr.to_csv(output_file)