---
title: Comparing EpigenomeXcan and Enformer gene expression prediction performance 
author: Sabrina Mi
date: 1/10/2024
---

In [18]:
import pandas as pd
import numpy as np
import h5py
import os
import time
columns = ['POS', 'ACI', 'BN', 'BUF', 'F344', 'M520', 'MR', 'WKY', 'WN']

In [16]:
pred_expr = pd.read_csv("/home/s1mi/enformer_rat_data/output/Br_human_predicted_expression.csv", index_col=0)
obs_expr = pd.read_csv("/home/s1mi/enformer_rat_data/output/Br_observed_expression.csv", index_col=0)
gene_list = list(pred_expr.columns)
gene_annot = pd.read_csv("/eagle/AIHPC4Edu/sabrina/Br_predictions/HS_founder_epigenomes/gene_mapping.txt")
genes_df = gene_annot[gene_annot['gene'].isin(gene_list)]

In [24]:
probabilities_dir = "/home/s1mi/Br_genotype_probabilities"
reference_dir = "/eagle/AIHPC4Edu/sabrina/Br_predictions/HS_founder_epigenomes/human"
# project_dir = "/eagle/AIHPC4Edu/sabrina/Br_predictions/Br_epigenomes"

In [28]:
def compute_epigenome(chr, gene, start, end, prob):
    bins = np.linspace(start, end, 896)
    scaled_prob = []
    for column in columns[1:]: 
        scaled_prob.append(np.interp(bins, prob['POS'], prob[column]))
    pr_matrix = np.transpose(np.array(scaled_prob))
    with h5py.File(os.path.join(reference_dir, f"{chr}_genes.h5"), "r") as hf:
        matrix = hf[gene][:]
    ref_matrix = np.transpose(matrix, axes=(1, 0, 2))
    output = np.squeeze(pr_matrix[:, np.newaxis,:] @ ref_matrix, 1)
    return output
    

def run_predictions(chr, prob, gene_annot):
    predictions = {}
    # interp = interpolate.interp1d(np.array(prob['POS']), np.array(prob.drop(columns=['POS'])), axis = 0)
    for index, row in gene_annot.iterrows():
        gene = row['gene']
        tss = row['tss']
        start = tss - 57344
        end = tss + 57344
        # if (start >= prob.iloc[0,0] and end <= prob.iloc[-1,0]):
        output = compute_epigenome(chr, gene, start, end, prob)
        # Process genotype probabilities
        predictions[gene] = output

    return predictions

In [29]:
haplo_expr = pd.DataFrame(columns=pred_expr.columns, index=pred_expr.index)
for chr, group in genes_df.groupby("chromosome"):
    with h5py.File(os.path.join(probabilities_dir, f"{chr}_probabilities.h5"), "r") as input:
        for individual in pred_expr.index:
            prob = pd.DataFrame(input[individual][:], columns=columns)
            predictions = run_predictions('chr1', prob, genes_df)
            print(predictions)
            break
    break

KeyError: "Unable to open object (object 'ENSRNOG00000016580' doesn't exist)"

In [None]:




with h5py.File(os.path.join(probabilities_dir, f"chr1_probabilities.h5"), "r") as input:
        output_dir = os.path.join(project_dir, '000789972A')
        # if not os.path.isdir(output_dir):
        #     os.makedirs(output_dir)
        prob = pd.DataFrame(input['000789972A'][:], columns=columns)
        predictions = run_predictions('chr1', prob, genes_df)
        write_h5(os.path.join(output_dir, f"{chr}_genes.h5"), predictions) 
# print(f"{chr}:", len(group), "genes...", (end_time - start_time) // 60, "minutes")