---
title: Personalized Prediction Correlations for 868 rn7 genes
date: 9/1/23
author: Sabrina Mi
---

We ran the personalized Enformer pipeline on 868 genes selected for high variation in Br rats and number of eQTLs. There are too many predictions to concatenate iteratively, so I wrote [collect_predictions.py](collect_predictions.py) to parallelize the work in a submitted job.

Note: This script appends predictions to the h5 file in parallel, so I first initialized the h5 file with the list of individuals, which is to be paired with all gene expression datasets.

In [None]:
import h5py
import pandas as pd
# Folder that holds the personalized Enformer outputs for the selected genes.
project_dir = "/home/s1mi/Br_predictions/predictions_folder/personalized_Br_selected_genes"

# Only the header is needed here, so a single data row is read.
obs_gene_expr = pd.read_csv(
    "/home/s1mi/enformer_rat_data/expression_data/Brain.rn7.expr.tpm.bed",
    sep="\t",
    nrows=1,
)
# Skip the first four columns (presumably the BED coordinate/gene_id
# columns -- TODO confirm); what remains is stored as the individuals list.
individuals = obs_gene_expr.columns.to_list()[4:]

# Mode "w" creates the file from scratch, truncating any existing file.
h5_path = f"{project_dir}/selected_genes_mouse_and_human_predictions.h5"
with h5py.File(h5_path, "w") as file:
    file.attrs["index"] = individuals

After my parsl job was stuck in the queue for two days, I decided to forgo parsl and append the predictions from the login node. This took many interrupted attempts, but was sadly the better option. The non-parallelized code is below:

In [7]:
#### LIBRARIES AND DEFINITIONS
import pandas as pd
import numpy as np
import h5py
# Output folder and the per-individual Enformer prediction folder beneath it.
project_dir = "/home/s1mi/Br_predictions/predictions_folder/personalized_Br_selected_genes"
predictions_dir = f"{project_dir}/predictions_2023-09-02/enformer_predictions"

# Observed expression table, indexed by the 'gene_id' column.
obs_gene_expr = pd.read_csv(
    "/home/s1mi/enformer_rat_data/expression_data/Brain.rn7.expr.tpm.bed",
    sep="\t",
    header=0,
    index_col="gene_id",
)
# Gene annotation table, indexed by the 'geneId' column.
annot_df = pd.read_csv(
    "/home/s1mi/enformer_rat_data/annotation/rn7.gene.txt",
    sep="\t",
    header=0,
    index_col="geneId",
)

#### GET LIST OF GENES
# One gene ID per line, read from the current working directory.
with open("gene_list.txt", "r") as file:
    gene_list_text = file.read()
gene_list = gene_list_text.splitlines()
print(f"{len(gene_list)} genes with enformer predictions")

#### JOIN WITH ENFORMER PREDICTIONS FUNCTION
def collect_predictions(gene):
    """Append one gene's observed and predicted expression to the output h5.

    Builds a table with one row per individual and the columns "observed",
    "human predicted" and "mouse predicted", then stores it in the output
    file under the key ``gene``. If that key already exists the function
    returns without doing anything, so an interrupted run can be resumed.

    Relies on the module-level ``project_dir``, ``predictions_dir``,
    ``obs_gene_expr`` and ``annot_df``.

    Args:
        gene: Gene ID; used as a row label in ``obs_gene_expr`` and
            ``annot_df`` and as the dataset name in the output file.

    Returns:
        None.
    """
    with h5py.File(f"{project_dir}/selected_genes_mouse_and_human_predictions.h5", "a") as output_file:
        # Skip genes that were written by an earlier (possibly interrupted) run.
        if gene in output_file:
            return

        ### OBSERVED EXPRESSION
        # Drop the first three entries of the row (presumably the BED
        # coordinate columns -- TODO confirm); the rest is one value per individual.
        observed = obs_gene_expr.loc[gene][3:].astype("float32")
        expr_df = pd.DataFrame({"observed": observed})

        ### LOCATE THE PREDICTION FILES
        gene_annot = annot_df.loc[gene]
        tss = gene_annot['tss']
        # File names use the TSS as both interval start and interval end.
        interval = f"chr{gene_annot['chromosome']}_{tss}_{tss}"

        ### READ PREDICTIONS
        # One (human, mouse) pair per individual, in the order of expr_df.index.
        # Each value is the mean over rows 446-449 of a single column
        # (4980 for "human", 1300 for "mouse"); presumably the central bins
        # of one output track -- TODO confirm.
        paired_predictions = []
        for individual in expr_df.index:
            prediction_path = f"{predictions_dir}/{individual}/haplotype0/{interval}_predictions.h5"
            with h5py.File(prediction_path, "r") as input_file:
                human_mean = np.average(input_file["human"][446:450, 4980])
                mouse_mean = np.average(input_file["mouse"][446:450, 1300])
            paired_predictions.append((human_mean, mouse_mean))

        ### JOIN IN DATAFRAME
        expr_df["human predicted"] = [human for human, _ in paired_predictions]
        expr_df["mouse predicted"] = [mouse for _, mouse in paired_predictions]

        ### WRITE TO h5
        output_file[gene] = expr_df


#### JOIN SEQUENTIALLY ACROSS GENES
# Genes are processed one at a time. collect_predictions skips any gene that
# is already in the output h5, so this loop can be re-run after an interruption.
for gene in gene_list:
    collect_predictions(gene)
# Report the number of genes, not the character length of the last gene ID.
print("Finished writing expression matrices for", len(gene_list), "genes")

  obs_gene_expr = pd.read_csv("/home/s1mi/enformer_rat_data/expression_data/Brain.rn7.expr.tpm.bed", sep="\t", header=0, index_col='gene_id')
  annot_df = pd.read_csv("/home/s1mi/enformer_rat_data/annotation/rn7.gene.txt", sep="\t", header= 0, index_col='geneId')


868 genes with enformer predictions


KeyboardInterrupt: 

## Read Processed Predictions

In [6]:
import h5py
import pandas as pd
project_dir = "/home/s1mi/Br_predictions/predictions_folder/personalized_Br_selected_genes"

# Maps each gene stored in the h5 file to a DataFrame of its expression matrix.
expr_dict = {}
matrix_columns = ["observed", "human predicted", "mouse predicted"]
with h5py.File(f"{project_dir}/selected_genes_mouse_and_human_predictions.h5", "r") as file:
    # Row labels shared by every gene's matrix, stored as a file attribute.
    index = file.attrs["index"]
    for gene, dataset in file.items():
        # dataset[:] loads the full dataset into memory before the file closes.
        expr_dict[gene] = pd.DataFrame(dataset[:], index=index, columns=matrix_columns)

In [None]:
# Correlation between observed expression and each Enformer head's prediction,
# one row per gene. Relies on gene_list and expr_dict from the earlier cells.
corr_by_gene_and_track = pd.DataFrame(columns=["human", "mouse"], index=gene_list)
for gene in gene_list:
    # Pairwise correlation matrix of the three columns:
    # "observed", "human predicted", "mouse predicted".
    corr_df = expr_dict[gene].corr()
    # Each track gets its own correlation with the observed column; previously
    # the observed-vs-human value was written into both columns.
    corr_by_gene_and_track.loc[gene, "human"] = corr_df.loc["observed", "human predicted"]
    corr_by_gene_and_track.loc[gene, "mouse"] = corr_df.loc["observed", "mouse predicted"]