---
title: Personalized Borzoi test on a few genes
date: 10/4/2023
author: Sabrina Mi
---

### Select Genes

We want to pick a handful of human genes that have rat orthologs and were used in the Enformer personalized runs (on rats), chosen so that the selected genes roughly span the distribution of Spearman correlations seen in those runs.

In [8]:
import pandas as pd
import numpy as np

In [43]:
rn7_gene_list = pd.read_csv("/home/s1mi/enformer_rat_data/output/Br_personalized_spearman_corr_human.csv", index_col = 0)

In [33]:
#| code-fold: true
# Bin the rat genes by how far their Spearman r lies from the mean:
#   df_1: within 1 standard deviation
#   df_2: between 1 and 2 standard deviations
#   df_3: everything else
spearman_r = rn7_gene_list['spearman r']
mean = np.mean(spearman_r)
std_dev = np.std(spearman_r)

# Absolute distance of every gene from the mean, computed in one pass
deviation = (spearman_r - mean).abs().to_numpy()

# Gene IDs live in the index of rn7_gene_list; expose them as a 'gene' column
scores = pd.DataFrame({
    'gene': rn7_gene_list.index.to_numpy(),
    'spearman r': spearman_r.to_numpy(),
})

# Boolean masks replace the row-by-row pd.concat, which copied the growing
# frames on every iteration.
within_1sd = deviation <= std_dev
within_2sd = ~within_1sd & (deviation <= 2 * std_dev)

# A missing correlation compares False against both thresholds, so such rows
# fall into the last group.
df_1 = scores[within_1sd].reset_index(drop=True)
df_2 = scores[within_2sd].reset_index(drop=True)
df_3 = scores[~(within_1sd | within_2sd)].reset_index(drop=True)


In [47]:
# Rat -> human ortholog table, indexed by the rat Ensembl gene ID
rn7_hg38_ortho = pd.read_csv(
    "/home/s1mi/enformer_rat_data/annotation/rn7_hg38.ortholog_genes.txt",
    sep="\t",
    index_col="ensembl_gene_id",
)
# Human gene annotation (default integer index)
hg38_annot = pd.read_csv(
    "/home/s1mi/enformer_rat_data/annotation/hg38.gene.txt",
    sep="\t",
)
# Rat genes that appear both in the correlation table and in the ortholog table
ortho_genes = rn7_gene_list.index.intersection(rn7_hg38_ortho.index).tolist()

In [50]:
# Keep only genes with a human ortholog in each standard-deviation group,
# then draw one gene at random from every group (order: df_1, df_2, df_3).
df_1, df_2, df_3 = [
    group[group['gene'].isin(ortho_genes)] for group in (df_1, df_2, df_3)
]
test_genes = [group['gene'].sample().item() for group in (df_1, df_2, df_3)]

In [81]:
# Human ortholog IDs for the sampled rat genes
hg38_gene_list = rn7_hg38_ortho.loc[test_genes, 'hsapiens_homolog_ensembl_gene'].to_list()
# Annotation rows for those human genes, restricted to the columns written to gene_list.csv
hg38_gene_df = hg38_annot.loc[
    hg38_annot['ensembl_gene_id'].isin(hg38_gene_list),
    ["ensembl_gene_id", "chromosome_name", "transcript_start", "transcript_end"],
]

In [86]:
hg38_gene_df.to_csv("gene_list.csv", index=False)

### Write Individuals List

There are 455 individuals with both GEUVADIS LCL gene expression data and genotypes in the 1000 Genomes VCFs.

In [None]:
import cyvcf2
# Open the chr1 VCF only to read the sample IDs from its header
vcf_chr = cyvcf2.VCF(
    "/grand/TFXcan/imlab/data/1000G/vcf_snps_only/ALL.chr1.shapeit2_integrated_SNPs_v2a_27022019.GRCh38.phased.vcf.gz"
)
vcf_samples = vcf_chr.samples

In [9]:
geuvadis_gex = pd.read_csv("/lus/grand/projects/TFXcan/imlab/data/1000G/expression/GD462.GeneQuantRPKM.50FN.samplename.resk10.txt.gz", sep="\t")
# Columns from the fifth onward are treated as individual IDs
individuals = geuvadis_gex.columns[4:].tolist()
# Individuals present in both the VCF header and the expression table.
# Sorted so individuals.txt is identical on every run; a bare set has no
# stable iteration order for strings across Python processes.
samples = sorted(set(vcf_samples).intersection(individuals))
with open("individuals.txt", "w") as f:
    f.write("\n".join(samples))

### Run Predictions

I started a pipeline for personalized prediction in this [notebook](https://sabrina-dl.hakyimlab.org/posts/2023-09-26-borzoi-personalized-test/geuvadis_personalized_test), and put it into a [python script](personalized_prediction.py).

I submitted this as a [PBS job](borzoi_test_run.pbs) with `qsub borzoi_test_run.pbs`.

```
# Load conda, activate the `borzoi` environment, and move to the post directory
# so the relative paths below (script, gene_list.csv, individuals.txt) resolve.
module load conda
conda activate borzoi
cd /home/s1mi/Github/deep-learning-in-genomics/posts/2023-10-04-personalized-test-on-a-few-genes

# Prepend the CUDA 11.8 (incl. CUPTI), TensorRT, NCCL and cuDNN library directories to the loader path.
export LD_LIBRARY_PATH=/soft/compilers/cudatoolkit/cuda-11.8.0/extras/CUPTI/lib64:/soft/compilers/cudatoolkit/cuda-11.8.0/lib64:/soft/libraries/trt/TensorRT-8.5.2.2.Linux.x86_64-gnu.cuda-11.8.cudnn8.6/lib:/soft/libraries/nccl/nccl_2.16.2-1+cuda11.8_x86_64/lib:/soft/libraries/cudnn/cudnn-11-linux-x64-v8.6.0.163/lib:$LD_LIBRARY_PATH

# Print the resulting loader path.
echo $LD_LIBRARY_PATH

# Run the personalized prediction script on the genes and individuals written above.
python3 personalized_prediction.py \
--gene_df gene_list.csv \
--fasta_file /home/s1mi/borzoi_tutorial/hg38.fa \
--vcf_dir /grand/TFXcan/imlab/data/1000G/vcf_snps_only \
--individuals_file individuals.txt \
--model_dir /home/s1mi/borzoi_tutorial \
--output_dir /grand/TFXcan/imlab/users/sabrina/borzoi-personalized-test

```

### Check Results

Expand the code below for setting global variables and reading GEUVADIS data.

In [1]:
#| code-fold: true
#### LIBRARIES AND DEFINITIONS
import pandas as pd
import numpy as np
import h5py
import os
#### PREDICTION OUTPUT DIRECTORY
predictions_dir = "/grand/TFXcan/imlab/users/sabrina/borzoi-personalized-test"
#### GEUVADIS EXPRESSION, INDEXED BY TargetID
geuvadis_gex = pd.read_csv(
    "/lus/grand/projects/TFXcan/imlab/data/1000G/expression/GD462.GeneQuantRPKM.50FN.samplename.resk10.txt.gz",
    sep="\t",
)
# Keep only the text before the first "." of each TargetID
geuvadis_gex['TargetID'] = [target_id.split('.')[0] for target_id in geuvadis_gex['TargetID']]
geuvadis_gex = geuvadis_gex.set_index('TargetID')
#### HUMAN GENE ANNOTATION, INDEXED BY ENSEMBL GENE ID
hg38_annot = pd.read_csv(
    "/home/s1mi/enformer_rat_data/annotation/hg38.gene.txt",
    sep="\t",
    index_col="ensembl_gene_id",
)
#### GET LIST OF GENES
genes = [str(gene_id) for gene_id in pd.read_csv("gene_list.csv")['ensembl_gene_id']]
#### GET LIST OF INDIVIDUALS
with open("individuals.txt", "r") as file:
    individuals = file.read().splitlines()

from borzoi_helpers import *
from baskerville import gene as bgene
# Directory holding the Borzoi tutorial resources
prefix="/home/s1mi/borzoi_tutorial"
splice_df = pd.read_csv(
    f'{prefix}/gencode41_basic_protein_splice.csv.gz',
    sep='\t',
    compression='gzip',
)
transcriptome = bgene.Transcriptome(f'{prefix}/gencode41_basic_nort.gtf')

targets_df = pd.read_csv(f'{prefix}/targets_human.txt', index_col=0, sep='\t')
target_index = targets_df.index
strand_pair = targets_df.strand_pair

#Create local index of strand_pair (relative to sliced targets)
# Position of every target label within target_index
target_slice_dict = {
    label: position for position, label in enumerate(target_index.values.tolist())
}
# Translate each strand_pair entry to its local position; entries that are not
# target labels are passed through unchanged.
slice_pair = np.array(
    [target_slice_dict.get(label, label) for label in strand_pair.values.tolist()],
    dtype='int32',
)

2023-10-12 03:16:08.150141: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-10-12 03:16:16.242235: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_NOT_INITIALIZED: initialization error
2023-10-12 03:16:16.242438: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:168] retrieving CUDA diagnostic information for host: polaris-login-02
2023-10-12 03:16:16.242469: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:175] hostname: polaris-login-02
2023-10-12 03:16:16.243018: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:199] libcuda reported version is: 470.103.4
2023-10-12 03:16:16.243100: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnosti

We compare gene expression predictions derived from Borzoi's CAGE and RNA tracks. The GEUVADIS dataset includes RNA-seq data collected from LCL samples. Borzoi was trained on ENCODE's CAGE:B lymphoblastoid cell line data (tracks 870, 871) as well as GTEx RNA-seq data from EBV-transformed lymphocytes; however, it's unclear to me which track the latter falls under. My guess is that track 7531:GTEX-1I4MK-0002-SM-EZ6M9 is the best approximation for LCL tissue.

In [3]:
from borzoi_calculation_helpers import *
def extract_RNA(ensembl_gene_id, seq_out_start, predictions, tracks, seq_out_len = 523264):
    """Return the RNA expression attribution of one gene from a prediction array.

    The gene is looked up in the module-level ``transcriptome``: the first key
    of ``transcriptome.genes`` that contains ``ensembl_gene_id`` as a substring
    is used. An ``IndexError`` is raised when no key matches.

    Args:
        ensembl_gene_id: Ensembl gene ID to search for among the transcriptome keys.
        seq_out_start: Start coordinate of the model output window.
        predictions: Prediction array handed to ``expr_attr`` unchanged.
        tracks: Track indices handed to ``expr_attr`` unchanged.
        seq_out_len: Length of the model output window (default 523264).

    Returns:
        Whatever ``expr_attr(predictions, tracks, gene_slice)`` returns.
    """
    matching_keys = list(
        filter(lambda key: ensembl_gene_id in key, transcriptome.genes.keys())
    )
    matched_gene = transcriptome.genes[matching_keys[0]]
    # 32 and False are passed positionally to output_slice; presumably the bin
    # width and a span flag — confirm against the baskerville Gene API.
    output_bins = matched_gene.output_slice(seq_out_start, seq_out_len, 32, False)
    return expr_attr(predictions, tracks, output_bins)

In [4]:
def extract_CAGE(predictions, tss_bin, tracks):
    """Average the predicted signal over the three bins centred on the TSS bin.

    The window ``[tss_bin - 1, tss_bin + 1]`` is clipped to the bins that exist
    on axis 1 of ``predictions``. Without the clipping, a negative start index
    either produced an empty slice (``tss_bin == 0``) or silently wrapped around
    to bins at the far end of the window.

    Args:
        predictions: NumPy array indexed as ``[:, bin, track]``.
        tss_bin: Index of the bin containing the transcription start site.
        tracks: Track indices to average over.

    Returns:
        Mean over the first axis, the clipped bin window and ``tracks``;
        ``np.nan`` when the window does not overlap any available bin.
    """
    n_bins = predictions.shape[1]
    first_bin = max(tss_bin - 1, 0)
    last_bin = min(tss_bin + 2, n_bins)  # exclusive upper bound
    if first_bin >= last_bin:
        # The TSS window lies entirely outside the predicted region
        return np.nan
    return np.mean(predictions[:, first_bin:last_bin, tracks])

In [5]:
# Collect, for every gene, one CAGE and one RNA prediction per individual
# (averaged over the two haplotypes) into expr_dict[gene].
seq_len = 524288
seq_out_len = 523264
expr_dict = {}
for gene in genes:
    print(gene)
    # NOTE(review): `chr` shadows the Python builtin of the same name.
    chr = hg38_annot.loc[gene, 'chromosome_name']
    interval_start = hg38_annot.loc[gene, 'transcript_start']
    interval_end = hg38_annot.loc[gene, 'transcript_end']
    tss = hg38_annot.loc[gene, 'transcription_start_site']
    # Window of seq_len bases centred on the transcript midpoint; start/end are
    # used below to build the prediction file name.
    center = (interval_start + interval_end) // 2
    start = center - seq_len // 2
    end = center + seq_len // 2
    # (seq_len - seq_out_len) / 2 = 512: the output window starts 512 bases
    # after the input window.
    seq_out_start = start + 512
    # Index of the 32-base bin containing the TSS, relative to the output window
    tss_bin = (tss - seq_out_start) // 32
    CAGE_predictions = []
    RNA_predictions = []
    for individual in individuals:
        predictions_file = os.path.join(predictions_dir, individual, f'chr{chr}_{start}_{end}_predictions.h5')
        if os.path.exists(predictions_file):
            # Load both haplotype datasets fully into memory before the file closes
            with h5py.File(predictions_file, "r") as hf:
                haplo1 = hf['haplotype1'][:]
                haplo2 = hf['haplotype2'][:]
            # Average the two haplotypes; CAGE uses tracks 870 and 871, RNA uses track 7531
            CAGE_mean = (extract_CAGE(haplo1, tss_bin, [870,871]) + extract_CAGE(haplo2, tss_bin, [870,871])) / 2
            RNA_mean = (extract_RNA(gene, seq_out_start, haplo1, [7531]) + extract_RNA(gene, seq_out_start, haplo2, [7531])) / 2
            CAGE_predictions.append(CAGE_mean)
            RNA_predictions.append(RNA_mean)
        else:
            # No prediction file for this individual/gene: keep the row as NaN
            CAGE_predictions.append(np.nan)
            RNA_predictions.append(np.nan)
    # One row per individual, in the order of `individuals`
    expr_dict[gene] = pd.DataFrame({'CAGE predicted': CAGE_predictions, 
                                    'RNA predicted': RNA_predictions}, index = individuals)
      

ENSG00000142949
ENSG00000133247
ENSG00000161011


In [34]:
# Cache the per-gene prediction tables in one HDF5 file, one dataset per gene ID.
# NOTE(review): the DataFrame is assigned directly; presumably only its values
# are stored, since the reading cell re-attaches the index and column names.
with h5py.File('/home/s1mi/enformer_rat_data/output/borzoi_personalized_predictions_test.h5', "w") as hf:
    for key, value in expr_dict.items():
        hf[key] = value


In [44]:
# Reload the cached predictions into expr_dict, one DataFrame per gene ID.
# Rows are labelled with `individuals` and columns with the two prediction
# names, so `individuals` must be in the same order as when the file was written.
with h5py.File('/home/s1mi/enformer_rat_data/output/borzoi_personalized_predictions_test.h5', 'r') as hf:
    for key, value in hf.items():
        expr_dict[key] = pd.DataFrame(value, index = individuals, columns = ['CAGE predicted', 'RNA predicted'])

In [45]:
# Attach the observed GEUVADIS expression to each gene's prediction table,
# matching rows on the individual ID index.
for gene in genes:
    observed = pd.to_numeric(geuvadis_gex.loc[gene][individuals]).rename('observed')
    expr_dict[gene] = pd.merge(
        expr_dict[gene],
        observed,
        left_index=True,
        right_index=True,
    )

#### Compare Predictions to Observed

To add context to each gene, we'll first pull correlations from Enformer personalized prediction on rat orthologs.


In [61]:
# Ortholog table again, this time indexed by the human gene ID so the rat
# gene for each test gene can be looked up directly.
rn7_hg38_ortho = pd.read_csv(
    "/home/s1mi/enformer_rat_data/annotation/rn7_hg38.ortholog_genes.txt",
    sep="\t",
    index_col="hsapiens_homolog_ensembl_gene",
)
rn7_genes = rn7_hg38_ortho.loc[genes, 'ensembl_gene_id'].to_list()
# Enformer personalized-run correlation tables for the rat genes
rn7_spearman_corr = pd.read_csv(
    "/home/s1mi/enformer_rat_data/output/Br_personalized_spearman_corr_human.csv",
    index_col=0,
)
rn7_pearson_corr = pd.read_csv(
    "/home/s1mi/enformer_rat_data/output/Br_personalized_pearson_corr_human.csv",
    index_col=0,
)


**Br Rat Pearson Correlations**

In [63]:
rn7_pearson_corr.loc[rn7_genes]

Unnamed: 0,pearson r,pvalue
ENSRNOG00000019977,-0.24597,4.443961e-06
ENSRNOG00000017508,0.033957,0.5326182
ENSRNOG00000003147,0.624152,4.1014279999999995e-38


**Br Rat Spearman Correlations**

In [65]:
rn7_spearman_corr.loc[rn7_genes]

Unnamed: 0,spearman r,pvalue
ENSRNOG00000019977,-0.235618,1.130621e-05
ENSRNOG00000017508,0.012958,0.8118338
ENSRNOG00000003147,0.61493,9.560589e-37


**GEUVADIS Human Pearson Correlations**

ENSG00000142949 (Enformer Rat R=-0.246): Both CAGE and RNA tracks predict in the correct direction of gene effect, but with reduced magnitude

In [49]:
expr_dict[genes[0]].corr(method='pearson')

Unnamed: 0,CAGE predicted,RNA predicted,observed
CAGE predicted,1.0,0.460416,0.086982
RNA predicted,0.460416,1.0,0.073
observed,0.086982,0.073,1.0


ENSG00000133247 (Enformer Rat R = 0.034): Borzoi performs weakly as well.

In [50]:
expr_dict[genes[1]].corr(method='pearson')

Unnamed: 0,CAGE predicted,RNA predicted,observed
CAGE predicted,1.0,0.641555,-0.010348
RNA predicted,0.641555,1.0,0.024595
observed,-0.010348,0.024595,1.0


ENSG00000161011 (Enformer Rat R = 0.624): Borzoi underperforms in both tracks; I will need to debug.

In [69]:
expr_dict[genes[2]].corr(method='pearson')

Unnamed: 0,CAGE predicted,RNA predicted,observed
CAGE predicted,1.0,0.1887,0.021334
RNA predicted,0.1887,1.0,0.00348
observed,0.021334,0.00348,1.0


**GEUVADIS Human Spearman Correlations**

In [70]:
expr_dict[genes[0]].corr(method='spearman')

Unnamed: 0,CAGE predicted,RNA predicted,observed
CAGE predicted,1.0,0.526191,0.071541
RNA predicted,0.526191,1.0,0.085379
observed,0.071541,0.085379,1.0


In [71]:
expr_dict[genes[1]].corr(method='spearman')

Unnamed: 0,CAGE predicted,RNA predicted,observed
CAGE predicted,1.0,0.665929,0.00412
RNA predicted,0.665929,1.0,0.02841
observed,0.00412,0.02841,1.0


In [72]:
expr_dict[genes[2]].corr(method='spearman')

Unnamed: 0,CAGE predicted,RNA predicted,observed
CAGE predicted,1.0,0.481421,0.022248
RNA predicted,0.481421,1.0,0.029495
observed,0.022248,0.029495,1.0


In [13]:
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
colors = sns.color_palette("pastel")
def scatter_plot(x, y, xlabel, ylabel, title, corr):
    """Draw and show a scatter plot with the correlation in the legend.

    Args:
        x: Values for the horizontal axis.
        y: Values for the vertical axis.
        xlabel: Label for the horizontal axis.
        ylabel: Label for the vertical axis.
        title: Plot title.
        corr: Correlation coefficient, shown in the legend with three decimals.
    """
    plt.scatter(x, y, marker='o', color=colors[0], label=f"Correlation: {corr:.3f}")

    # Add labels and title
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    # Use the caller-supplied title; the previous code read the global `gene`
    # and ignored this parameter.
    plt.title(title)

    # Show the plot
    plt.legend()
    plt.show()



def plot_spearman_corr(gene, predicted_col="RNA predicted"):
    """Plot observed vs. predicted expression ranks for one gene.

    Args:
        gene: Key into the module-level ``expr_dict``.
        predicted_col: Prediction column to compare against ``"observed"``.
            The tables built in this notebook hold ``"CAGE predicted"`` and
            ``"RNA predicted"``; the previously hard-coded ``"human predicted"``
            column is not created anywhere here and raised ``KeyError``.
    """
    # Individuals without a prediction are stored as NaN; drop incomplete
    # pairs so the correlation and the ranks are computed on the same rows.
    pairs = expr_dict[gene][["observed", predicted_col]].dropna()
    x = pairs["observed"]
    y = pairs[predicted_col]
    corr, _ = stats.spearmanr(x, y)
    plt.scatter(stats.rankdata(x), stats.rankdata(y), marker='o', color=colors[1], label=f"Correlation: {corr:.3f}")

    # Add labels and title
    plt.xlabel('Observed Ranks')
    plt.ylabel('Predicted Ranks')
    plt.title(gene)

    # Show the plot
    plt.legend()
    plt.show()

{'ENSG00000142949':          CAGE predicted  RNA predicted
 NA20521       20.140625   30171.421875
 NA18934       20.171875   31623.371094
 HG00324       20.062500   31657.937500
 HG00103       20.281250   32049.599609
 NA20516       20.156250   31728.843750
 ...                 ...            ...
 HG00152       20.093750   30087.642578
 HG00351       20.218750   31902.732422
 HG00185       20.218750   31106.968750
 NA06994       20.250000   32102.337891
 NA12341       20.125000   31183.824219
 
 [455 rows x 2 columns],
 'ENSG00000133247':          CAGE predicted  RNA predicted
 NA20521       22.968750   61911.656250
 NA18934       23.000000   62215.453125
 HG00324       22.859375   60991.593750
 HG00103       22.968750   61437.328125
 NA20516       22.906250   61042.312500
 ...                 ...            ...
 HG00152       22.843750   60993.703125
 HG00351       22.875000   61338.320312
 HG00185       22.968750   61601.574219
 NA06994       22.796875   61072.781250
 NA12341       