---
title: Correlation between human reference CAGE predictions and mean expression across GTEx genes
author: Sabrina Mi
date: 8/22/23
---

In [None]:
# Read the tab-separated TSS annotation table and key it by its
# `ensembl_gene_id` column.
hg38_annot = pd.read_csv(
    "protein_coding_TSS.tsv",
    sep="\t",
    header=0,
    index_col='ensembl_gene_id',
)

# Put the CAGE predictions in a one-column frame that shares the annotation's
# gene-ID index.
# NOTE(review): CAGE_predictions is defined outside this cell; presumably it is
# ordered to match the rows of hg38_annot -- confirm.
gex_df = pd.DataFrame(
    data={"enformer": CAGE_predictions},
    index=hg38_annot.index,
)
gex_df.head()

In [None]:
# Load the GTEx TPM table (column names are on the third line of the file,
# hence header=2) and index it by unversioned gene ID.
#
# Gene IDs in the `Name` column look like "<gene>.<version>". GENCODE also
# emits "<gene>.<version>_PAR_Y" for the chrY copy of pseudoautosomal genes;
# stripping the version from those would yield the same ID as the chrX copy
# and leave duplicate index labels, which in turn duplicates rows in every
# later `.loc[...]` lookup and merge. Those rows are therefore dropped before
# the version suffix is removed.
# NOTE(review): assumes this GTEx file contains `_PAR_Y` entries -- if it does
# not, the filter is a no-op.
gtex_tpm = (
    pd.read_csv("gene_tpm_2017-06-05_v8_brain_cortex.gct.gz", header=2, sep="\t")
    .loc[lambda df: ~df['Name'].str.endswith('_PAR_Y', na=False)]
    # Vectorised equivalent of splitting each ID on '.' and keeping the head.
    .assign(Name=lambda df: df['Name'].str.split('.').str[0])
    # Chained (non-inplace) so re-running the cell never mutates a stale frame.
    .set_index('Name')
)

In [None]:
# Genes present in both indexes. The intersection is taken from gex_df's side,
# exactly as before, so the resulting order is unchanged.
gene_list = gex_df.index.intersection(gtex_tpm.index)

# Keep only the GTEx rows for those shared genes.
gtex_tpm = gtex_tpm.loc[gene_list, :]

# Report how many rows survived the filter.
print(len(gtex_tpm), "genes in both GTEx and BioMart datasets")

In [None]:
# Per-gene mean expression: exclude the 'id' and 'Description' columns, then
# average every remaining column row-wise (presumably one column per GTEx
# sample -- confirm against the file). The result is a Series named 'gtex'.
gtex_mean_tpm = (
    gtex_tpm
    .drop(['id', 'Description'], axis='columns')
    .mean(axis='columns')
    .rename('gtex')
)

In [None]:
# Join the observed GTEx mean expression onto the Enformer CAGE predictions,
# keeping only genes present in both and dropping rows with missing values.
#
# Any 'gtex' column left over from a previous execution of this cell is removed
# first; otherwise re-running the cell would merge a second copy and pandas
# would rename the pair to 'gtex_x' / 'gtex_y'. On a first run the drop is a
# no-op, so the result is unchanged.
gex_df = (
    gex_df
    .drop(columns='gtex', errors='ignore')
    .merge(gtex_mean_tpm, left_index=True, right_index=True, how='inner')
    .dropna()
)
gex_df.head()
