We have datasets aligned using GRCh38 and GRCH19. We need to include only the genes present in both reference genomes.

In [1]:
from bioservices import *
import pandas as pd
from pyensembl import EnsemblRelease

# can use `pyensembl list` from command line to see releases I have downloaded and their reference genome

In [6]:
shared_genes_path = "../../data/reference/shared_genes.tsv"

In [2]:
# Load Ensembl release for GRCh38 and GRCh37(hg19) assemblies
ensembl_GRCh38 = EnsemblRelease(96)
ensembl_GRCh37 = EnsemblRelease(75)

# Get gene IDs and names for both assemblies
genes_GRCh38 = ensembl_GRCh38.genes()
genes_GRCh37 = ensembl_GRCh37.genes()

In [3]:
# Extract gene names from each assembly
gene_names_GRCh38 = set(gene.name for gene in genes_GRCh38)
gene_names_GRCh37 = set(gene.name for gene in genes_GRCh37)
all_gene_names = gene_names_GRCh38.union(gene_names_GRCh37)

# Create a data frame of merged gene names
df_genes = pd.DataFrame(list(all_gene_names), columns=["Gene"])
df_genes["in_GRCh38"] = df_genes["Gene"].isin(gene_names_GRCh38)
df_genes["in_GRCh37"] = df_genes["Gene"].isin(gene_names_GRCh37)
df_genes["in_both"] = df_genes["in_GRCh38"] & df_genes["in_GRCh37"]

num_unique_GRCh38 = sum(df_genes["in_GRCh38"] & ~df_genes["in_GRCh37"])
num_unique_GRCh37 = sum(df_genes["in_GRCh37"] & ~df_genes["in_GRCh38"])
num_both = sum(df_genes["in_both"])
num_total = len(df_genes)
print(
    f"Number of genes unique to GRCh38: {num_unique_GRCh38}"
)
print(
    f"Number of genes unique to GRCh37: {num_unique_GRCh37}"
)
print(f"Number of genes in both assemblies: {num_both}")
print(f"Number of genes in either assembly: {num_total}")
assert num_total == num_unique_GRCh38 + num_unique_GRCh37 + num_both
display(df_genes)

Number of genes unique to GRCh38: 23051
Number of genes unique to GRCh37: 22336
Number of genes in both assemblies: 34302
Number of genes in either assembly: 79689


Unnamed: 0,Gene,in_GRCh38,in_GRCh37,in_both
0,NUP210,True,True,True
1,AC067940.1,True,True,True
2,CFAP97D2,True,False,False
3,AIM2,True,True,True
4,AL135999.2,True,False,False
...,...,...,...,...
79684,AC009473.1,False,True,False
79685,RP11-114H20.1,False,True,False
79686,OXR1,True,True,True
79687,RP11-573D15.9,False,True,False


In [8]:
shared_genes = df_genes[df_genes["in_both"]]["Gene"]
shared_genes.to_csv(shared_genes_path, sep="\t", index=False, header=False)