# Ensembl gene version statistics

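This notebook compares the human gene tables of Ensembl releases 110, 111, and 112. For each release it reports the total number of gene rows and the number of unique Ensembl gene IDs, and for each pair of consecutive releases it reports how many gene IDs were added and how many were removed.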

In [7]:
import pandas as pd

In [8]:
# Read the gene table of each Ensembl release from its parquet file.
parquet_paths = [
    f"df_human__ensembl__release-{version}__Gene.parquet"
    for version in (110, 111, 112)
]
dfs = [pd.read_parquet(path) for path in parquet_paths]
df_110, df_111, df_112 = dfs

# Reduce each table to the set of distinct values in its "ensembl_gene_id" column.
df_sets = [set(gene_table["ensembl_gene_id"]) for gene_table in dfs]
df_110_set, df_111_set, df_112_set = df_sets

In [9]:
# Row count of each release's table; rows are counted as-is, without de-duplication.
frames_by_release = {"110": df_110, "111": df_111, "112": df_112}
for release, df in frames_by_release.items():
    print(
        f"Total number of (potentially non-unique) genes {len(df)} in ensembl {release}"
    )

Total number of (potentially non-unique) genes 75719 in ensembl 110
Total number of (potentially non-unique) genes 76062 in ensembl 111
Total number of (potentially non-unique) genes 75829 in ensembl 112


In [10]:
# Number of distinct gene IDs in each release.
release_sets = (
    ("110", df_110_set),
    ("111", df_111_set),
    ("112", df_112_set),
)
for release, df_set in release_sets:
    print(f"Total number of unique genes {len(df_set)} in ensembl {release}")

Total number of unique genes 70116 in ensembl 110
Total number of unique genes 70711 in ensembl 111
Total number of unique genes 70611 in ensembl 112


In [11]:
# Gene IDs found in the older release but not in the following one.
removed_gene_messages = [
    f"Removed genes: Unique for 110 when comparing 110 and 111: {len(df_110_set - df_111_set)}",
    f"Removed genes: Unique for 111 when comparing 111 and 112: {len(df_111_set - df_112_set)}",
]
# One message per line, matching the output of separate print calls.
print(*removed_gene_messages, sep="\n")

Removed genes: Unique for 110 when comparing 110 and 111: 50
Removed genes: Unique for 111 when comparing 111 and 112: 127


In [12]:
# Gene IDs found in the newer release but not in the preceding one.
for added_gene_message in (
    f"Added genes: Unique for 111 when comparing 111 and 110: {len(df_111_set - df_110_set)}",
    f"Added genes: Unique for 112 when comparing 112 and 111: {len(df_112_set - df_111_set)}",
):
    print(added_gene_message)

Added genes: Unique for 111 when comparing 111 and 110: 645
Added genes: Unique for 112 when comparing 112 and 111: 27
