In [1]:
import pandas as pd
import numpy as np

Gene sets defined by the Gene Ontology (GO) for each GO domain are downloaded from MSigDB: https://www.gsea-msigdb.org/gsea/msigdb/human/collections.jsp#C5

The three files are downloaded from the following links and stored in the `input/` folder:
- GO Biological Process ontology: https://www.gsea-msigdb.org/gsea/msigdb/download_file.jsp?filePath=/msigdb/release/2023.1.Hs/c5.go.bp.v2023.1.Hs.symbols.gmt
- GO Cellular Component ontology: https://www.gsea-msigdb.org/gsea/msigdb/download_file.jsp?filePath=/msigdb/release/2023.1.Hs/c5.go.cc.v2023.1.Hs.symbols.gmt
- GO Molecular Function ontology: https://www.gsea-msigdb.org/gsea/msigdb/download_file.jsp?filePath=/msigdb/release/2023.1.Hs/c5.go.mf.v2023.1.Hs.symbols.gmt

In [2]:
# Biological Process
#
# Each line of the GMT file is tab-separated: the first field is the gene set
# (GO term) name, the second field is skipped, and the remaining fields are the
# gene symbols in the set. This cell inverts that mapping into one row per
# gene, with every GO BP term containing the gene joined by "|" in `go_bp_list`.

with open("input/c5.go.bp.v2023.1.Hs.symbols.gmt", "r") as fopen:
    bp = fopen.readlines()

# Collect one (term, gene) pair per membership and build the frame once.
# Growing a DataFrame with pd.concat inside the loop re-copies all previous
# rows on every iteration.
bp_records = []
for line in bp:
    cols = line.rstrip().split("\t")
    # `if gene` drops empty fields, so blank lines and gene sets without
    # members no longer produce a spurious empty-string gene.
    bp_records.extend((cols[0], gene) for gene in cols[2:] if gene)

bp_df = (
    pd.DataFrame(bp_records, columns=["go_bp", "gene"])
    .groupby("gene")
    # np.unique de-duplicates and sorts the terms, giving a stable order.
    .go_bp.apply(lambda x: "|".join(np.unique(x)))
    .reset_index()
    .rename(columns={"go_bp": "go_bp_list"})
)

In [3]:
# Cellular Component
#
# Each line of the GMT file is tab-separated: the first field is the gene set
# (GO term) name, the second field is skipped, and the remaining fields are the
# gene symbols in the set. This cell inverts that mapping into one row per
# gene, with every GO CC term containing the gene joined by "|" in `go_cc_list`.

with open("input/c5.go.cc.v2023.1.Hs.symbols.gmt", "r") as fopen:
    cc = fopen.readlines()

# Collect one (term, gene) pair per membership and build the frame once.
# Growing a DataFrame with pd.concat inside the loop re-copies all previous
# rows on every iteration.
cc_records = []
for line in cc:
    cols = line.rstrip().split("\t")
    # `if gene` drops empty fields, so blank lines and gene sets without
    # members no longer produce a spurious empty-string gene.
    cc_records.extend((cols[0], gene) for gene in cols[2:] if gene)

cc_df = (
    pd.DataFrame(cc_records, columns=["go_cc", "gene"])
    .groupby("gene")
    # np.unique de-duplicates and sorts the terms, giving a stable order.
    .go_cc.apply(lambda x: "|".join(np.unique(x)))
    .reset_index()
    .rename(columns={"go_cc": "go_cc_list"})
)

In [4]:
# Molecular Function
#
# Each line of the GMT file is tab-separated: the first field is the gene set
# (GO term) name, the second field is skipped, and the remaining fields are the
# gene symbols in the set. This cell inverts that mapping into one row per
# gene, with every GO MF term containing the gene joined by "|" in `go_mf_list`.

with open("input/c5.go.mf.v2023.1.Hs.symbols.gmt", "r") as fopen:
    mf = fopen.readlines()

# Collect one (term, gene) pair per membership and build the frame once.
# Growing a DataFrame with pd.concat inside the loop re-copies all previous
# rows on every iteration.
mf_records = []
for line in mf:
    cols = line.rstrip().split("\t")
    # `if gene` drops empty fields, so blank lines and gene sets without
    # members no longer produce a spurious empty-string gene.
    mf_records.extend((cols[0], gene) for gene in cols[2:] if gene)

mf_df = (
    pd.DataFrame(mf_records, columns=["go_mf", "gene"])
    .groupby("gene")
    # np.unique de-duplicates and sorts the terms, giving a stable order.
    .go_mf.apply(lambda x: "|".join(np.unique(x)))
    .reset_index()
    .rename(columns={"go_mf": "go_mf_list"})
)

In [5]:
# Combine the dataframes
#
# Outer joins on "gene" keep every gene that appears in at least one of the
# three per-domain tables; a domain with no entry for a gene is left missing.
df = pd.merge(bp_df, cc_df, how="outer", on="gene")
df = pd.merge(df, mf_df, how="outer", on="gene")

# Write a tab-separated table without the row index.
df.to_csv("output/gene-ontology-annotations.tsv.gz", index=False, sep="\t")