In [1]:
import pandas as pd
import numpy as np

Gene sets defined by WikiPathways (part of the MSigDB C2 collection) are downloaded from MSigDB: https://www.gsea-msigdb.org/gsea/msigdb/human/collections.jsp#C2

The file is downloaded from the following link and stored in the `input/` folder:
https://www.gsea-msigdb.org/gsea/msigdb/download_file.jsp?filePath=/msigdb/release/2023.2.Hs/c2.cp.wikipathways.v2023.2.Hs.symbols.gmt

In [2]:
# Read the WikiPathways gene-set file. Each non-blank line is tab-separated:
# the first field is used as the pathway name, the second field is skipped
# (presumably the GMT description/URL column -- confirm against the file),
# and every remaining field is treated as a gene symbol.
with open("input/c2.cp.wikipathways.v2023.2.Hs.symbols.gmt", "r") as fopen:
    wp = fopen.readlines()

# Collect one record per pathway and build the DataFrame once at the end.
# Calling pd.concat inside the loop copies the whole frame on every
# iteration, which makes the loop quadratic in the number of pathways.
wp_records = []
for line in wp:
    if not line.strip():
        # Skip blank lines so they do not become an empty pathway/gene row.
        continue
    cols = line.rstrip().split("\t")
    wp_records.append({"wikipathway": cols[0], "gene_list": "|".join(cols[2:])})

# Explicit columns keep the expected schema even when the file has no records,
# so the downstream reshaping step can still find `gene_list`.
wp_df = pd.DataFrame(wp_records, columns=["wikipathway", "gene_list"])

# Reshape from one row per pathway to one row per gene:
#   1. split the "|"-joined gene list into a Python list per pathway,
#   2. explode so every (pathway, gene) pair gets its own row,
#   3. group by gene and join the sorted, de-duplicated pathway names with "|".
wp_df = (
    wp_df.assign(gene=lambda frame: frame["gene_list"].str.split("|"))
    .drop(columns="gene_list")
    .explode("gene")
    .groupby("gene")["wikipathway"]
    .apply(lambda pathways: "|".join(np.unique(pathways).tolist()))
    .reset_index()
)

In [3]:
# Save the gene -> WikiPathways annotation table as a tab-separated file
# without the index column. The path ends in ".gz"; pandas' default
# compression="infer" is expected to gzip the output based on that suffix.
wp_df.to_csv(
    "output/wikipathway-annotations.tsv.gz",
    sep="\t",
    index=False,
)