In [1]:
import os
import pandas as pd
import requests
from RAG_workflow import initialize_gene_list, extract_gene_descriptions

def query_gprofiler_rest(genes, organism="rnorvegicus", user_threshold=0.05,
                         sources=None, timeout=120):
    """Run a g:Profiler g:GOSt enrichment query through the REST API.

    Parameters
    ----------
    genes : list
        Gene identifiers, sent verbatim as the ``query`` field of the request.
    organism : str, optional
        g:Profiler organism code sent with the request.
    user_threshold : float, optional
        Significance threshold sent with the request.
    sources : list of str, optional
        Annotation sources to query. ``None`` (the default) selects
        ``["GO:BP", "GO:MF", "GO:CC", "KEGG", "REAC"]``.
    timeout : float or tuple or None, optional
        Timeout in seconds handed to ``requests.post``. Without a timeout a
        stalled server would block the caller forever; pass ``None`` to wait
        indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the response's ``result`` list. When the response
        carries the expected metadata, each row's ``intersections`` holds the
        names of the query genes whose intersection entry was truthy, and the
        original truthy entries are stored under ``evidences``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    requests.Timeout
        If the server does not answer within ``timeout``.
    """
    url = "https://biit.cs.ut.ee/gprofiler/api/gost/profile/"
    if sources is None:
        sources = ["GO:BP", "GO:MF", "GO:CC", "KEGG", "REAC"]
    # Evidence codes are always requested; the gene-name rewrite below relies on them.
    no_evidences = False
    payload = {
        "organism": organism,
        "query": genes,
        "user_threshold": user_threshold,
        "sources": list(sources),
        "significance_threshold_method": "g_SCS",
        "no_iea": False,
        "domain_scope": "annotated",
        "output": "json",
        "no_evidences": no_evidences
    }
    headers = {"User-Agent": "FullPythonRequest"}
    response = requests.post(url, json=payload, headers=headers, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    # Replicate the package's transformation of the intersections field
    # (presumably the gprofiler-official client; confirm against its source).
    if not no_evidences:
        meta = data.get("meta", {})
        if meta and "query_metadata" in meta and "genes_metadata" in meta:
            genes_by_query = meta["genes_metadata"]["query"]

            # For every query, map each returned gene id back to a display name:
            # the submitted name when it resolved to exactly one id, otherwise
            # the id itself (an ambiguous name cannot be attributed to one id).
            reverse_mappings = {}
            for query in meta["query_metadata"]["queries"].keys():
                reverse_mapping = {}
                for submitted, mapped_ids in genes_by_query[query]["mapping"].items():
                    if len(mapped_ids) == 1:
                        reverse_mapping[mapped_ids[0]] = submitted
                    else:
                        for mapped_id in mapped_ids:
                            reverse_mapping[mapped_id] = mapped_id
                reverse_mappings[query] = reverse_mapping

            # Update each result's intersections to contain gene names.
            for result in data["result"]:
                query_id = result["query"]
                if query_id not in genes_by_query:
                    continue
                mapping = reverse_mappings[query_id]
                ens_genes = genes_by_query[query_id]["ensgs"]
                gene_names = [mapping.get(gene_id, gene_id) for gene_id in ens_genes]
                raw_intersections = result["intersections"]
                # Keep the truthy entries as evidences, and replace them
                # position-by-position with the corresponding gene name.
                result["evidences"] = [ev for ev in raw_intersections if ev]
                result["intersections"] = [
                    gene for ev, gene in zip(raw_intersections, gene_names) if ev
                ]

    df = pd.DataFrame(data["result"])
    return df

def flatten_list(nested_list):
    """Return the leaves of an arbitrarily nested list as one flat list.

    Only ``list`` instances are descended into; any other value (tuple,
    string, number, ...) is kept as a single element. Order is preserved.
    """
    def _leaves(items):
        # Depth-first walk that yields non-list elements in encounter order.
        for element in items:
            if isinstance(element, list):
                yield from _leaves(element)
            else:
                yield element

    return list(_leaves(nested_list))

def filter_best_per_parent(df):
    """Drop terms whose parent term has already been kept.

    Rows are visited in ascending ``p_value`` order. A row is kept unless one
    of the ids in its ``parents`` list equals the ``native`` id of a row kept
    earlier. A ``parents`` value that is not a list is treated as "no parents".

    If ``df`` lacks any of the ``p_value``, ``parents`` or ``native`` columns
    it is returned unchanged. Otherwise the result is the kept rows, sorted by
    ``p_value`` and indexed by their position in the sorted input.
    """
    needed = ('p_value', 'parents', 'native')
    if any(column not in df.columns for column in needed):
        return df

    ranked = df.sort_values(by='p_value', ascending=True).reset_index(drop=True)

    accepted_ids = set()
    kept_positions = []
    for position, (native_id, parents) in enumerate(zip(ranked['native'], ranked['parents'])):
        candidates = parents if isinstance(parents, list) else []
        has_kept_parent = any(parent in accepted_ids for parent in candidates)
        if not has_kept_parent:
            kept_positions.append(position)
            accepted_ids.add(native_id)

    survivors = ranked.loc[kept_positions]
    return survivors.sort_values(by='p_value', ascending=True)

if __name__ == "__main__":
    # Single cut-off used both in the g:Profiler request and in the local filter.
    P_VALUE_THRESHOLD = 0.05

    gene_list_string, regulation, num_genes = initialize_gene_list()
    print(f"Number of genes: {num_genes}")
    # NOTE(review): the return value is not used in this cell; the call is kept
    # because it may print a report or feed later cells. Confirm before removing.
    gene_descriptions = extract_gene_descriptions(gene_list_string)

    # Genes may be separated by commas and/or whitespace; split() without
    # arguments already discards empty tokens and surrounding blanks.
    genes = gene_list_string.replace(',', ' ').split()

    if genes:
        try:
            # Query g:Profiler using the REST API
            df_api = query_gprofiler_rest(
                genes=genes, organism="rnorvegicus", user_threshold=P_VALUE_THRESHOLD
            )

            if df_api.empty or "p_value" not in df_api.columns:
                # An empty result list yields a frame without columns; report it
                # explicitly instead of failing later with KeyError: 'p_value'.
                print("g:Profiler returned no enrichment results; no output written.")
            else:
                # Filter for significant results (p-value below the threshold)
                df_api_sig = df_api[df_api["p_value"] < P_VALUE_THRESHOLD].copy()

                # Keep the best result per parent term, then rename columns as desired
                df_api_best = filter_best_per_parent(df_api_sig).rename(columns={
                    "name": "Pathway",
                    "native": "annotation term",
                    "p_value": "p-value"
                })

                # Build a 'genes' column from intersections, flattening nested lists if needed
                if "intersections" in df_api_best.columns:
                    df_api_best['genes'] = df_api_best['intersections'].apply(
                        lambda x: ', '.join(map(str, flatten_list(x))) if isinstance(x, list)
                        else str(x) if pd.notnull(x) else ''
                    )
                else:
                    df_api_best['genes'] = ''

                # Final DataFrame with desired columns:
                df_final = df_api_best[["Pathway", "description", "annotation term", "source", "p-value", "genes"]]
                os.makedirs("./output/text_files", exist_ok=True)
                output_path = "./output/text_files/ground_truth_pathways.txt"
                # Only a three-column subset is written to the tab-separated file.
                df_final[["Pathway", "annotation term", "genes"]].to_csv(output_path, sep="\t", index=False)
                print(f"Output written to {output_path}")

        except requests.HTTPError as e:
            print(f"HTTP error occurred: {e}")
        except Exception as e:
            # Best-effort cell: report any other failure without aborting the notebook.
            print(f"An error occurred: {e}")



IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mghui\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mghui\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Number of genes: 250
Total genes to process: 250
Genes: {'Gprc5a', 'Snx6', 'Ifnar1', 'Met', 'Prss12', 'Col8a2', 'Septin4', 'Plxnb1', 'Ppp1r14c', 'Emilin1', 'Hmx3', 'Pam', 'Mageb16', 'Itpripl1', 'Prx', 'Mpz', 'Hs6st1', 'Cntn6', 'Serpini1', 'ENSRNOG00000062503', 'Stard13', 'Prss23', 'Dpysl5', 'Magea4', 'Fam20c', 'Tnik', 'Sema3g', 'Psmd2', 'Vps26c', 'Pfn2', 'Gja1', 'Has2', 'Pals1', 'Col15a1', 'Pcsk5', 'Id2', 'Ighm', 'Tspan5', 'Klf9', 'Ano5', 'Pappa', 'Smc6', 'Pgm1', 'Cxcl14', 'Itga7', 'Aldh1a1', 'Gcnt1', 'Ecel1', 'Cpm', 'Icoslg', 'Hadhb', 'Ctbs', 'Cd9', 'Hsp90aa1', 'Slc22a17', 'Efna5', 'Dyrk1a', 'Atf5', 'Ppfia4', 'Map1lc3a', 'A2m', 'Sgpl1', 'Slc35a2', 'Apod', 'Nlrp1a', 'Adamtsl3', 'Ebf1', 'Igf2bp2', 'Gldc', 'Steap1', 'Slco3a1', 'Ppp1r36', 'Slc39a13', 'Magec2l1', 'Magea9', 'Cpe', 'Nes', 'Robo2', 'Taf9b', 'Chmp2b', 'Zfyve28', 'Fut10', 'Bace2', 'Tex101', 'Marveld1', 'St3gal5', 'Col11a1', 'Sipa1l1', 'Cenpo', 'Qsox1', 'Steap2', 'Tril', 'Dpysl3', 'Arsb', 'Lrp2', 'Efemp2', 'Adamtsl1', 'C1qtnf1',

The following genes were not found in the gene data file: ENSRNOG00000062503, ENSRNOG00000063649


Output written to ./output/text_files/ground_truth_pathways.txt


In [2]:
# Display the final pathway table assembled in the previous cell.
df_final

Unnamed: 0,Pathway,description,annotation term,source,p-value,genes
0,cell periphery,"""The broad region around and including the pla...",GO:0071944,GO:CC,4.299074e-13,"Cntn6, Prss12, Tex101, Sema4f, Hapln1, Parm1, ..."
1,multicellular organismal process,"""Any biological process, occurring at the leve...",GO:0032501,GO:BP,3.928781e-10,"Aldh1a1, Pcsk5, Cpe, Col9a3, Emp2, Acta2, Igfb..."
2,extracellular region,"""The space external to the outermost structure...",GO:0005576,GO:CC,2.623669e-09,"Prss12, Pcsk5, Tex101, Qsox1, Htra1, Igfbp3, A..."
4,regulation of cell motility,"""Any process that modulates the frequency, rat...",GO:2000145,GO:BP,1.567922e-08,"Pcsk5, Tex101, Emp2, Chst2, Igfbp3, Ccn3, Ntn1..."
6,system development,"""The process whose specific outcome is the pro...",GO:0048731,GO:BP,2.0751e-08,"Aldh1a1, Pcsk5, Cpe, Col9a3, Emp2, Acta2, Ccn3..."
7,regulation of locomotion,"""Any process that modulates the frequency, rat...",GO:0040012,GO:BP,4.528831e-08,"Pcsk5, Tex101, Emp2, Chst2, Igfbp3, Ccn3, Ntn1..."
9,locomotion,"""Self-propelled movement of a cell or organism...",GO:0040011,GO:BP,9.629089e-08,"Pcsk5, Tex101, Emp2, Chst2, Igfbp3, Ccn3, Ntn1..."
10,anatomical structure development,"""The biological process whose specific outcome...",GO:0048856,GO:BP,1.221614e-07,"Aldh1a1, Pcsk5, Cpe, Col9a3, Emp2, Acta2, Htra..."
11,cell migration,"""The controlled self-propelled movement of a c...",GO:0016477,GO:BP,3.510191e-07,"Pcsk5, Emp2, Chst2, Igfbp3, Ccn3, Ntn1, Cxcl14..."
12,vesicle,"""Any small, fluid-filled, spherical organelle ...",GO:0031982,GO:CC,4.542548e-07,"Prss12, Tex101, Qsox1, Cpe, Parm1, Emp2, Synpr..."
