In [20]:
import os
import pandas as pd
import requests
from gprofiler import GProfiler  # Ensure you have installed gprofiler-official
from RAG_workflow import initialize_gene_list, extract_gene_descriptions

def query_gprofiler_rest(genes, organism="rnorvegicus", user_threshold=0.05, timeout=60):
    """Run a g:Profiler enrichment query through the public REST endpoint.

    The request is POSTed as JSON to the ``gost/profile`` endpoint with a fixed
    set of annotation sources (GO:BP, GO:MF, GO:CC, KEGG, REAC), ``g_SCS``
    significance thresholding and the ``annotated`` domain scope.

    Parameters
    ----------
    genes : list of str
        Gene identifiers sent as the ``query`` field.
    organism : str, optional
        g:Profiler organism code sent as the ``organism`` field.
    user_threshold : float, optional
        Significance threshold sent as the ``user_threshold`` field.
    timeout : float or tuple, optional
        Forwarded to ``requests.post``. Without it a stalled server would
        block the caller forever; pass ``None`` to restore the old
        wait-indefinitely behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the response's ``"result"`` list.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If no response arrives within ``timeout``.
    KeyError
        If the JSON body has no ``"result"`` key.
    """
    url = "https://biit.cs.ut.ee/gprofiler/api/gost/profile/"
    sources = ["GO:BP", "GO:MF", "GO:CC", "KEGG", "REAC"]
    payload = {
        "organism": organism,
        "query": genes,
        "user_threshold": user_threshold,
        "sources": sources,
        "significance_threshold_method": "g_SCS",
        "no_iea": False,
        "domain_scope": "annotated",
        "output": "json",
        "no_evidences": False
    }
    headers = {"User-Agent": "FullPythonRequest"}
    # requests applies no timeout by default, so it must be passed explicitly.
    response = requests.post(url, json=payload, headers=headers, timeout=timeout)
    response.raise_for_status()
    data = response.json()
    return pd.DataFrame(data["result"])

def query_gprofiler(genes, organism="rnorvegicus", user_threshold=0.05):
    """Run a g:Profiler enrichment query through the ``GProfiler`` client.

    The client is asked for a DataFrame and is called with the same fixed
    settings on every invocation: sources GO:BP, GO:MF, GO:CC, KEGG and REAC,
    ``g_SCS`` significance thresholding and the ``annotated`` domain scope.

    Parameters
    ----------
    genes : list of str
        Gene identifiers passed as ``query``.
    organism : str, optional
        Organism code passed to the client.
    user_threshold : float, optional
        Significance threshold passed to the client.

    Returns
    -------
    pandas.DataFrame
        The client's result. If it carries a column named ``intersection``,
        that column is exposed as ``intersections`` instead.
    """
    profile_kwargs = dict(
        organism=organism,
        query=genes,
        user_threshold=user_threshold,
        sources=["GO:BP", "GO:MF", "GO:CC", "KEGG", "REAC"],
        significance_threshold_method="g_SCS",
        no_iea=False,
        domain_scope="annotated",
        no_evidences=False,
    )
    enrichment = GProfiler(return_dataframe=True).profile(**profile_kwargs)

    # Normalise the singular column name so callers can always rely on the
    # plural spelling.
    if "intersection" not in enrichment.columns:
        return enrichment
    return enrichment.rename(columns={"intersection": "intersections"})

def filter_best_per_parent(df):
    """Drop terms whose parent term has already been retained.

    Rows are visited in ascending ``p_value`` order. A row is discarded when
    any ID in its ``parents`` list equals the ``native`` ID of a row retained
    earlier; otherwise it is retained and its own ``native`` ID becomes
    eligible to suppress later rows. A ``parents`` value that is not a list
    is treated as "no parents".

    Parameters
    ----------
    df : pandas.DataFrame
        Enrichment table. Filtering needs the columns ``p_value``,
        ``parents`` and ``native``.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, untouched, if any of the three columns is missing.
        Otherwise the retained rows, sorted by ``p_value`` and indexed by
        their position in the sorted input.
    """
    if not {'p_value', 'parents', 'native'} <= set(df.columns):
        return df

    ranked = df.sort_values(by='p_value', ascending=True).reset_index(drop=True)

    retained_ids = set()
    retained_positions = []
    # After reset_index the positional counter equals the index label.
    for position, (term_id, parent_ids) in enumerate(zip(ranked['native'], ranked['parents'])):
        if not isinstance(parent_ids, list):
            parent_ids = []
        if any(parent in retained_ids for parent in parent_ids):
            continue
        retained_positions.append(position)
        retained_ids.add(term_id)

    return ranked.loc[retained_positions].sort_values(by='p_value', ascending=True)

if __name__ == "__main__":
    # One place for the query settings, so both g:Profiler calls and the local
    # significance filter always use the same organism and cutoff.
    ORGANISM = "rnorvegicus"
    P_VALUE_THRESHOLD = 0.05
    output_path = "./output/text_files/ground_truth_pathways.txt"

    def _format_gene_list(value):
        """Render one intersection entry as a comma-separated string."""
        if isinstance(value, list):
            return ', '.join(value)
        # Terms missing from the package result come out of the left join as
        # NaN; show them as an empty string.
        return value if pd.notnull(value) else ''

    gene_list_string, regulation, num_genes = initialize_gene_list()
    print(f"Number of genes: {num_genes}")
    # The return value is not used below; the call is kept as in the original.
    gene_descriptions = extract_gene_descriptions(gene_list_string)

    # Accept comma- and/or whitespace-separated symbols. str.split() with no
    # argument already discards empty tokens and surrounding whitespace.
    genes = [gene for chunk in gene_list_string.split(',') for gene in chunk.split()]

    if genes:
        try:
            df_pkg = query_gprofiler(genes=genes, organism=ORGANISM, user_threshold=P_VALUE_THRESHOLD)
            df_api = query_gprofiler_rest(genes=genes, organism=ORGANISM, user_threshold=P_VALUE_THRESHOLD)

            # An empty result has no columns at all; fail with a readable
            # message instead of an opaque KeyError further down.
            missing_api = {"name", "description", "native", "source", "p_value"} - set(df_api.columns)
            if missing_api:
                raise ValueError(
                    f"g:Profiler REST result lacks columns {sorted(missing_api)} "
                    "(no enrichment results returned?)"
                )
            missing_pkg = {"native", "intersections"} - set(df_pkg.columns)
            if missing_pkg:
                raise ValueError(
                    f"g:Profiler package result lacks columns {sorted(missing_pkg)} "
                    "(no enrichment results returned?)"
                )

            df_api_sig = df_api[df_api["p_value"] < P_VALUE_THRESHOLD].copy()
            df_api_best = filter_best_per_parent(df_api_sig).rename(columns={
                "name": "Pathway",
                "native": "annotation term",
                "p_value": "p-value"
            })

            # Attach the package's per-term intersections to the filtered REST
            # terms; terms missing from the package result keep NaN.
            df_merged = pd.merge(
                df_api_best,
                df_pkg[['native', 'intersections']],
                left_on="annotation term",
                right_on="native",
                how="left",
                suffixes=('_api', '_pkg')
            ).drop(columns=["native"], errors='ignore')

            # The merge only adds the '_pkg' suffix when the REST frame also has
            # an 'intersections' column; otherwise the package column keeps its
            # plain name. The old fallback read 'intersections_api', which
            # cannot exist in that situation.
            genes_column = (
                'intersections_pkg' if 'intersections_pkg' in df_merged.columns
                else 'intersections'
            )
            df_merged['genes'] = df_merged[genes_column].apply(_format_gene_list)

            df_final = df_merged[["Pathway", "description", "annotation term", "source", "p-value", "genes"]]

            # to_csv does not create missing directories.
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
            df_final[["Pathway", "annotation term", "genes"]].to_csv(output_path, sep="\t", index=False)
            print(f"Output written to {output_path}")

        except requests.HTTPError as e:
            print(f"HTTP error occurred: {e}")
        except Exception as e:
            # Best-effort cell: report the problem and leave the kernel usable.
            print(f"An error occurred: {e}")


Number of genes: 250
Total genes to process: 250
Genes: {'Nrp1', 'Kif1a', 'Pmp22', 'Magea9', 'Taf9b', 'Inpp5a', 'Cpe', 'Chst2', 'Manba', 'Vps26c', 'Ecel1', 'Ckap4', 'Icoslg', 'Defb29', 'Slitrk6', 'Cd9', 'Bgn', 'Tspan5', 'Trim35', 'Col15a1', 'Pals1', 'Nlrp1a', 'Piezo2', 'Prss12', 'Ighm', 'Parm1', 'Fam20c', 'Hmx3', 'Cdca7l', 'Serpini1', 'Map1lc3a', 'Snx4', 'Pltp', 'Gldn', 'Stard13', 'Aig1', 'Prss23', 'A2m', 'Selplg', 'Wnt5a', 'Cxcl14', 'Sema4f', 'Col20a1', 'Man1a1', 'Myo1d', 'Acta2', 'Ltbp1', 'Psmd2', 'Igfbp3', 'Tril', 'Sema3g', 'Serinc5', 'Ppp1r14c', 'Cep295nl', 'Cpm', 'Tent5c', 'Plxnb1', 'Rcn3', 'Synpr', 'Tpcn1', 'Lef1', 'Adamts17', 'Cobll1', 'Vopp1', 'Sgpl1', 'Grb14', 'Magea4', 'Dhrs1', 'Tex101', 'Sipa1l1', 'Gtf2e1', 'Steap1', 'Hsp90aa1', 'Nfatc1', 'Ntn1', 'Mybl1', 'Emilin1', 'Gja1', 'Ppfia4', 'Rnf144a', 'Hadhb', 'Parp14', 'Ano5', 'Tagln', 'Hspa12a', 'Peg3', 'P2ry2', 'Gprc5b', 'Cpsf2', 'Msi1', 'Ctbs', 'Cxcl12', 'Mfap5', 'Pfn2', 'St3gal5', 'Abcb4', 'Tbccd1', 'Phldb2', 'Pgm1', 'Klhdc2',

In [21]:
# Display the pathway table assembled by the previous cell. `df_final` is only
# assigned when that cell's try block runs to completion, so after a failed
# run this name is undefined (or left over from an earlier execution).
df_final

Unnamed: 0,Pathway,description,annotation term,source,p-value,genes
0,cell periphery,"""The broad region around and including the pla...",GO:0071944,GO:CC,4.299074e-13,"Cntn6, Prss12, Tex101, Sema4f, Hapln1, Parm1, ..."
1,multicellular organismal process,"""Any biological process, occurring at the leve...",GO:0032501,GO:BP,3.928781e-10,"Aldh1a1, Pcsk5, Cpe, Col9a3, Emp2, Acta2, Igfb..."
2,extracellular region,"""The space external to the outermost structure...",GO:0005576,GO:CC,2.623669e-09,"Prss12, Pcsk5, Tex101, Qsox1, Htra1, Igfbp3, A..."
3,regulation of cell motility,"""Any process that modulates the frequency, rat...",GO:2000145,GO:BP,1.567922e-08,"Pcsk5, Tex101, Emp2, Chst2, Igfbp3, Ccn3, Ntn1..."
4,system development,"""The process whose specific outcome is the pro...",GO:0048731,GO:BP,2.0751e-08,"Aldh1a1, Pcsk5, Cpe, Col9a3, Emp2, Acta2, Ccn3..."
5,regulation of locomotion,"""Any process that modulates the frequency, rat...",GO:0040012,GO:BP,4.528831e-08,"Pcsk5, Tex101, Emp2, Chst2, Igfbp3, Ccn3, Ntn1..."
6,locomotion,"""Self-propelled movement of a cell or organism...",GO:0040011,GO:BP,9.629089e-08,"Pcsk5, Tex101, Emp2, Chst2, Igfbp3, Ccn3, Ntn1..."
7,anatomical structure development,"""The biological process whose specific outcome...",GO:0048856,GO:BP,1.221614e-07,"Aldh1a1, Pcsk5, Cpe, Col9a3, Emp2, Acta2, Htra..."
8,cell migration,"""The controlled self-propelled movement of a c...",GO:0016477,GO:BP,3.510191e-07,"Pcsk5, Emp2, Chst2, Igfbp3, Ccn3, Ntn1, Cxcl14..."
9,vesicle,"""Any small, fluid-filled, spherical organelle ...",GO:0031982,GO:CC,4.542548e-07,"Prss12, Tex101, Qsox1, Cpe, Parm1, Emp2, Synpr..."
