In [9]:
import os
import csv
import gzip
import pandas as pd
import requests
from RAG_workflow import initialize_gene_list, extract_gene_descriptions

def query_gprofiler_rest(genes, organism="rnorvegicus", user_threshold=0.05, timeout=60):
    """Run a g:Profiler g:GOSt enrichment query over REST and return the result table.

    Args:
        genes: Gene identifiers, sent as the ``query`` field of the request.
        organism: Organism identifier sent to the service.
        user_threshold: Significance threshold forwarded to the service.
        timeout: Seconds to wait for the service, passed straight to
            ``requests.post``. Pass ``None`` to wait indefinitely (the
            behaviour before this parameter existed).

    Returns:
        pandas.DataFrame built from the ``"result"`` entry of the JSON response.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        requests.Timeout: If the service does not answer within ``timeout``.
        KeyError: If the JSON response has no ``"result"`` entry.
    """
    url = "https://biit.cs.ut.ee/gprofiler/api/gost/profile/"
    # Annotation sources requested from the service.
    sources = ["GO:BP", "GO:MF", "GO:CC", "KEGG", "REAC"]
    payload = {
        "organism": organism,
        "query": genes,
        "user_threshold": user_threshold,
        "sources": sources,
        "significance_threshold_method": "g_SCS",
        "no_iea": False,
        "domain_scope": "annotated",
        "output": "json"
    }
    headers = {"User-Agent": "FullPythonRequest"}
    # Without a timeout a stalled connection would block the kernel forever.
    response = requests.post(url, json=payload, headers=headers, timeout=timeout)
    response.raise_for_status()
    data = response.json()
    return pd.DataFrame(data["result"])

def filter_best_per_parent(df):
    """Drop terms whose parent term has already been kept with a better p-value.

    Rows are visited in ascending ``p_value`` order. A row is skipped when any
    id in its ``parents`` list equals the ``native`` id of a row kept earlier;
    otherwise the row is kept. A ``parents`` value that is not a list is
    treated as "no parents".

    Args:
        df: DataFrame expected to carry ``p_value``, ``parents`` and ``native``
            columns.

    Returns:
        The kept rows, ordered by ascending ``p_value`` and indexed by their
        rank in the sorted input. If any of the three columns is missing,
        ``df`` itself is returned untouched.
    """
    if not {'p_value', 'parents', 'native'}.issubset(df.columns):
        return df

    ranked = df.sort_values(by='p_value', ascending=True).reset_index(drop=True)

    accepted_ids = set()
    accepted_positions = []
    # After reset_index the positional counter doubles as the row label.
    for position, (term, parents) in enumerate(zip(ranked['native'], ranked['parents'])):
        if not isinstance(parents, list):
            parents = []
        already_covered = any(parent in accepted_ids for parent in parents)
        if not already_covered:
            accepted_positions.append(position)
            accepted_ids.add(term)

    return ranked.loc[accepted_positions].sort_values(by='p_value', ascending=True)

if __name__ == "__main__":
    # One cut-off shared by the g:Profiler request and the local p-value filter.
    significance_threshold = 0.05
    output_path = "./output/text_files/ground_truth_pathways.txt"
    final_columns = ["Pathway", "description", "annotation term", "source", "p-value"]

    # Bind df_final before anything can fail, so a following cell that displays
    # it shows an empty table instead of raising NameError.
    df_final = pd.DataFrame(columns=final_columns)

    gene_list_string, regulation, num_genes = initialize_gene_list()
    print(f"The amount of genes {num_genes}")

    # The returned descriptions are not used below; the call is kept because
    # the helper is defined elsewhere and may have side effects — TODO confirm.
    gene_descriptions = extract_gene_descriptions(gene_list_string)

    # Split the comma-separated gene string, discarding blanks.
    genes = [gene.strip() for gene in gene_list_string.split(',') if gene.strip()]

    if not genes:
        print("No genes to query; skipping g:Profiler enrichment.")
    else:
        try:
            df = query_gprofiler_rest(
                genes=genes,
                organism="rnorvegicus",
                user_threshold=significance_threshold,
            )
        except requests.HTTPError as err:
            # Still best-effort (no re-raise), but report the failure instead
            # of hiding it.
            print(f"g:Profiler request failed, no pathways written: {err}")
        else:
            if "p_value" in df.columns:
                df_sig = df[df["p_value"] < significance_threshold].copy()
                df_best = filter_best_per_parent(df_sig).rename(
                    columns={"name": "Pathway", "native": "annotation term", "p_value": "p-value"}
                )
                df_final = df_best[final_columns]
            else:
                # An empty result list yields a frame without columns.
                print("g:Profiler returned no enrichment terms.")

            # Create the output directory on first run rather than failing in to_csv.
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
            df_final[["Pathway", "annotation term"]].to_csv(output_path, sep="\t", index=False)


The amount of genes 250
Total genes to process: 250
Genes: {'Ccn4', 'Lrrtm3', 'Marveld1', 'Mef2c', 'Tril', 'Drp2', 'Ighm', 'Atf5', 'Steap2', 'Tmem26', 'Dpysl5', 'Matn3', 'Ppm1f', 'Selplg', 'Nes', 'Pgm1', 'Rrm2', 'Nfatc1', 'Nek9', 'Ghr', 'Bace2', 'Prss23', 'Mag', 'Igf2bp2', 'Hs6st1', 'Arsb', 'Chmp2b', 'Robo2', 'Defb29', 'Lrp2', 'Pcsk5', 'Vps26c', 'Tnik', 'Klf9', 'Slc22a17', 'Gprc5a', 'Septin4', 'Poglut1', 'Sema4f', 'Efemp2', 'Acap2', 'Taf9b', 'Ccn3', 'Arsg', 'Cxcl12', 'Efna5', 'Msi1', 'Slc35a2', 'Sncg', 'Vps8', 'Rnf144a', 'Aff3', 'Pfn2', 'Gldn', 'Manba', 'Ecel1', 'Phldb2', 'Asap1', 'P2ry2', 'Mal', 'Map1lc3a', 'Itpripl1', 'Adamtsl1', 'ENSRNOG00000062503', 'Cobll1', 'Hsp90aa1', 'Igfbp3', 'Cggbp1', 'Aif1l', 'Gldc', 'Ryr3', 'Abcb4', 'Dpysl3', 'Tf', 'Ctbs', 'Tbccd1', 'Pkp1', 'Papola', 'Adamtsl3', 'Piezo2', 'Slc9a9', 'Psmd2', 'Rcn3', 'St3gal5', 'Ckap4', 'Klhdc2', 'Hspa12a', 'Magec2l1', 'Ppp1r36', 'Ebf1', 'Hs6st2', 'Smc6', 'Vrk1', 'Wnt5a', 'Gypc', 'Aig1', 'Cldn19', 'Id2', 'Srgap1', 'Ppfia4', '

In [10]:
# Display the enrichment table assembled in the previous cell (rich DataFrame repr).
df_final

Unnamed: 0,Pathway,description,annotation term,source,p-value
0,cell periphery,"""The broad region around and including the pla...",GO:0071944,GO:CC,4.299074e-13
1,multicellular organismal process,"""Any biological process, occurring at the leve...",GO:0032501,GO:BP,3.928781e-10
2,extracellular region,"""The space external to the outermost structure...",GO:0005576,GO:CC,2.623669e-09
4,regulation of cell motility,"""Any process that modulates the frequency, rat...",GO:2000145,GO:BP,1.567922e-08
6,system development,"""The process whose specific outcome is the pro...",GO:0048731,GO:BP,2.0751e-08
7,regulation of locomotion,"""Any process that modulates the frequency, rat...",GO:0040012,GO:BP,4.528831e-08
9,locomotion,"""Self-propelled movement of a cell or organism...",GO:0040011,GO:BP,9.629089e-08
10,anatomical structure development,"""The biological process whose specific outcome...",GO:0048856,GO:BP,1.221614e-07
11,cell migration,"""The controlled self-propelled movement of a c...",GO:0016477,GO:BP,3.510191e-07
12,vesicle,"""Any small, fluid-filled, spherical organelle ...",GO:0031982,GO:CC,4.542548e-07
