In [12]:
from unicodedata import category

import yaml

from linkml_store.utils.enrichment_analyzer import EnrichmentAnalyzer

DB = "mongodb://localhost:27017/metabolights"
CN = "maf"

In [13]:
!pip install oaklib



In [14]:
from oaklib import get_adapter
chebi_adapter = get_adapter("sqlite:obo:chebi")

In [15]:
from linkml_store import Client

# Attach to the database addressed by DB and fetch the two collections used
# throughout the notebook.
client = Client()
db = client.attach_database(DB)
# Collection named by CN; this is what the EnrichmentAnalyzer is built from.
collection = db.get_collection(CN)
# Study-level records; looked up by `isaInvestigation.identifier` in study_by_id.
study_collection = db.get_collection("study")

In [40]:
def get_info(study: dict):
    """Return the ``(title, description)`` pair for a study record.

    The first entry of ``isaInvestigation.studies`` is preferred. If that entry
    has no title, or its title is exactly ``"Investigation"``, the title and
    description of the enclosing ``isaInvestigation`` are returned instead.

    Keys that are absent, or present with a ``None``/empty value, are treated
    as missing instead of raising.

    Args:
        study: A study document, optionally holding an ``isaInvestigation``
            mapping.

    Returns:
        A ``(title, description)`` tuple; either element may be ``None``.
    """
    # `or {}` / `or []` also cover keys stored with an explicit null value,
    # which `dict.get(key, default)` would hand back unchanged as None.
    investigation = study.get("isaInvestigation") or {}
    studies = investigation.get("studies") or []
    first_study = (studies[0] if studies else None) or {}
    title = first_study.get("title")
    if title and title != "Investigation":
        return title, first_study.get("description")
    # Fall back to the investigation-level metadata.
    return investigation.get("title"), investigation.get("description")
    
    

In [41]:
# Smoke test: take the first row returned by the study collection and print
# the (title, description) pair that get_info extracts from it.
study = study_collection.find(limit=1).rows[0]
print(get_info(study))

('A metabolomic study of urinary changes in type 2 diabetes in human compared to the control group', 'Type 2 diabetes mellitus is the result of a combination of impaired insulin secretion with reduced insulin sensitivity of target tissues. There are an estimated 150 million affected individuals worldwide, of whom a large proportion remains undiagnosed because of a lack of specific symptoms early in this disorder and inadequate diagnostics. In this study, NMR-based metabolomic analysis in conjunction with uni- and multivariate statistics was applied to examine the urinary metabolic changes in Human type 2 diabetes mellitus patients compared to the control group. The human population were un medicated diabetic patients who have good daily dietary control over their blood glucose concentrations by following the guidelines on diet issued by the American Diabetes Association. Note: This is part of a larger study, please refer to the original paper below.')


In [57]:
def study_id_from_path(path: str):
    """Derive a study id from a slash-separated path.

    For a path such as ``foo/bar/MTBLS1/FILE.json`` the second-to-last
    segment (``MTBLS1``) is returned. A string without any ``/`` is returned
    unchanged.
    """
    segments = path.split("/")
    # Exactly one segment means the string held no "/" at all.
    return path if len(segments) == 1 else segments[-2]
    

def study_by_id(study_id: str):
    """Look up a study record by its identifier.

    ``study_id`` may be a bare identifier or a path containing ``/``; a path is
    first reduced to an identifier via ``study_id_from_path``. The study
    collection is then queried on ``isaInvestigation.identifier``.

    Returns:
        The first matching row, or ``None`` when the query returns no rows.
    """
    identifier = study_id_from_path(study_id) if "/" in study_id else study_id
    matches = study_collection.find({"isaInvestigation.identifier": identifier}).rows
    return matches[0] if matches else None

get_info(study_by_id("MTBLS1"))

('A metabolomic study of urinary changes in type 2 diabetes in human compared to the control group',
 'Type 2 diabetes mellitus is the result of a combination of impaired insulin secretion with reduced insulin sensitivity of target tissues. There are an estimated 150 million affected individuals worldwide, of whom a large proportion remains undiagnosed because of a lack of specific symptoms early in this disorder and inadequate diagnostics. In this study, NMR-based metabolomic analysis in conjunction with uni- and multivariate statistics was applied to examine the urinary metabolic changes in Human type 2 diabetes mellitus patients compared to the control group. The human population were un medicated diabetic patients who have good daily dietary control over their blood glucose concentrations by following the guidelines on diet issued by the American Diabetes Association. Note: This is part of a larger study, please refer to the original paper below.')

In [62]:
def info_by_id(study_id: str):
    """Return ``(title, description)`` for the study matching ``study_id``.

    The lookup is delegated to ``study_by_id``; when it yields nothing,
    ``(None, None)`` is returned so callers can always unpack two values.
    """
    record = study_by_id(study_id)
    if record:
        return get_info(record)
    return None, None

In [63]:
info_by_id("MTBLS138")

('Multi-omics phenotyping of the gut-liver axis reveals metabolic perturbations from a low-dose pesticide mixture in rats',
 'Health effects of pesticides are not always accurately detected using the current battery of regulatory toxicity tests. We compared standard histopathology and serum biochemistry measures and multi-omics analyses in a subchronic toxicity test of a mixture of six pesticides frequently detected in foodstuffs (azoxystrobin, boscalid, chlorpyrifos, glyphosate, imidacloprid and thiabendazole) in Sprague-Dawley rats. Analysis of water and feed consumption, body weight, histopathology and serum biochemistry showed little effect. Contrastingly, serum and caecum metabolomics revealed that nicotinamide and tryptophan metabolism were affected, which suggested activation of an oxidative stress response. This was not reflected by gut microbial community composition changes evaluated by shotgun metagenomics. Transcriptomics of the liver showed that 257 genes had their express

In [16]:
# Build the enrichment analyzer over the `collection` rows, passing "src" as
# the sample key and "c3p_classifications" as the classification key.
# NOTE(review): presumably "src" holds the source file path of each row (it is
# later split on "/" to get a study id) -- confirm against the collection schema.
ea = EnrichmentAnalyzer.from_collection(collection, sample_key="src", 
                                        classification_key="c3p_classifications")

In [17]:
sample_ids = ea.sample_ids

In [91]:
# Build two parallel outputs from the enrichment analysis:
#   results -> one flat row per (sample, enriched category); becomes the DataFrame
#   cases   -> one entry per sample that has at least one enriched category;
#              becomes the "cases" list of the llm-matrix suite
results = [] # for df
cases = [] # for llm-matrix
for sample in sample_ids:
    study_id = study_id_from_path(sample)
    study_title, desc = info_by_id(sample)
    if not study_title:
        # No title could be resolved for this sample's study; skip it.
        continue
    # Last path segment, truncated at its first "."
    file_name = sample.split("/")[-1].split(".")[0]
    ideals = []
    ers = []
    for r in ea.find_enriched_categories(sample, p_value_threshold=0.05):
        category_label = chebi_adapter.label(r.category)
        # Dump once and reuse; the row below gets its own copy via ** unpacking.
        enrichment = r.model_dump()
        results.append({
            "study_id": study_id,
            "study": study_title,
            "description": desc,
            "sample_file": file_name,
            "category_label": category_label,
            **enrichment,
        })
        ers.append(enrichment)
        ideals.append((r.adjusted_p_value, category_label))
    # Sort by adjusted p-value, breaking ties on the label. The explicit key
    # maps a missing label (None) to "" so that tied p-values never end up
    # comparing None against a str, which would raise TypeError.
    ideals.sort(key=lambda pair: (pair[0], pair[1] or ""))
    if ideals:
        enrichment_summary = "; ".join([f"{i+1}. {label}" for i, (_, label) in enumerate(ideals)])
        cases.append({
            "input": f"Title: {study_title}\nDescription: {desc}\nEnriched: {enrichment_summary}",
            "original_input": {
                "study_id": study_id,
                "study": study_title,
                "description": desc,
                "sample_file": file_name,
                "enrichment": ers,
            },
            # Every generated case carries "YES" as its expected answer.
            "ideal": "YES",
        })

In [87]:
# Evaluation suite definition: a single "enrichment" template (system prompt,
# prompt format and metric), a hyperparameter matrix with one model, and the
# `cases` list built from the enrichment results. Key order is significant
# because the suite is later dumped to YAML with sort_keys=False.
suite = {
    "name": "metabolights_c3po_enrichment",
    "template": "enrichment",
    "templates": {
        "enrichment": {
            "system": """I will give you a metabolomics study title and description,
            and also the ordered list of chemical classes that are enriched for that study.
            Tell me if the enrichment results make sense.
            Return YES, NO, or OTHER, followed by an explanation.
            The explanation can be of any form and length but your response must start with YES, NO, or OTHER.
            """,
            # Each case's "input" string is substituted in verbatim.
            "prompt": '{input}',
            "metrics": ["qa_with_explanation"],
        },
    },
    "matrix": {
        "hyperparameters": {
            "model": ["claude-sonnet"]
        },
    },
    "cases": cases,
}

In [88]:
import os

import yaml

# Create the target directory here so that this cell works on a fresh kernel;
# the only other place it is created is a `mkdir` cell further down.
os.makedirs("output", exist_ok=True)
# Serialize the suite preserving the dict's insertion order.
with open("output/metabolights_enrichment_suite.yaml", "w") as f:
    f.write(yaml.dump(suite, sort_keys=False))

In [89]:
 import pandas as pd

df = pd.DataFrame(results)

In [90]:
df

Unnamed: 0,study_id,study,description,sample_file,category_label,category,fold_change,original_p_value,adjusted_p_value
0,MTBLS10416,A metabolic switch orchestrated by IL-18 and t...,Tissues are exposed to diverse inflammatory ch...,m_MTBLS10416_LC-MS_alternating_hilic_metabolit...,fatty acid anion,CHEBI:28868,46481.178571,2.724382e-14,2.179505e-13
1,MTBLS10416,A metabolic switch orchestrated by IL-18 and t...,Tissues are exposed to diverse inflammatory ch...,m_MTBLS10416_LC-MS_alternating_hilic_metabolit...,monocarboxylic acid anion,CHEBI:35757,22832.859649,3.331356e-09,1.332542e-08
2,MTBLS10416,A metabolic switch orchestrated by IL-18 and t...,Tissues are exposed to diverse inflammatory ch...,m_MTBLS10416_LC-MS_alternating_hilic_metabolit...,tricarboxylic acid,CHEBI:27093,1191.825092,8.387285e-04,2.236609e-03
3,MTBLS10416,A metabolic switch orchestrated by IL-18 and t...,Tissues are exposed to diverse inflammatory ch...,m_MTBLS10416_LC-MS_alternating_hilic_metabolit...,fatty alcohol,CHEBI:24026,584.668913,1.709033e-03,2.962744e-03
4,MTBLS10416,A metabolic switch orchestrated by IL-18 and t...,Tissues are exposed to diverse inflammatory ch...,m_MTBLS10416_LC-MS_alternating_hilic_metabolit...,cation,CHEBI:36916,539.582504,1.851715e-03,2.962744e-03
...,...,...,...,...,...,...,...,...,...
1336,MTBLS959,Subcellular antibiotic visualization reveals a...,"Tuberculosis, caused by the intracellular path...",m_MTBLS959_LC-MS_alternating_reverse-phase_met...,diterpenoid,CHEBI:23849,9.885406,1.783922e-02,1.783922e-02
1337,MTBLS968,The accumulation profiles of terpene metabolit...,Aroma is an important parameter for table grap...,m_MTBLS968_GC-MS_positive__metabolite_profilin...,fatty alcohol,CHEBI:24026,4823.518531,1.853589e-39,7.414355e-39
1338,MTBLS968,The accumulation profiles of terpene metabolit...,Aroma is an important parameter for table grap...,m_MTBLS968_GC-MS_positive__metabolite_profilin...,prenols,CHEBI:26244,32536.825000,1.593974e-09,3.187947e-09
1339,MTBLS968,The accumulation profiles of terpene metabolit...,Aroma is an important parameter for table grap...,m_MTBLS968_GC-MS_positive__metabolite_profilin...,fatty aldehyde,CHEBI:35746,7935.810976,2.904251e-08,3.872335e-08


In [60]:
!mkdir -p output

In [61]:
df.to_csv("output/enrichment_results.csv", index=False)