<a href="https://colab.research.google.com/github/labanya-1/-Smart-Assistant-for-Research-Summarization/blob/main/llm_dspy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!uv pip install dspy-ai


[2mUsing Python 3.12.12 environment at: /usr[0m
[2K[2mResolved [1m75 packages[0m [2min 1.10s[0m[0m
[2K[2mPrepared [1m13 packages[0m [2min 567ms[0m[0m
[2mUninstalled [1m1 package[0m [2min 9ms[0m[0m
[2K[2mInstalled [1m13 packages[0m [2min 149ms[0m[0m
 [32m+[39m [1masyncer[0m[2m==0.0.8[0m
 [32m+[39m [1mbackoff[0m[2m==2.2.1[0m
 [32m+[39m [1mcolorlog[0m[2m==6.10.1[0m
 [32m+[39m [1mdiskcache[0m[2m==5.6.3[0m
 [32m+[39m [1mdspy[0m[2m==3.0.4[0m
 [32m+[39m [1mdspy-ai[0m[2m==3.0.4[0m
 [32m+[39m [1mfastuuid[0m[2m==0.14.0[0m
 [32m+[39m [1mgepa[0m[2m==0.0.17[0m
 [31m-[39m [1mgrpcio[0m[2m==1.76.0[0m
 [32m+[39m [1mgrpcio[0m[2m==1.67.1[0m
 [32m+[39m [1mjson-repair[0m[2m==0.54.3[0m
 [32m+[39m [1mlitellm[0m[2m==1.80.10[0m
 [32m+[39m [1mmagicattr[0m[2m==0.1.6[0m
 [32m+[39m [1moptuna[0m[2m==4.6.0[0m


In [2]:
import json
import dspy
import copy
from typing import List, Optional
from typing import Literal, Dict, Union
from dspy.adapters import XMLAdapter

In [3]:
API_KEY="ak_23N09U4xZ4U04m46AW13S8rS1533S"
main_lm = dspy.LM("openai/LongCat-Flash-Chat", api_key=API_KEY,api_base="https://api.longcat.chat/openai/v1")

dspy.settings.configure(lm=main_lm,adapter=dspy.XMLAdapter())

In [4]:
# ---------------------------------------------------------
# 1.  ENTITY + ATTRIBUTE EXTRACTION
# ---------------------------------------------------------
# import dspy
from typing import List, Dict, Tuple
from pydantic import BaseModel, Field

class EntityWithAttr(BaseModel):
    entity: str = Field(description="the named entity")
    attr_type: str = Field(description="semantic type of the entity (e.g. Drug, Disease, Symptom, etc.)")

class ExtractEntities(dspy.Signature):
    """From the paragraph extract all relevant entities and their semantic attribute types."""
    paragraph: str = dspy.InputField(desc="input paragraph")
    entities: List[EntityWithAttr] = dspy.OutputField(desc="list of entities and their attribute types")

extractor = dspy.Predict(ExtractEntities)

# ---------------------------------------------------------
# 2.  DEDUPLICATOR (recursive batching + confidence loop)
# ---------------------------------------------------------
class DeduplicateEntities(dspy.Signature):
    """Given a list of (entity, attr_type) decide which ones are duplicates.
    Return a deduplicated list and a confidence that the remaining items are ALL distinct."""
    items: List[EntityWithAttr] = dspy.InputField(desc="batch of entities to deduplicate")
    deduplicated: List[EntityWithAttr] = dspy.OutputField(desc="deduplicated list")
    confidence: float = dspy.OutputField(
        desc="confidence (0-1) that every item in deduplicated is semantically distinct"
    )

dedup_predictor = dspy.ChainOfThought(DeduplicateEntities)

def deduplicate_with_lm(
    items: List[EntityWithAttr],
    *,
    batch_size: int = 10,
    target_confidence: float = 0.9,
) -> List[EntityWithAttr]:
    """
    Recursively deduplicate using the LM.
    Works by:
      1. splitting into batches of `batch_size`
      2. for each batch asking the LM for duplicates + confidence
      3. rerunning the batch until confidence >= target_confidence
      4. concatenating results from all batches
    """
    if not items:
        return []

    # helper to process one batch
    def _process_batch(batch: List[EntityWithAttr]) -> List[EntityWithAttr]:
        while True:
            pred = dedup_predictor(items=batch)
            if pred.confidence >= target_confidence:
                return pred.deduplicated
            # otherwise loop again with same batch

    # split into batches and process
    results = []
    for i in range(0, len(items), batch_size):
        batch = items[i : i + batch_size]
        results.extend(_process_batch(batch))
    return results


In [5]:

# ---------------------------------------------------------
# 4.  RELATION EXTRACTION
# ---------------------------------------------------------
class Relation(BaseModel):
    subj: str = Field(description="subject entity (exact string as in deduplicated list)")
    pred: str = Field(description="short predicate / relation phrase")
    obj:  str = Field(description="object entity (exact string as in deduplicated list)")

class ExtractRelations(dspy.Signature):
    """Given the original paragraph and a list of unique entities, extract all factual (subject, predicate, object) triples that are explicitly stated or clearly implied."""
    paragraph: str = dspy.InputField(desc="original paragraph")
    entities:  List[str] = dspy.InputField(desc="list of deduplicated entity strings")
    relations: List[Relation] = dspy.OutputField(desc="list of subject-predicate-object triples")

rel_predictor = dspy.ChainOfThought(ExtractRelations)

# ---------------------------------------------------------
# 5.  MERMAID SERIALISER  (revised)
# ---------------------------------------------------------
def triples_to_mermaid(
    triples: list[Relation],
    entity_list: list[str],
    max_label_len: int = 40
) -> str:
    """
    Convert triples to a VALID Mermaid flowchart LR diagram.
    """
    entity_set = {e.strip().lower() for e in entity_list}
    lines = ["flowchart LR"]

    def _make_id(s: str) -> str:
        # Create valid Mermaid node ID (no spaces or special chars)
        return s.strip().replace(" ", "_").replace("(", "").replace(")", "").replace("-", "_")

    for t in triples:
        subj_norm, obj_norm = t.subj.strip().lower(), t.obj.strip().lower()

        if obj_norm in entity_set:
            src, dst, lbl = t.subj, t.obj, t.pred
        elif subj_norm in entity_set:
            src, dst, lbl = t.obj, t.subj, t.pred
        else:
            continue

        # Sanitize label
        lbl = lbl.strip()
        if len(lbl) > max_label_len:
            lbl = lbl[:max_label_len - 3] + "..."

        # Use valid IDs with display labels
        src_id, dst_id = _make_id(src), _make_id(dst)
        lines.append(f'    {src_id}["{src}"] -->|{lbl}| {dst_id}["{dst}"]')

    return "\n".join(lines)

# ---------------------------------------------------------
# 6.  END-TO-END RUN  (FIXED)
# ---------------------------------------------------------
if __name__ == "__main__":
    paragraph = """
    Effects of organic amendments on productivity, nitrogen uptake, and protein content in pea--barley intercrops compared to the sole crops
    agricultural
    barleyintercropping systemsorganic farmingpeasustainable agriculture

    +1
    Saad Mir,Vaibhav Chaudhary,Nicolò Maria Villa,Bhim Ghaley
    Abstract
    Cereal-legume intercropping and organic amendments are promising strategies to boost crop productivity, land use efficiency, and sustainability. However, their performance varies depending on pedo-climatic zones and crop types. In Denmark, pea-barley intercrop is commonly practiced for seed harvest and fodder production. Therefore, the objective of this study was to investigate the effects of organic amendments on productivity, nitrogen (N) uptake, and barley grain protein in pea-barley intercrops (PB IC) compared to sole crops. A field trial was conducted using a strip-plot design with three cropping systems—PB IC, pea sole (PS), and barley sole (BS) with six organic amendment treatments: control (T0), biochar (T1), compost (T2), insect frass (T3), vermicompost (T4), and pelletized frass (T5). Averaged across organic amendments, PB IC increased aboveground dry biomass (AGDB) by 18–57% and grain yield by 12–135% compared to sole crops. Grain N-uptake under PB IC increased by 66–94%, compared to sole crops. PB IC suppressed weed biomass by 83% relative to PS. Averaged across cropping systems, T5 increased grain yield by 105%, N-uptake in straw by 49%, and in grains by 101%, compared to T0. Land equivalent ratio (LER) ranged from 1.15-2.47 across treatments, indicating improved land use efficiency. Barley protein content was consistently higher in PB IC than in BS. PB IC combined with organic amendments—particularly pelletized frass, significantly increased crop yield, land use efficiency, N-uptake, and grain quality. This field study provides robust evidence of the multiple benefits of integrating pea-barley intercropping with organic amendments for sustainable intensification.
    """

    # --- 3a. extract entities (from Section 1)
    extracted = extractor(paragraph=paragraph)
    print("Extracted entities:")
    for e in extracted.entities:
        print(" -", e.entity, "=>", e.attr_type)

    # --- 3b. deduplicate (from Section 1)
    unique = deduplicate_with_lm(extracted.entities, batch_size=10, target_confidence=0.9)
    print("\nDeduplicated entities:")
    for e in unique:
        print(" -", e.entity, "=>", e.attr_type)

    # Prepare entity strings for relation extraction
    entity_strings = [e.entity for e in unique]

    # --- 4. relation extraction
    rel_out = rel_predictor(paragraph=paragraph, entities=entity_strings)
    print("\nExtracted relations:")
    for r in rel_out.relations:
        print(" -", r.subj, "--", r.pred, "-->", r.obj)

    # --- 5. generate Mermaid diagram
    mermaid_code = triples_to_mermaid(
        rel_out.relations,
        entity_strings
    )
    print("\nValid Mermaid diagram:\n")
    print("mermaid")
    print(mermaid_code)
    print("")

Extracted entities:
 - organic amendments => AgriculturalPractice
 - productivity => AgriculturalMetric
 - nitrogen uptake => NutrientUptake
 - protein content => NutritionalComponent
 - pea-barley intercrops => CropSystem
 - sole crops => CropSystem
 - cereal-legume intercropping => AgriculturalPractice
 - organic farming => AgriculturalPractice
 - sustainable agriculture => AgriculturalPractice
 - Denmark => Location
 - seed harvest => AgriculturalActivity
 - fodder production => AgriculturalActivity
 - field trial => ResearchMethod
 - strip-plot design => ExperimentalDesign
 - cropping systems => AgriculturalSystem
 - pea sole => CropSystem
 - barley sole => CropSystem
 - control => Treatment
 - biochar => OrganicAmendment
 - compost => OrganicAmendment
 - insect frass => OrganicAmendment
 - vermicompost => OrganicAmendment
 - pelletized frass => OrganicAmendment
 - aboveground dry biomass => BiomassMeasure
 - grain yield => YieldMeasure
 - grain N-uptake => NutrientUptake
 - weed b

In [6]:
def run_dspy_pipeline(text: str):
    extracted = extractor(paragraph=text)

    unique_entities = deduplicate_with_lm(
        extracted.entities,
        batch_size=10,
        target_confidence=0.9
    )

    entity_strings = [e.entity for e in unique_entities]

    relations = rel_predictor(
        paragraph=text,
        entities=entity_strings
    ).relations

    return unique_entities, relations


In [7]:
!pip install beautifulsoup4 requests




In [8]:
import requests
from bs4 import BeautifulSoup

def extract_text_from_url(url: str, max_chars: int = 12000) -> str:
    html = requests.get(url, timeout=20).text
    soup = BeautifulSoup(html, "html.parser")

    for tag in soup(["script", "style", "nav", "footer", "header"]):
        tag.decompose()

    text = " ".join(soup.stripped_strings)
    return text[:max_chars]


In [9]:
test_url = "https://en.wikipedia.org/wiki/Sustainable_agriculture"
text = extract_text_from_url(test_url)
print(text[:1000])


Please set a user-agent and respect our robot policy https://w.wiki/4wJS. See also https://phabricator.wikimedia.org/T400119.


In [10]:
entities, relations = run_dspy_pipeline(text)


In [11]:
entity_strings = [e.entity for e in entities]

mermaid_code = triples_to_mermaid(
    relations,
    entity_strings
)

print(mermaid_code)


flowchart LR
    user_agent["user-agent"] -->|respect| robot_policy["robot policy"]
    https://w.wiki/4wJS["https://w.wiki/4wJS"] -->|hosts| robot_policy["robot policy"]
    https://phabricator.wikimedia.org/T400119["https://phabricator.wikimedia.org/T400119"] -->|see also| robot_policy["robot policy"]


In [12]:
def save_mermaid(code: str, index: int):
    filename = f"mermaid_{index}.md"
    with open(filename, "w") as f:
        f.write("```mermaid\n")
        f.write(code)
        f.write("\n```")
    return filename


In [13]:
save_mermaid(mermaid_code, 1)


'mermaid_1.md'

In [14]:
def entities_to_csv_rows(entities, url):
    rows = []
    for e in entities:
        rows.append({
            "link": url,
            "tag": e.entity,
            "tag_type": e.attr_type
        })
    return rows


In [15]:
URLS = [
    "https://en.wikipedia.org/wiki/Sustainable_agriculture",
    "https://www.nature.com/articles/d41586-025-03353-5",
    "https://www.sciencedirect.com/science/article/pii/S1043661820315152",
    "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10457221/",
    "https://www.fao.org/3/y4671e/y4671e06.htm",
    "https://www.medscape.com/viewarticle/time-reconsider-tramadol-chronic-pain-2025a1000ria",
    "https://www.sciencedirect.com/science/article/pii/S0378378220307088",
    "https://www.frontiersin.org/news/2025/09/01/rectangle-telescope-finding-habitable-planets",
    "https://www.medscape.com/viewarticle/second-dose-boosts-shingles-protection-adults-aged-65-years-2025a1000ro7",
    "https://www.theguardian.com/global-development/2025/oct/13/astro-ambassadors-stargazers-himalayas-hanle-ladakh-india"
]


In [16]:
all_csv_rows = []

for i, url in enumerate(URLS, start=1):
    print(f"Processing {i}/10")

    text = extract_text_from_url(url)

    entities, relations = run_dspy_pipeline(text)

    entity_strings = [e.entity for e in entities]

    mermaid = triples_to_mermaid(relations, entity_strings)

    save_mermaid(mermaid, i)

    all_csv_rows.extend(entities_to_csv_rows(entities, url))


Processing 1/10
Processing 2/10
Processing 3/10
Processing 4/10
Processing 5/10
Processing 6/10
Processing 7/10
Processing 8/10
Processing 9/10
Processing 10/10


In [17]:
unique = {}
for r in all_csv_rows:
    key = (r["link"], r["tag"])
    unique[key] = r

final_rows = list(unique.values())


In [18]:
import csv

with open("tags.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(
        f,
        fieldnames=["link", "tag", "tag_type"]
    )
    writer.writeheader()
    writer.writerows(final_rows)
