In [140]:
pip install -U langchain langchain-core langchain-community

Note: you may need to restart the kernel to use updated packages.


In [141]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_tavily import TavilySearch
from langgraph.graph import StateGraph, END
from langgraph.checkpoint.memory import MemorySaver
from typing import TypedDict, List, Annotated, Literal
from uuid import uuid4
import operator
import warnings
import asyncio
import os
import time
from langgraph.checkpoint.sqlite import SqliteSaver
from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage, AIMessage, ChatMessage
from pydantic import BaseModel
from typing import List, Dict
memory = SqliteSaver.from_conn_string(":memory:")
warnings.filterwarnings("ignore", category=UserWarning)

retriever = None


In [163]:
# ============================================
# SETUP RAG
# ============================================

def setup_knowledge_base(folder_path):
    """
    Sets up a Persistent Vector Database (Chroma).
    - If './chroma_db' exists: Loads the DB from disk (Fast).
    - If not: Reads PDFs, embeds them, and saves to disk (Slow, one-time).
    """
    persist_dir = "./chroma_db"
    embedding_model = OllamaEmbeddings(model="nomic-embed-text")
    collection_name = "seminar_papers_collection"

    # 1. CHECK IF DB EXISTS
    if os.path.exists(persist_dir) and os.listdir(persist_dir):
        print(f"üìÇ Found existing Vector DB in '{persist_dir}'. Loading...")
        
        vectorstore = Chroma(
            persist_directory=persist_dir,
            embedding_function=embedding_model,
            collection_name=collection_name
        )
        
        # Return retriever directly
        print("‚úÖ Vector DB Loaded successfully.")
        return vectorstore.as_retriever(search_kwargs={"k": 3})

    # 2. IF NOT EXISTS, CREATE NEW
    print(f"üìÇ No existing DB found. Creating new one from '{folder_path}'...")
    
    if not os.path.exists(folder_path):
        print(f"‚ö†Ô∏è Error: The folder '{folder_path}' does not exist.")
        return None

    # Load PDFs
    loader = PyPDFDirectoryLoader(folder_path)
    docs = loader.load()
    
    if not docs:
        print("‚ö†Ô∏è No PDFs found in the folder.")
        return None

    print(f"   - Loaded {len(docs)} pages.")

    # Split Text
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=200
    )
    splits = splitter.split_documents(docs)
    print(f"   - Split into {len(splits)} chunks.")

    # Create and Save Vector Store
    # Passing 'persist_directory' automatically saves it to disk
    vectorstore = Chroma.from_documents(
        documents=splits, 
        embedding=embedding_model, 
        collection_name=collection_name,
        persist_directory=persist_dir
    )
    
    print(f"‚úÖ Vector DB created and saved to '{persist_dir}'")
    return vectorstore.as_retriever(search_kwargs={"k": 3})

# Wrapper function to call from main
async def ingest_local_papers():
    global retriever
    # This calls the function above
    retriever = setup_knowledge_base("./papers")

In [143]:
# ============================================
# STATE DEFINITION
# ============================================

class SeminarState(TypedDict):
    # INPUT
    topic: str
    
    # PLANNING PHASE
    outline: List[str]  # Simple 5-section outline
    current_section_index: int
    
    # RESEARCH PHASE (for current section)
    current_section: str
    key_points: List[str]  # What to discuss in each section
    rag_context: str
    web_context: str
    
    # WRITING PHASE
    draft: str  # Draft content for each section
    section_drafts: List[str]

    # Final outputs
    final_paper: str
    
    # VALIDATION
    is_valid: bool
    feedback: str
    revision_count: int


class Keypoints(BaseModel):
    Keypoints: List[str]

class Plan(BaseModel):
    Plan: List[str]

In [144]:
plan_outline_prompt = """You are an expert seminar paper planner. Given the seminar topic, create a simple outline with 5-6 sections that covers the key aspects of the topic.
Provide only the section titles in a numbered list format."""

In [145]:
research_paln_prompt = """Your are a researcher charged with providing information that can \
be used when writing the following section of a seminar report. You are planning what to write this specific section.

Main Topic: {topic}
section: {section}

List 3-5 key points that MUST be covered in this section for the seminar report.

Return as a Python list of key points as values.
"""

In [146]:
writing_prompt = """You are an academic research writer.Your task is to WRITE the academic section titled "{section}" for the seminar report.
Utilize all the information below as needed: 

KEY POINTS TO COVER:
{key_points}
RELEVANT CONTEXT:
{CONTEXT}

INSTRUCTIONS:
1. Write the section in detail, covering all key points thoroughly as specified as academic report.
2. Stay focused on the MAIN TOPIC
3. Cover all key points listed above
4. Maintain academic tone
5. Use information from relevant context
6. No bullet points - write in paragraphs 

Write the section now:
"""

In [147]:
research_critic_prompt = """You are a research reviewer to Review this section content of a seminar report. Generate critique and recommendations for the user's submission. \

Main Topic: {topic}
Section Title: {section}

Check:
1. Does it stay focused on the main topic?
2. Are all key points covered?
3. Is the language academic and clear?

Respond EXACTLY:
- If good: "APPROVE"
- If needs work: "REVISE: [specific issue]"
"""

In [148]:
full_paper_prompt = """
You are a senior academic editor.

TASK:
You are given independently written sections of a seminar pape report. for the topic: "{topic}" the outline for "{outline}".

GOALS:
1. write the FULL paper with section headings, including an abstract and keywords.
2. Ensure smooth transitions between sections
3. Remove redundancy
4. Ensure consistent terminology
5. Maintain academic tone
    - Add an abstract (150 words)
    - Add keywords
6. Keep all technical content
7. Insert placeholder citations like [1], [2] where appropriate

SECTIONS CONTENT:
{combined_sections}

Write the FULL paper with section headings, including an abstract and keywords."""

In [None]:
# ============================================
# MODELS
# ============================================

llm = ChatOllama(model="qwen3:8b", temperature=0.8, num_ctx=32000) 
pcllm = ChatOllama(model="deepseek-r1", temperature=0.5)  
web_search = TavilySearch(tavily_api_key="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",max_results=2)  # Reduced for speed


In [150]:
# ============================================
# NODES - HUMAN-LIKE WRITING PROCESS
# ============================================

async def plan_outline(state: SeminarState):
    """
    Step 1: Create SIMPLE 5-section outline
    Mimics: Human deciding report structure
    """
    print("\n" + "="*60)
    print("üìã STEP 1: PLANNING OUTLINE")
    print("="*60)
    response = await pcllm.with_structured_output(Plan).ainvoke([
        SystemMessage(content=plan_outline_prompt), 
        HumanMessage(content=state['topic'])
    ])
    
    outline = response.Plan
    print(f"\n‚úì Created outline with {len(outline)} sections:")
    for i, section in enumerate(outline, 1):
        print(f"  {section}")
    
    return {
        "outline": outline,
        "current_section_index": 0,
        "final_report": []
    }

In [151]:
def should_continue(state: SeminarState) -> Literal["continue", "done"]:
    if state["current_section_index"] >= len(state["outline"]):
        return "done"
    return "continue"


In [152]:
async def brainstorm_section(state: SeminarState):
    idx = state["current_section_index"]
    section = state["outline"][idx]
    state["current_section"] = section
    print(section)
    print("\n" + "="*60)
    print(f"üí° STEP 2: BRAINSTORMING ")
    print("="*60)
    key_points = await llm.with_structured_output(Keypoints).ainvoke([
        SystemMessage(content=research_paln_prompt.format(
        topic=state['topic'],
        section=section)),
        HumanMessage(content=state['topic'])
    ])
    
    key_points = key_points.Keypoints
    
    print(f"\n‚úì Identified {len(key_points)} key points:")
    
    return {
        "key_points": key_points,
        "current_section": section
    }


In [153]:
async def research_section(state: SeminarState):
    print("\n" + "="*60)
    print(f"üîç STEP 3: GATHERING INFORMATION")
    print("="*60)
    section = state["current_section"]
    points = state["key_points"]

    web_text = ""
    rag_text = ""
    
    for p in points:
        
        # Research query combines topic + section + key points
        query = f"For this topic: {state['topic']} find the information for{' '.join(p)}"
        
        print(f"\n‚Üí Web search: {query[:80]}...")
        try:
            response = await web_search.ainvoke(query)
            for r in response['results']:
                web_text = web_text + r['content']
    
        except Exception as e:
            print(f"  ‚ö†Ô∏è Web search failed: {e}")
            web_text = "No web results available"
        
        print(f"\n‚Üí RAG retrieval: {section}...")
        try:
            docs = await retriever.ainvoke(query)
            rag_text = rag_text + ("\n\n".join([
                f"Document excerpt {i+1}:\n{d.page_content[:500]}"
                for i, d in enumerate(docs[:3])
            ]))
        except Exception as e:
            print(f"  ‚ö†Ô∏è RAG retrieval failed: {e}")
            rag_text = "No local documents available"

    print(f"\n‚úì Gathered {len(web_text)} chars from web, {len(rag_text)} chars from papers")

    return {
        "web_context": web_text,
        "rag_context": rag_text
    }
    


In [154]:
async def write_section(state: SeminarState):
    """
    Step 4: Write the actual content
    Mimics: Human writing with gathered information
    """
    print("\n" + "="*60)
    print(f"‚úçÔ∏è  STEP 4: WRITING SECTION")
    print("="*60)
    
    user_message = HumanMessage(
        content=f"MAIN TOPIC: {state['topic']}\n\nHere is my plan:\n\n{state['outline']}")
    messages = [
        SystemMessage(
            content=writing_prompt.format(section=state['current_section'], key_points=state['key_points'], CONTEXT=state.get('web_context', '') + "\n" + state.get('rag_context', ''))
        ),
        user_message
        ]
    response = await llm.ainvoke(messages)
    print(f"the section {state['current_section_index']+1}. {state['current_section']}: is completed")
    draft = f"## Section {state['current_section_index']+1}. {state['current_section']}: \n\n{response.content.strip()}"
    
    word_count = len(draft.split())
    print(f"\n‚úì Generated draft: {word_count} words")
    
    return {
        "draft": draft, 
        "revision_number": state.get("revision_number", 0) + 1
    }

In [155]:
def route_after_validation(state: SeminarState):
    if state["is_valid"]:
        return "save_and_next"
    if state["revision_count"] >= 2:
        return "save_and_next"
    return "write_section"


In [156]:
def save_and_next(state: SeminarState):
    drafts = state.get("section_drafts", [])
    drafts.append(state["draft"])

    return {
        "section_drafts": drafts,
        "current_section_index": state["current_section_index"] + 1
    }


In [157]:
async def validate_section(state: SeminarState):
    """
    Step 5: Check if section is good
    Mimics: Human self-editing
    """
    print("\n" + "="*60)
    print(f"üîç STEP 5: VALIDATION")
    print("="*60)
    topic = state['topic']
    draft = state['draft']
    messages = [
        SystemMessage(content=research_critic_prompt.format(topic=topic,section=state['current_section'])), 
        HumanMessage(content=draft)
    ]
    
    response = await pcllm.ainvoke(messages)
    feedback = response.content.strip()
    
    is_valid = "approve" in feedback.lower()
    
    if is_valid:
        print(f"‚úì Section APPROVED")
    else:
        print(f"‚úó Needs revision: {feedback[:100]}")
    
    return {
        "is_valid": is_valid,
        "feedback": feedback,
        "revision_count": state["revision_count"] + 1
    }

In [158]:
async def synthesize_full_report(state: SeminarState):
    print("\nüß† GLOBAL SYNTHESIS: CONNECTING ALL SECTIONS")

    combined_sections = "\n\n".join(state["section_drafts"])

    prompt = full_paper_prompt.format(
        topic=state['topic'],
        outline=", ".join(state['outline']),
        combined_sections=combined_sections
    )
    response = await llm.ainvoke(prompt)
    return {
        "final_paper": response.content
    }


In [159]:
# ============================================
# BUILD GRAPH
# ============================================

def build_agent():
    """
    Build the workflow graph
    
    Flow:
    plan ‚Üí [for each section: brainstorm ‚Üí research ‚Üí write ‚Üí validate ‚Üí save] ‚Üí end
    """
    workflow = StateGraph(SeminarState)

    workflow.add_node("plan", plan_outline)
    workflow.add_node("brainstorm", brainstorm_section)
    workflow.add_node("research", research_section)
    workflow.add_node("write", write_section)
    workflow.add_node("validate", validate_section)
    workflow.add_node("save_and_next", save_and_next)
    workflow.add_node("synthesize", synthesize_full_report)

    workflow.set_entry_point("plan")

    workflow.add_edge("plan", "brainstorm")
    workflow.add_edge("brainstorm", "research")
    workflow.add_edge("research", "write")
    workflow.add_edge("write", "validate")

    workflow.add_conditional_edges(
        "validate",
        route_after_validation,
        {
            "write_section": "write",
            "save_and_next": "save_and_next"
        }
    )
    workflow.add_conditional_edges(
    "save_and_next",
    should_continue,
    {
        "continue": "brainstorm",
        "done": "synthesize"
    }
)

    workflow.add_edge("synthesize", END)

    # Compile with in-memory checkpointing
    memory = MemorySaver()
    app = workflow.compile(checkpointer=memory)
    
    return app, memory


In [162]:
# ============================================
# MAIN EXECUTION
# ============================================

async def main():
    print("\n" + "="*60)
    print("üöÄ SEMINAR REPORT GENERATOR")
    print("="*60)
    global retriever
    print("\nüîß Ingesting local research papers...")
    await ingest_local_papers()

    print("\nüß† Building multi-agent workflow...")
    app, memory = build_agent()

    topic = input("\nEnter Seminar Topic: ").strip()

    if not topic:
        topic = "Large Language Models for Text Summarization"
        print(f"Using default topic: {topic}")

    initial_state = {
        "topic": topic,
        "outline": [],
        "current_section_index": 0,
        "current_section": "",
        "key_points": [],
        "rag_context": "",
        "web_context": "",
        "draft": "",
        "section_drafts": [],
        "final_paper": "",
        "is_valid": False,
        "feedback": "",
        "revision_count": 0
    }

    config = {
    "configurable": {"thread_id": str(uuid4())},
    "recursion_limit": 100
    }


    print("\nüöÄ Starting report generation...\n")

    start_time = time.time()

    async for s in app.astream(initial_state, config):
        print(s)

    elapsed = time.time() - start_time
    print(f"\n‚è≥ Total time: {elapsed/60:.2f} minutes")

    # --- SAVE FILE ---
    final_state = app.get_state(config).values
    full_report = final_state['final_paper']

    filename = "Final_Seminar_Report.md"
    with open(filename, "w", encoding="utf-8") as f:
        f.write(f"# Seminar Report: {topic}\n\n")
        f.write(full_report)
        
    print(f"\n‚úÖ Report generated successfully: {filename}")

In [161]:
if __name__ == "__main__":
    # asyncio.run(main())
    await main()



üöÄ SEMINAR REPORT GENERATOR

üîß Ingesting local research papers...
üìÇ Loading PDFs from ./papers...

üß† Building multi-agent workflow...

üöÄ Starting report generation...


üìã STEP 1: PLANNING OUTLINE

‚úì Created outline with 7 sections:
  Introduction to Text Generation with LLMs
  Understanding Large Language Models (LLMs)
  Training and Fine-Tuning for Text Generation
  Mechanisms of Text Generation (e.g., Decoding Strategies)
  Applications and Use Cases of Text Generation
  Ethical Considerations and Challenges in Text Generation
  Conclusion: Future Directions and Implications
{'plan': {'outline': ['Introduction to Text Generation with LLMs', 'Understanding Large Language Models (LLMs)', 'Training and Fine-Tuning for Text Generation', 'Mechanisms of Text Generation (e.g., Decoding Strategies)', 'Applications and Use Cases of Text Generation', 'Ethical Considerations and Challenges in Text Generation', 'Conclusion: Future Directions and Implications'], 'current_secti