In [None]:
from crewai import Agent, Task, Crew, LLM
from crewai.tools import BaseTool
from pydantic import BaseModel, Field
import os, tarfile, requests, re
from pathlib import Path
from typing import Type

# -------------------------------
# LLM CONFIG
# -------------------------------
# Shared model handle used by every agent in this notebook.
# Points at an Ollama server on localhost (port 11434) serving qwen2.5vl:7b.
llm = LLM(
    base_url="http://localhost:11434",
    model="ollama/qwen2.5vl:7b",
    is_litellm=True,
)

# -------------------------------
# TOOL: Download arXiv source
# -------------------------------
# Argument schema for DownloadPaperSourceTool: a single required string field.
# Documented with `#` comments rather than a docstring so the pydantic model
# (and any schema derived from it) stays exactly as it was.
class PaperDownloadInput(BaseModel):
    # Required (`...` = no default); interpolated into the arXiv source URL
    # and into the local cache paths by the tool's `_run`.
    id: str = Field(..., description="ArXiv paper ID")

class DownloadPaperSourceTool(BaseTool):
    # Tool that downloads the source archive of an arXiv paper and unpacks it
    # under ./arxiv_sources/<id>, reusing earlier downloads/extractions.
    #
    # The overrides below carry explicit annotations: pydantic v2 models reject
    # a base-class field that is overridden by a non-annotated attribute.
    # NOTE(review): assumes BaseTool declares `name` / `description` as `str`
    # fields - confirm against the installed crewai version.
    name: str = "DownloadArxivSource"
    description: str = "Download and extract arXiv LaTeX source"
    args_schema: Type[BaseModel] = PaperDownloadInput

    def _run(self, id: str) -> str:
        """Download (if needed) and extract the source archive for *id*.

        Args:
            id: arXiv identifier; it is appended to ``https://arxiv.org/src/``
                and used as the cache file / directory name.

        Returns:
            Path of the directory the archive was extracted into.

        Raises:
            ValueError: if *id* is empty or would escape the cache directory.
            requests.HTTPError: if arXiv answers with an error status.
            tarfile.TarError: if the download is not a readable tar archive or
                contains members that would be written outside the target.
        """
        # The id may come from an LLM tool call; refuse anything that could
        # point outside the cache directory once interpolated into a path.
        if not id or os.path.isabs(id) or "\\" in id or ".." in id.split("/"):
            raise ValueError(f"Invalid arXiv id: {id!r}")

        base = "arxiv_sources"
        tar_path = f"{base}/{id}.tar.gz"
        extract_path = f"{base}/{id}"

        # Ids containing "/" need their intermediate directory as well.
        os.makedirs(os.path.dirname(tar_path), exist_ok=True)

        if not os.path.exists(tar_path):
            # Stream into a ".part" file and rename only on success, so an
            # interrupted download is never mistaken for a cached archive.
            partial_path = f"{tar_path}.part"
            with requests.get(
                f"https://arxiv.org/src/{id}", stream=True, timeout=60
            ) as r:
                r.raise_for_status()
                with open(partial_path, "wb") as f:
                    for c in r.iter_content(8192):
                        f.write(c)
            os.replace(partial_path, tar_path)

        if not os.path.exists(extract_path):
            with tarfile.open(tar_path) as tar:
                if hasattr(tarfile, "data_filter"):
                    # Interpreters with extraction filters: let the stdlib
                    # reject absolute paths, "..", unsafe links, devices.
                    tar.extractall(extract_path, filter="data")
                else:
                    # Older interpreters: at least refuse member names that
                    # resolve outside the extraction directory.
                    root = os.path.realpath(extract_path)
                    for member in tar.getmembers():
                        target = os.path.realpath(os.path.join(root, member.name))
                        if os.path.commonpath([root, target]) != root:
                            raise tarfile.TarError(
                                f"Unsafe path in archive: {member.name!r}"
                            )
                    tar.extractall(extract_path)

        return extract_path

# -------------------------------
# DETERMINISTIC TeX PARSER
# -------------------------------
def _extract_braced_args(text: str, command: str) -> list:
    """Return the brace-balanced argument of each ``\\command{...}`` in *text*.

    Also accepts the starred form (``\\command*{...}``) and an optional
    ``[short title]`` before the braces. Whitespace inside an argument
    (including line breaks) is collapsed to single spaces. Occurrences with
    an empty or unterminated argument are skipped.
    """
    opener = re.compile(r"\\" + re.escape(command) + r"\*?\s*(?:\[[^\]]*\]\s*)?\{")
    found = []
    for match in opener.finditer(text):
        depth = 1
        pos = match.end()
        while pos < len(text) and depth:
            ch = text[pos]
            if ch == "\\":
                # Skip the escaped character so "\{" / "\}" are not counted.
                pos += 2
                continue
            if ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
            pos += 1
        if depth == 0:
            # pos is one past the closing brace; drop that brace.
            body = " ".join(text[match.end():pos - 1].split())
            if body:
                found.append(body)
    return found


def parse_latex_structure(source_dir: str) -> dict:
    """Collect section titles and caption texts from the .tex files of a paper.

    Files are visited in sorted path order and section titles are
    de-duplicated in first-seen order, so the same directory always produces
    the same result.

    Args:
        source_dir: Directory searched recursively for ``*.tex`` files.

    Returns:
        ``{"sections": [...], "figure_captions": [...]}`` holding at most 15
        unique section titles and at most 10 captions.
    """
    sections, captions = [], []

    # rglob order depends on the filesystem; sort for reproducible output.
    for tex in sorted(Path(source_dir).rglob("*.tex")):
        text = tex.read_text(encoding="utf-8", errors="ignore")
        sections += _extract_braced_args(text, "section")
        captions += _extract_braced_args(text, "caption")

    return {
        # dict.fromkeys removes duplicates while keeping document order.
        "sections": list(dict.fromkeys(sections))[:15],
        "figure_captions": captions[:10],
    }

# -------------------------------
# AGENTS
# -------------------------------

# Three agents, one per pipeline stage; all of them talk to the same `llm`.

# Stage 1: turns the raw paper inputs into structured facts.
meaning_agent = Agent(
    llm=llm,
    role="Paper Meaning Extractor",
    goal="Extract factual, structured meaning from paper metadata",
    backstory="You are precise, skeptical, and avoid speculation.",
)

# Stage 2: decides how the explanation should be organised.
planner_agent = Agent(
    llm=llm,
    role="Explanation Planner",
    goal="Plan a human-friendly explanation before writing",
    backstory="You think like a teacher, not a writer.",
)

# Stage 3: writes the final explainer text.
writer_agent = Agent(
    llm=llm,
    role="Technical Explainer",
    goal="Write clear, intuition-first explanations for researchers",
    backstory="You explain ideas without jargon or equations.",
)

# -------------------------------
# TASKS (TIGHT PROMPTS)
# -------------------------------

# Stage 1 task. The {abstract}, {hf_summary}, {sections} and {figure_captions}
# placeholders are filled from the `inputs` dict given to `crew.kickoff`;
# without them the prompt only names the inputs and the agent never sees the
# actual paper data. The JSON braces below are not placeholders (they do not
# wrap a bare identifier) and are meant to reach the model verbatim.
meaning_task = Task(
    description="""
You are given:
- Abstract: {abstract}
- HF-style summary: {hf_summary}
- Section titles: {sections}
- Figure captions: {figure_captions}

Your job:
- Extract meaning, NOT prose
- Be factual and conservative
- Never invent missing info

Return STRICT JSON ONLY:
{
  "problem": "...",
  "core_idea": "...",
  "method_overview": "...",
  "key_components": ["..."],
  "assumptions": ["..."],
  "limitations": ["..."]
}
If something is unclear, write "Not explicitly stated".
""",
    agent=meaning_agent,
    expected_output="Structured meaning JSON"
)

# Stage 2 task: asks the planner agent for an explanation outline as JSON.
# The prompt has no input placeholders of its own.
planner_task = Task(
    agent=planner_agent,
    expected_output="Explanation plan JSON",
    description="""
Given the structured meaning JSON,
create an explanation plan.

Rules:
- Order sections logically
- Prioritize intuition over detail
- Include diagrams only if useful

Return STRICT JSON:
{
  "sections": [
    "Motivation",
    "Core Idea",
    "How It Works",
    "Why It Matters",
    "Limitations"
  ],
  "diagram_sections": ["How It Works"],
  "tone": "intuition-first"
}
""",
)

# Stage 3 task: produces the final Markdown explainer.
# The rules ask the writer to refer to figures by caption, so the captions are
# injected via the {figure_captions} placeholder (filled from the `inputs`
# dict given to `crew.kickoff`); otherwise the writer has no captions to cite.
writer_task = Task(
    description="""
Write a readable explainer blog USING THE PLAN.

Rules:
- One section at a time
- No equations
- No fake citations
- Refer to figures by caption when helpful
- Use analogies where possible

Figure captions available for reference:
{figure_captions}

Output clean Markdown.
""",
    agent=writer_agent,
    expected_output="Markdown explainer blog"
)

# -------------------------------
# CREW
# -------------------------------
# Wires the three stages together; tasks are listed in pipeline order
# (meaning -> plan -> write), each paired with its agent above.
crew = Crew(
    tasks=[meaning_task, planner_task, writer_task],
    agents=[meaning_agent, planner_agent, writer_agent],
    verbose=True,
)

# -------------------------------
# PIPELINE ENTRY
# -------------------------------
def generate_paper_blog(arxiv_id, abstract=None, hf_summary=None):
    """Run the full pipeline for one arXiv paper.

    Downloads and extracts the paper source, pulls section titles and
    captions out of its .tex files, and starts the crew with those values.

    Args:
        arxiv_id: arXiv identifier passed to the download tool.
        abstract: Paper abstract. Optional; when missing or empty the crew is
            given the text "Not provided" instead.
        hf_summary: Short summary of the paper. Optional, same fallback.

    Returns:
        Whatever ``crew.kickoff`` returns.
    """
    source_dir = DownloadPaperSourceTool()._run(arxiv_id)
    structure = parse_latex_structure(source_dir)

    return crew.kickoff(
        inputs={
            # Explicit fallback text so a caller that only knows the id can
            # still run the pipeline and the prompt never shows "None".
            "abstract": abstract or "Not provided",
            "hf_summary": hf_summary or "Not provided",
            "sections": structure["sections"],
            "figure_captions": structure["figure_captions"],
        }
    )

# -------------------------------
# USAGE
# -------------------------------
# if __name__ == "__main__":
#     print(
#         generate_paper_blog(
#             arxiv_id="2401.12345",
#             abstract="(abstract here)",
#             hf_summary="(hf summary here)"
#         )
#     )


In [None]:
# Run the pipeline for arXiv paper 1711.00937 and keep the output for
# inspection in later cells.
# NOTE(review): only the paper id is passed here - check against
# generate_paper_blog's signature that abstract / hf_summary may be omitted.
result = generate_paper_blog("1711.00937")

[36mâ•­â”€[0m[36mâ”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€[0m[36m Crew Execution Started [0m[36mâ”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€[0m[36mâ”€â•®[0m
[36mâ”‚[0m                                                                              [36mâ”‚[0m
[36mâ”‚[0m  [1;36mCrew Execution Started[0m                                                      [36mâ”‚[0m
[36mâ”‚[0m  [37mName: [0m[36mcrew[0m                                                                  [36mâ”‚[0m
[36mâ”‚[0m  [37mID: [0m[36m557df71b-b088-4d5d-add6-18b635ae596a[0m                                    [36mâ”‚[0m
[36mâ”‚[0m  [37mTool Args: [0m                                                                 [36mâ”‚[0m
[36mâ”‚[0m                                                                              [36mâ”‚[0m
[36mâ”‚[0m                                                                            

  PydanticSerializationUnexpectedValue(Expected 10 fields but got 6: Expected `Message` - serialized value may not be as expected [input_value=Message(content='Thought:... reasoning_content=None), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [input_value=Choices(finish_reason='st...reasoning_content=None)), input_type=Choices])
  return self.__pydantic_serializer__.to_python(


[?25l[1;36mðŸš€ Crew: [0m[1;36mcrew[0m
â”œâ”€â”€ [1;32mðŸ“‹ Task: 9f5dd08f-55de-4b32-9f45-963fcc4e7cd7[0m
â”‚   [37mAssigned to: [0m[32mArXiv Paper Downloader[0m
â”‚   [37mStatus: [0m[1;32mâœ… Completed[0m
â”‚   â”œâ”€â”€ [1;32mðŸ”§ [0m[32mUsed Download paper source from arxiv. (20)[0m
â”‚   â””â”€â”€ [1;33mðŸ”§ [0m[33mUsing List files in directory (6)[0m
â””â”€â”€ [1;33mðŸ“‹ Task: 54e01f69-2455-4284-b8d9-767921077603[0m
[2K[1A[2K[1A[2K[1A[2K[1A[2K[1A[2K[1A[2K[1A[2K[1;36mðŸš€ Crew: [0m[1;36mcrew[0m
â”œâ”€â”€ [1;32mðŸ“‹ Task: 9f5dd08f-55de-4b32-9f45-963fcc4e7cd7[0m
â”‚   [37mAssigned to: [0m[32mArXiv Paper Downloader[0m
â”‚   [37mStatus: [0m[1;32mâœ… Completed[0m
â”‚   â”œâ”€â”€ [1;32mðŸ”§ [0m[32mUsed Download paper source from arxiv. (20)[0m
â”‚   â””â”€â”€ [1;32mðŸ”§ [0m[32mUsed List files in directory (6)[0m
â””â”€â”€ [1;33mðŸ“‹ Task: 54e01f69-2455-4284-b8d9-767921077603[0m
    [37mStatus: [0m[2;33mExecuting Task...[

  PydanticSerializationUnexpectedValue(Expected 10 fields but got 6: Expected `Message` - serialized value may not be as expected [input_value=Message(content='Thought:... reasoning_content=None), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [input_value=Choices(finish_reason='st...reasoning_content=None)), input_type=Choices])
  return self.__pydantic_serializer__.to_python(


[32mâ•­â”€[0m[32mâ”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€[0m[32m âœ… Agent Final Answer [0m[32mâ”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€[0m[32mâ”€â•®[0m
[32mâ”‚[0m                                                                              [32mâ”‚[0m
[32mâ”‚[0m  [37mAgent: [0m[1;92mTechnical Blog Writer[0m                                                [32mâ”‚[0m
[32mâ”‚[0m                                                                              [32mâ”‚[0m
[32mâ”‚[0m  [37mFinal Answer:[0m                                                               [32mâ”‚[0m
[32mâ”‚[0m  [92m# Exploring the Gradient-Based Approach in Machine Learning: A [0m             [32mâ”‚[0m
[32mâ”‚[0m  [92mComprehensive Guide[0m                                                         [32mâ”‚[0m
[32mâ”‚[0m                                                                              [32

[34mâ•­â”€[0m[34mâ”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€[0m[34m Tracing Status [0m[34mâ”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€[0m[34mâ”€â•®[0m
[34mâ”‚[0m                                                                              [34mâ”‚[0m
[34mâ”‚[0m  Info: Tracing is disabled.                                                  [34mâ”‚[0m
[34mâ”‚[0m                                                                              [34mâ”‚[0m
[34mâ”‚[0m  To enable tracing, do any one of these:                                     [34mâ”‚[0m
[34mâ”‚[0m  â€¢ Set tracing=True in your Crew/Flow code                                   [34mâ”‚[0m
[34mâ”‚[0m  â€¢ Set CREWAI_TRACING_ENABLED=true in your project's .env file               [34mâ”‚[0m
[34mâ”‚[0m  â€¢ Run: crewai traces enable                                                 [34mâ”‚[0m
[34mâ”‚[0m       

  PydanticSerializationUnexpectedValue(Expected 10 fields but got 6: Expected `Message` - serialized value may not be as expected [input_value=Message(content="Thought:... reasoning_content=None), input_type=Message])
  PydanticSerializationUnexpectedValue(Expected `StreamingChoices` - serialized value may not be as expected [input_value=Choices(finish_reason='st...reasoning_content=None)), input_type=Choices])
  return self.__pydantic_serializer__.to_python(


In [None]:
import crewai_tools

In [None]:
!pip install crewai_tools


In [24]:
result

CrewOutput(raw='Your final answer must be the great and the most complete as possible, it must be outcome described.\n\n### Complete Blog Post with Embedded Figures and Explanations\n\n#### Title: Neural Discrete Representation Learning\n\n#### Introduction\n\nNeural Discrete Representation Learning (NDRL) is a framework that aims to learn discrete representations in neural networks. This approach is particularly useful in scenarios where the data is inherently discrete, such as in natural language processing, computer vision, and recommendation systems. The core idea behind NDRL is to learn a mapping from continuous inputs to discrete representations, which can then be used for tasks like classification, clustering, or generation.\n\n#### Related Work\n\nPrevious work in this area includes methods like Softmax, which is a common technique for converting continuous distributions into discrete representations. However, Softmax can suffer from issues such as vanishing gradients and the n

In [44]:
result.raw

"# Exploring the Gradient-Based Approach in Machine Learning: A Comprehensive Guide\n\n## Introduction\n\nIn the vast landscape of machine learning, the development of algorithms that can learn from data has been a significant focus. One such approach that has gained considerable attention is the gradient-based method. This method, which is the subject of the paper referenced by the arxiv/1711.00937, offers a powerful framework for optimizing complex models. This blog post aims to delve into the intricacies of gradient-based methods, their applications, and the insights provided by the research paper.\n\n## What is a Gradient-Based Method?\n\nGradient-based methods are optimization algorithms that use the gradient of a function to find its minimum or maximum. In the context of machine learning, these methods are used to minimize the loss function, which quantifies the difference between the model's predictions and the actual data. The gradient of the loss function with respect to the m