In [3]:
pip install beautifulsoup4

Collecting beautifulsoup4
  Using cached beautifulsoup4-4.13.4-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4)
  Using cached soupsieve-2.7-py3-none-any.whl.metadata (4.6 kB)
Using cached beautifulsoup4-4.13.4-py3-none-any.whl (187 kB)
Using cached soupsieve-2.7-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [beautifulsoup4]
[1A[2KSuccessfully installed beautifulsoup4-4.13.4 soupsieve-2.7
Note: you may need to restart the kernel to use updated packages.


In [4]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import json
import subprocess

In [5]:
BASE_URL = "https://python.langchain.com"
GUIDES_URL = f"{BASE_URL}/docs/how_to/"

In [6]:
# Step 1: Scrape all guide URLs
def get_guide_links():
    response = requests.get(GUIDES_URL)
    soup = BeautifulSoup(response.content, "html.parser")
    links = []

    for a in soup.find_all("a", href=True):
        if a.text.strip().lower().startswith("how to"):
            full_url = urljoin(BASE_URL, a['href'])
            links.append((a.text.strip(), full_url))

    return links

In [15]:
# Step 2: Scrape one guide and extract title, content sections, code
def scrape_guide(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    title = soup.find("h1").text.strip() if soup.find("h1") else "Untitled"
    sections = []
    current_heading = None
    current_content = []
    current_code_blocks = []

    for tag in soup.find_all(["h2", "p", "pre"]):
        if tag.name == "h2":
            if current_heading:
                sections.append({
                    "heading": current_heading,
                    "text": " ".join(current_content).strip(),
                    "code_blocks": current_code_blocks
                })
            current_heading = tag.text.strip()
            current_content = []
            current_code_blocks = []
        elif tag.name == "p":
            current_content.append(tag.text.strip())
        elif tag.name == "pre":
            current_code_blocks.append(tag.get_text("\n", strip=False))

    # Add the final section
    if current_heading:
        # Use DeepSeek LLM via OLLAMA to convert section into Q&A format

        section_text = " ".join(current_content).strip()
        code_blocks_text = "\n\n".join(current_code_blocks)
        prompt = f"""You are an expert in LangChain. Given the following section from the documentation, generate a question and a detailed answer that would help someone learn LangChain. Include code examples if present.

    Section Heading: {current_heading}
    Section Text: {section_text}
    Code Blocks:
    {code_blocks_text}

    Format your response as:
    Question: <question>
    Answer: <answer>
    """

        # Call OLLAMA DeepSeek LLM locally
        ollama_cmd = [
            "ollama", "run", "gemma3:4b",
            "--prompt", prompt
        ]
        try:
            result = subprocess.run(
            ollama_cmd,
            capture_output=True,
            text=True,
            timeout=60
            )
            output = result.stdout.strip()
            # Parse output into question and answer
            if "Question:" in output and "Answer:" in output:
                q = output.split("Question:", 1)[1].split("Answer:", 1)[0].strip()
                a = output.split("Answer:", 1)[1].strip()
            else:
                q = f"What is covered in the section '{current_heading}'?"
                a = section_text + ("\n\n" + code_blocks_text if code_blocks_text else "")
        except Exception as e:
            q = f"What is covered in the section '{current_heading}'?"
            a = section_text + ("\n\n" + code_blocks_text if code_blocks_text else "")

        sections.append({
            "heading": current_heading,
            "question": q,
            "answer": a
        })

    return {
        "title": title,
        "url": url,
        "sections": sections
    }

In [8]:
all_links = get_guide_links()

In [10]:
all_links[0:5]

[('How to use tools in a chain',
  'https://python.langchain.com/docs/how_to/tools_chain/'),
 ('How to use a vectorstore as a retriever',
  'https://python.langchain.com/docs/how_to/vectorstore_retriever/'),
 ('How to add memory to chatbots',
  'https://python.langchain.com/docs/how_to/chatbots_memory/'),
 ('How to use example selectors',
  'https://python.langchain.com/docs/how_to/example_selectors/'),
 ('How to add a semantic layer over graph database',
  'https://python.langchain.com/docs/how_to/graph_semantic/')]

In [11]:
temp_link = all_links[0][1]  # Just take the first link for testing
temp_link

'https://python.langchain.com/docs/how_to/tools_chain/'

In [16]:
data = scrape_guide(temp_link)

In [17]:
data

{'title': 'How to use tools in a chain',
 'url': 'https://python.langchain.com/docs/how_to/tools_chain/',
 'sections': [{'heading': 'Setup\u200b',
   'text': "We'll need to install the following packages for this guide: If you'd like to trace your runs in LangSmith uncomment and set the following environment variables:",
   'code_blocks': ['%\npip install \n-\n-\nupgrade \n-\n-\nquiet langchain',
    'import\n getpass\nimport\n os\n# os.environ["LANGSMITH_TRACING"] = "true"\n# os.environ["LANGSMITH_API_KEY"] = getpass.getpass()']},
  {'heading': 'Create a tool\u200b',
   'text': 'First, we need to create a tool to call. For this example, we will create a custom tool from a function. For more information on creating custom tools, please see this guide.',
   'code_blocks': ['from\n langchain_core\n.\ntools \nimport\n tool\n@tool\ndef\n \nmultiply\n(\nfirst_int\n:\n \nint\n,\n second_int\n:\n \nint\n)\n \n-\n>\n \nint\n:\n    \n"""Multiply two integers together."""\n    \nreturn\n first_i

In [19]:
print(data["sections"][-2]["code_blocks"])

['pip install -qU "langchain[openai]"', 'import\n getpass\nimport\n os\nif\n \nnot\n os\n.\nenviron\n.\nget\n(\n"OPENAI_API_KEY"\n)\n:\n  os\n.\nenviron\n[\n"OPENAI_API_KEY"\n]\n \n=\n getpass\n.\ngetpass\n(\n"Enter API key for OpenAI: "\n)\nfrom\n langchain\n.\nchat_models \nimport\n init_chat_model\nllm \n=\n init_chat_model\n(\n"gpt-4o-mini"\n,\n model_provider\n=\n"openai"\n)', 'llm_with_tools \n=\n llm\n.\nbind_tools\n(\n[\nmultiply\n]\n)', 'msg \n=\n llm_with_tools\n.\ninvoke\n(\n"whats 5 times forty two"\n)\nmsg\n.\ntool_calls', "[{'name': 'multiply',\n  'args': {'first_int': 5, 'second_int': 42},\n  'id': 'call_8QIg4QVFVAEeC1orWAgB2036',\n  'type': 'tool_call'}]", 'from\n operator \nimport\n itemgetter\nchain \n=\n llm_with_tools \n|\n \n(\nlambda\n x\n:\n x\n.\ntool_calls\n[\n0\n]\n[\n"args"\n]\n)\n \n|\n multiply\nchain\n.\ninvoke\n(\n"What\'s four times 23"\n)', '92']


In [7]:
# Step 3: Run full scrape and save JSONL for fine-tuning
def scrape_all_guides_and_save(filename="langchain_howto.jsonl"):
    all_links = get_guide_links()
    with open(filename, "w", encoding="utf-8") as f:
        for name, url in all_links:
            print(f"Scraping: {name}")
            data = scrape_guide(url)
            f.write(json.dumps(data, ensure_ascii=False) + "\n")

scrape_all_guides_and_save()


Scraping: How to use tools in a chain
Scraping: How to use a vectorstore as a retriever
Scraping: How to add memory to chatbots
Scraping: How to use example selectors
Scraping: How to add a semantic layer over graph database
Scraping: How to invoke runnables in parallel
Scraping: How to stream chat model responses
Scraping: How to add default invocation args to a Runnable


KeyboardInterrupt: 