# Web scraping of the PyTorch documentation

## Importing libraries

In [1]:
from bs4 import BeautifulSoup as bs
import requests
from urllib.parse import urljoin
import json
# import pandas as pd

In [2]:
# Root of the stable PyTorch docs and the index page (relative to it) to scrape.
base_url = "https://pytorch.org/docs/stable/"
url = "nn.html"

# Fetch the index page. The timeout keeps the notebook from hanging forever on
# a stalled connection, and raise_for_status() stops here on an HTTP error
# instead of letting an error page be parsed as if it were the docs page.
response = requests.get(urljoin(base_url, url), timeout=30)
response.raise_for_status()

html = response.text

In [3]:
# Parse the downloaded markup with the "html.parser" backend.
soup = bs(html, features="html.parser")

In [4]:
def _collect_text(element, page_url, link_refs, level=0):
    """Flatten the children of *element* into a single stripped string.

    Args:
        element: Parsed node whose ``.children`` are walked recursively.
        page_url: URL of the page being rendered; ``headerlink`` hrefs are
            resolved against it.
        link_refs: List that receives one ``"<placeholder> : <url>"`` entry per
            ``headerlink`` anchor that has an href. Mutated in place; its
            length also drives the placeholder numbering.
        level: Nesting depth. At depth 0 a rendered child tag is followed by a
            newline, deeper down by a single space.

    Returns:
        The concatenated text with leading/trailing whitespace removed.
    """
    parts = []
    for child in element.children:
        if child.name is None:
            # Text node: keep it only when something is left after stripping.
            if child.string:
                stripped = child.string.strip()
                if stripped:
                    parts.append(stripped)
            continue

        classes = child.get('class') or []
        if child.name == 'a' and 'headerlink' in classes:
            # Replace the anchor with a numbered placeholder and remember
            # where it points; an anchor without an href is dropped.
            href = child.get("href")
            if href:
                placeholder = f" [LINK_{len(link_refs) + 1}] "
                parts.append(placeholder)
                link_refs.append(f"{placeholder} : {urljoin(page_url, href)}")
        elif child.name == 'span':
            if 'w' in classes:
                parts.append(' ')  # span.w = space
            else:
                # Other spans are inline: no padding is added around them.
                parts.append(_collect_text(child, page_url, link_refs, level + 1))
        else:
            text = _collect_text(child, page_url, link_refs, level + 1)
            if text:
                parts.append(' ' + text + ('\n' if level == 0 else ' '))
    return ''.join(parts).strip()


def extract_doc_info(soup, base_url="https://pytorch.org/docs/stable/", timeout=10):
    """Scrape the page linked from one parsed element.

    The title is the text of the first ``<span class="pre">`` inside *soup*.
    The page to fetch is the href of the first ``<a class="reference internal">``
    inside *soup*, resolved against *base_url*. From that page, the
    ``<div class="pytorch-content-left">`` is flattened to text and all of its
    hrefs are collected.

    Args:
        soup: Parsed element to inspect (the caller passes one ``<tr>`` row).
        base_url: URL that the relative link is resolved against.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A dict with three keys:
            ``"title"``: the span text, or ``None`` when no such span exists.
            ``"page_text"``: flattened page text followed by a blank line and
                the ``[LINK_n]`` reference list; ``""`` on any failure.
            ``"page_links"``: absolute URLs of every ``<a href>`` in the content
                div; ``[]`` on any failure.

        Failures while fetching or parsing are logged and never raised.
    """
    import logging

    result = {"title": None, "page_text": "", "page_links": []}

    title_span = soup.find("span", class_="pre")
    if title_span is not None:
        result["title"] = title_span.get_text(strip=True)

    a_tag = soup.find("a", class_="reference internal")
    relative_link = a_tag.get("href") if a_tag is not None else None
    if not relative_link:
        # Without an href, urljoin() would hand back base_url itself and the
        # wrong page would be scraped, so stop here.
        return result

    try:
        full_url = urljoin(base_url, relative_link)
    except ValueError:
        return result

    try:
        response = requests.get(full_url, timeout=timeout)
        response.raise_for_status()
        page_soup = bs(response.text, "html.parser")

        content_div = page_soup.find("div", class_="pytorch-content-left")
        if content_div is None:
            raise ValueError("Content div not found")

        # Render each top-level child tag of the content div as one section.
        link_refs = []
        sections = []
        for child in content_div.children:
            if getattr(child, 'name', None):  # skip NavigableString or comments
                section_text = _collect_text(child, full_url, link_refs, level=0)
                if section_text:
                    sections.append(section_text)

        page_text = "\n".join(sections)
        if link_refs:
            page_text += "\n\n" + "\n".join(link_refs)

        # Extract all links inside the content div as absolute URLs.
        links = [
            urljoin(full_url, a.get("href"))
            for a in content_div.find_all("a", href=True)
        ]
    except Exception:
        # Best effort: one bad page must not abort the whole scrape, but the
        # failure is reported instead of being silently discarded.
        logging.getLogger(__name__).warning(
            "Could not scrape %s", full_url, exc_info=True
        )
        return result

    result["page_text"] = page_text
    result["page_links"] = links
    return result


In [6]:
# Locate the page's <h1> and print the element that immediately follows the
# tag in parse order (its first child when it has one).
title = soup.find("h1")
title_text = title.next
print(title_text)

# The first <p> sibling after the heading is printed as the page's intro line.
heading = title.find_next_sibling("p").get_text()
print(heading)

# All table rows (<tr>) found on the page.
documents = soup.find_all("tr")
    

torch.nn
These are the basic building blocks for graphs:


In [7]:
# Scrape every table row BEFORE touching the output file: opening it in "w"
# mode truncates it, so doing the slow network work first means an interrupted
# run can no longer leave a half-written, invalid doc_info.json behind.
documents = soup.find_all("tr")
doc_info = {
    "title": title_text,
    "heading": heading,
    "documents": [extract_doc_info(row) for row in documents],
}

# Serialize the whole payload in one call instead of hand-assembling braces,
# brackets and commas. The JSON data is the same as before; only the
# whitespace/indentation of the file differs.
with open("doc_info.json", "w", encoding="utf-8") as f:
    json.dump(doc_info, f, ensure_ascii=False, indent=2)