In [2]:
import requests

In [25]:
def list_articles(base_helpcenter_url: str, locale: str | None = None, limit: int = 100) -> list[dict]:
    session = requests.Session()
    session.headers.update({"Accept": "application/json"})

    path = f"/api/v2/help_center/{locale}/articles.json" if locale else "/api/v2/help_center/articles.json"
    url = base_helpcenter_url.rstrip("/") + path

    out = []
    while url and len(out) < limit:
        r = session.get(url, timeout=30)
        r.raise_for_status()
        data = r.json()

        out.extend(data.get("articles", []))
        url = data.get("next_page")  # Zendesk commonly returns absolute next_page

    return out

In [26]:
# Pull the en-us article listing and print one summary line per article.
articles = list_articles("https://support.optisigns.com", locale="en-us")
for entry in articles:
    print(entry["id"], entry.get("title"), entry.get("html_url"))

48241081473043 Operational Schedule Troubleshooting https://support.optisigns.com/hc/en-us/articles/48241081473043-Operational-Schedule-Troubleshooting
47616485609491 How to Use the OptiDev App https://support.optisigns.com/hc/en-us/articles/47616485609491-How-to-Use-the-OptiDev-App
46299245284243 OptiKiosk 50 Setup and Troubleshooting Guide https://support.optisigns.com/hc/en-us/articles/46299245284243-OptiKiosk-50-Setup-and-Troubleshooting-Guide
45619214182803 How to Set Up an Outlook Calendar App with Shared Permissions https://support.optisigns.com/hc/en-us/articles/45619214182803-How-to-Set-Up-an-Outlook-Calendar-App-with-Shared-Permissions
44890229616403 Using an Enterprise Network (802.1x) with OptiSigns https://support.optisigns.com/hc/en-us/articles/44890229616403-Using-an-Enterprise-Network-802-1x-with-OptiSigns
43751835488403 Chrome App - End of Support on ChromeOS and Workarounds https://support.optisigns.com/hc/en-us/articles/43751835488403-Chrome-App-End-of-Support-on-Chr

In [27]:
# Keep the first listed article as the sample used by the cells below.
test = articles[0]
print(test)

{'id': 48241081473043, 'url': 'https://optisignshelp.zendesk.com/api/v2/help_center/en-us/articles/48241081473043.json', 'html_url': 'https://support.optisigns.com/hc/en-us/articles/48241081473043-Operational-Schedule-Troubleshooting', 'author_id': 30963645258899, 'comments_disabled': False, 'draft': False, 'promoted': False, 'position': 0, 'vote_sum': 0, 'vote_count': 0, 'section_id': 26319502894611, 'created_at': '2026-01-15T16:38:40Z', 'updated_at': '2026-01-15T16:39:05Z', 'name': 'Operational Schedule Troubleshooting', 'title': 'Operational Schedule Troubleshooting', 'source_locale': 'en-us', 'locale': 'en-us', 'outdated': False, 'outdated_locales': [], 'edited_at': '2026-01-15T16:39:05Z', 'user_segment_id': None, 'permission_group_id': 787493, 'content_tag_ids': ['01JRB2GB1DT3B11J5GYVRCB069'], 'label_names': ['troubleshooting', 'Troubleshooting Page', 'Hardware Troubleshooting'], 'body': '<h3 id="h_01KF15FS3KCYCA0A8J519XB3ZA"><span style="color: #434343;">In this article, we will 

In [6]:
# Save the sample article's raw HTML body to a local file for inspection.
test_body = test["body"]
with open("test_article.html", mode="w", encoding="utf-8") as html_file:
    html_file.write(test_body)

In [14]:
import json
import re
from pathlib import Path
from bs4 import BeautifulSoup
from markdownify import markdownify as md
import unicodedata

In [8]:
def clean_soup(soup: BeautifulSoup) -> BeautifulSoup:
    """Strip presentation-only markup from a parsed article, in place.

    Steps, in order:
      1. remove ``<script>`` and ``<style>`` elements together with their content;
      2. unwrap every ``<span>`` (children are kept, the wrapper is dropped);
      3. replace ``<a name=...>`` tags that have no text and no ``href`` with a
         text marker of the form ``<!-- anchor:NAME -->``;
      4. remove ``<p>`` elements that contain neither text nor an ``<img>``.

    Returns the same ``soup`` object that was passed in.
    """
    for unwanted in soup.find_all(["script", "style"]):
        unwanted.decompose()

    for span in soup.find_all("span"):
        span.unwrap()

    for anchor in soup.find_all("a"):
        is_bare_named_anchor = (
            anchor.has_attr("name")
            and not anchor.get_text(strip=True)
            and not anchor.has_attr("href")
        )
        if is_bare_named_anchor:
            # Inserted as a plain string node, not as a parsed comment.
            marker = soup.new_string(f"\n<!-- anchor:{anchor['name']} -->\n")
            anchor.replace_with(marker)

    for paragraph in soup.find_all("p"):
        # Paragraphs with visible text or an image are content; leave them alone.
        if paragraph.get_text(strip=True) or paragraph.find("img"):
            continue
        paragraph.decompose()

    return soup

In [9]:
def html_to_markdown(html: str) -> str:
    """Convert an article's HTML body to Markdown text.

    The HTML is parsed with the ``lxml`` parser, passed through ``clean_soup``,
    and handed to markdownify with ATX headings, ``-`` bullets and
    ``strip=["figure"]``. Runs of three or more newlines are collapsed to two
    and surrounding whitespace is removed from the result.
    """
    cleaned = clean_soup(BeautifulSoup(html, "lxml"))
    converted = md(str(cleaned), heading_style="ATX", bullets="-", strip=["figure"])
    collapsed = re.sub(r"\n{3,}", "\n\n", converted)
    return collapsed.strip()


In [10]:
# Display the sample article's raw HTML body to inspect its markup.
test['body']

'<h3 id="h_01KF15FS3KCYCA0A8J519XB3ZA"><span style="color: #434343;">In this article, we will troubleshoot common issues related to the Operational Schedule feature in OptiSigns.</span></h3><ul>\n<li data-list-item-id="e8c15c353de4465061a5afc3b0d1fb293"><a href="#ChangingSettings">Changing Display Settings</a></li>\n<li data-list-item-id="e3ff52eb58cc3aa780d9bd38bbbc73411"><a href="#TestConnections">How to Test HDMI-CEC / RS-232 Connections</a></li>\n<li data-list-item-id="e9289e3aaf95e50162ab19d523b9ed63b"><a href="#ImproperlyPowered">OptiStick Improperly Powered</a></li>\n<li data-list-item-id="eaf20e626444182a7e9647b624a360c52"><a href="#DifferentTimes">Operational Schedule Works at Different Times from How It Was Set</a></li>\n<li data-list-item-id="e7d35002744500cc185dd8f0ee1106d57"><a href="#TurnOff">Screen Turns Off, But Not Back On</a></li>\n</ul><p>Operational Scheduling is a feature which allows you to turn off and on your digital signs on a set schedule, further automating y

In [11]:
def fetch_full_article_if_needed(article: dict) -> dict:
    """Return an article dict that carries a body, fetching it when absent.

    If ``article`` already has a truthy ``body`` it is returned unchanged.
    Otherwise the article's ``url`` field is requested with a 30-second
    timeout; when the JSON response is a dict with an ``"article"`` key, that
    value is returned, else the decoded JSON is returned as-is.

    Raises:
        ValueError: If the article has neither a ``body`` nor a ``url``.
        requests.HTTPError: If the fetch returns an error status.
    """
    has_body = bool(article.get("body"))
    if has_body:
        return article

    api_url = article.get("url")
    if not api_url:
        raise ValueError("article dict thiếu field `body` và cũng không có `url` để fetch thêm.")

    response = requests.get(api_url, timeout=30)
    response.raise_for_status()
    payload = response.json()

    # Unwrap the {"article": {...}} envelope when present.
    is_wrapped = isinstance(payload, dict) and "article" in payload
    return payload["article"] if is_wrapped else payload


In [15]:

def safe_slug(s: str) -> str:
    """Turn a title into a lowercase, ASCII, hyphen-separated slug.

    Accented characters are decomposed (NFKD) and reduced to their ASCII base
    letters; characters that are not word characters, whitespace or hyphens
    are removed; runs of whitespace/hyphens become a single hyphen. Leading
    and trailing hyphens are stripped so they never end up at the edges of a
    filename.

    Returns:
        The slug, or ``"article"`` when nothing usable remains.
    """
    ascii_text = unicodedata.normalize("NFKD", s).encode("ascii", "ignore").decode("ascii")
    kept = re.sub(r"[^\w\s-]", "", ascii_text).strip().lower()
    # Strip edge hyphens so hyphen-only titles hit the "article" fallback below.
    slug = re.sub(r"[-\s]+", "-", kept).strip("-")
    return slug or "article"

In [18]:
def convert_one_article_to_md(article: dict, out_dir: str = "data/md") -> Path:
    """Write one article as a Markdown file with YAML front matter.

    The article is first completed via ``fetch_full_article_if_needed``; its
    HTML body is converted with ``html_to_markdown``. The file starts with a
    front-matter block (id, title, url, updated_at, labels), followed by an
    H1 title, an ``Article URL:`` line and the converted body.

    Args:
        article: Article dict as returned by the Help Center API.
        out_dir: Directory to write into; created (with parents) if missing.

    Returns:
        Path of the written file, named ``<id>-<slug>.md`` or ``<slug>.md``
        when the article has no id.
    """
    full = fetch_full_article_if_needed(article)

    article_id = full.get("id")
    title = full.get("title") or f"article-{article_id}"
    url = full.get("html_url") or full.get("url") or ""
    updated_at = full.get("updated_at") or ""
    labels = full.get("label_names") or []
    body_md = html_to_markdown(full.get("body") or "")

    # YAML front matter (handy for debugging/filtering)
    front_matter = [
        "---",
        f"id: {article_id}",
        f"title: {json.dumps(title, ensure_ascii=False)}",
        f"url: {url}",
        f"updated_at: {updated_at}",
        f"labels: {json.dumps(labels, ensure_ascii=False)}",
        "---",
    ]
    document = front_matter + [
        f"# {title}",
        "",
        f"Article URL: {url}",
        "",
        body_md.strip(),
        "",  # yields the trailing newline once joined
    ]
    content = "\n".join(document)

    target_dir = Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    slug = safe_slug(title)
    fname = f"{article_id}-{slug}.md" if article_id else f"{slug}.md"

    md_path = target_dir / fname
    md_path.write_text(content, encoding="utf-8")
    return md_path

In [None]:
# Convert the sample article and write the result under data/md_test.
md_file = convert_one_article_to_md(test, out_dir="data/md_test")
print(f"Saved markdown to: {md_file}")

Saved markdown to: data\md_test\48241081473043-operational-schedule-troubleshooting.md


In [30]:
# Find articles whose HTML contains a <pre> block and convert the last match,
# to check how code blocks come through the Markdown conversion.
test2 = None
for article in articles:
    # A missing or null body must not break the substring check.
    body = article.get("body") or ""
    if '<pre>' in body:
        test2 = article
        print(f"Article ID {article['id']} contains code block.")

if test2 is None:
    # Report the empty result instead of failing inside the converter on None.
    print("No article with a <pre> code block was found.")
else:
    md_file = convert_one_article_to_md(test2, out_dir="data/md_test")
    print(f"Saved markdown to: {md_file}")

Article ID 39080869746067 contains code block.
Article ID 36911639377683 contains code block.
Article ID 35184720136595 contains code block.
Article ID 31860170199955 contains code block.
Article ID 31113088917907 contains code block.
Article ID 23274673797139 contains code block.
Article ID 15861880059283 contains code block.
Article ID 8099928867475 contains code block.
Article ID 4411956075027 contains code block.
Article ID 4407493404307 contains code block.
Article ID 4404590815635 contains code block.
Saved markdown to: data\md_test\4404590815635-how-to-set-up-saml-20-with-optisigns-and-okta.md
