# MyUpchar blogs extraction


## 1. Article scrapper code:


In [1]:
import requests
from bs4 import BeautifulSoup
import json, os


def clean_text_with_links(tag):
    """Rebuild paragraph text while preserving <a> tag text with space."""
    parts = []
    for child in tag.descendants:
        if child.name == "a":
            text = child.get_text(strip=True)
            parts.append(text)
        elif isinstance(child, str):
            parts.append(child)
    return "".join(parts).strip()


def extract_intro_content(content_body_div):
    """Extracts the content before sections (main body / intro)."""
    intro = []
    for child in content_body_div.children:
        if getattr(child, "name", None) == "p":
            strong = child.find("strong")
            if strong and strong.text.strip() == child.text.strip():
                intro.append("### " + strong.get_text(strip=True))
            else:
                intro.append(clean_text_with_links(child))
        elif getattr(child, "name", None) == "ul":
            for li in child.find_all("li"):
                intro.append("• " + clean_text_with_links(li))
    return intro


def scrape_myupchar_article(url):
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")

    title = soup.find("h1").get_text(strip=True) if soup.find("h1") else "No Title"
    content = {"url": url, "title": title, "intro": [], "sections": []}

    # === Extract Main Body (before sections)
    intro_div = soup.find("div", id="content-body")
    if intro_div:
        content["intro"] = extract_intro_content(intro_div)

    # === Extract Structured Sections
    for section in soup.select(".category_blue_backdiv"):
        heading_tag = section.find("h2")
        body_tag = section.find("div", class_="description")

        section_title = (
            heading_tag.get_text(strip=True) if heading_tag else "No Heading"
        )
        section_text = []

        if body_tag:
            for child in body_tag.children:
                if child.name == "p":
                    strong = child.find("strong")
                    if strong and strong.text.strip() == child.text.strip():
                        section_text.append("### " + strong.get_text(strip=True))
                    else:
                        section_text.append(clean_text_with_links(child))
                elif child.name == "ul":
                    for li in child.find_all("li"):
                        section_text.append("• " + clean_text_with_links(li))
                elif child.name == "div":
                    for subchild in child.children:
                        if subchild.name == "p":
                            section_text.append(clean_text_with_links(subchild))
                        elif subchild.name == "ul":
                            for li in subchild.find_all("li"):
                                section_text.append("• " + clean_text_with_links(li))

        content["sections"].append({"heading": section_title, "body": section_text})

    return content

### 1.1. Save as Markdown


In [2]:
def save_as_markdown(article_data, filename):
    with open(filename, "w", encoding="utf-8") as md:
        md.write(f"# {article_data['title']}\n")
        md.write(f"_Source: [{article_data['url']}]({article_data['url']})_\n\n")

        if article_data["intro"]:
            md.write("## Introduction\n")
            for line in article_data["intro"]:
                if line.startswith("### "):
                    md.write(f"{line}\n")
                elif line.startswith("• "):
                    md.write(f"- {line[2:]}\n")
                else:
                    md.write(f"{line}\n")
            md.write("\n")

        for sec in article_data["sections"]:
            md.write(f"## {sec['heading']}\n")
            for line in sec["body"]:
                if line.startswith("### "):
                    md.write(f"{line}\n")
                elif line.startswith("• "):
                    md.write(f"- {line[2:]}\n")
                else:
                    md.write(f"{line}\n")
            md.write("\n")

### 1.2. Single URL Scrapper


In [19]:
url = "https://www.myupchar.com/en/disease/acidity"
article = scrape_myupchar_article(url)

# File-safe title
safe_title = article["title"].lower().replace(" ", "_").replace("/", "_")

# Create folders if not exist
os.makedirs("blogs", exist_ok=True)
os.makedirs("jsonblogs", exist_ok=True)
os.makedirs("markdownblogs", exist_ok=True)

# 1. Save as readable text
text_filename = f"blogs/{safe_title}.txt"
with open(text_filename, "w", encoding="utf-8") as f:
    f.write(f"Title: {article['title']}\n")
    f.write(f"URL: {article['url']}\n\n")
    f.write("== Introduction ==\n")
    for line in article["intro"]:
        f.write(f"{line}\n")

    for sec in article["sections"]:
        f.write(f"\n== {sec['heading']} ==\n")
        for line in sec["body"]:
            f.write(f"{line}\n")

# 2. Save as structured JSON
json_filename = f"jsonblogs/{safe_title}.json"
with open(json_filename, "w", encoding="utf-8") as f:
    json.dump(article, f, indent=4, ensure_ascii=False)

# 3. Save as markdown
md_filename = f"markdownblogs/{safe_title}.md"
save_as_markdown(article, md_filename)

print(f"✅ Saved as:\n- {text_filename}\n- {json_filename}\n- {md_filename}")

✅ Saved as:
- blogs/acidity.txt
- jsonblogs/acidity.json
- markdownblogs/acidity.md


### 1.3. Bulk URL Scrapper


#### 1.3.1. Extract Links from Category page


In [None]:
from bs4 import BeautifulSoup


def extract_blog_urls(html_path, base_url="https://www.myupchar.com"):
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(html_path, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")

    links = set()

    for h4 in soup.find_all("h4"):
        a_tag = h4.find("a", href=True)
        if a_tag:
            href = a_tag["href"]
            if href.startswith("/en/"):
                links.add(base_url + href.strip())

    return sorted(links)


urls = extract_blog_urls(
    "https://www.myupchar.com/en/disease/stomach-liver-mouth-and-digestion-related-diseases"
)

with open("myupchar_urls.txt", "w", encoding="utf-8") as f:
    for url in urls:
        f.write(url + "\n")

print(f"✅ Extracted {len(urls)} article URLs from <h4> tags.")

✅ Extracted 85 article URLs from <h4> tags.


#### 1.3.2. Bulk Scrap articles from saved links


In [4]:
import os
import time
import json
import random
from datetime import datetime

# Reuse the existing scrape_myupchar_article() and save_as_markdown() functions
# Make sure those are defined/imported in this script


def save_all_formats(article):
    safe_title = article["title"].lower().replace(" ", "_").replace("/", "_")

    # Ensure output folders exist
    os.makedirs("blogs", exist_ok=True)
    os.makedirs("jsonblogs", exist_ok=True)
    os.makedirs("markdownblogs", exist_ok=True)

    # Save .txt
    with open(f"blogs/{safe_title}.txt", "w", encoding="utf-8") as f:
        f.write(f"Title: {article['title']}\n")
        f.write(f"URL: {article['url']}\n\n")
        f.write("== Introduction ==\n")
        for line in article["intro"]:
            f.write(f"{line}\n")
        for sec in article["sections"]:
            f.write(f"\n== {sec['heading']} ==\n")
            for line in sec["body"]:
                f.write(f"{line}\n")

    # Save .json
    with open(f"jsonblogs/{safe_title}.json", "w", encoding="utf-8") as f:
        json.dump(article, f, indent=4, ensure_ascii=False)

    # Save .md
    save_as_markdown(article, f"markdownblogs/{safe_title}.md")


def bulk_scrape(url_list, delay_range=(2, 5), resume_failed=True):
    failed_urls = []
    success_index = []

    for idx, url in enumerate(url_list, start=1):
        print(f"[{idx}/{len(url_list)}] Scraping: {url}")
        try:
            article = scrape_myupchar_article(url)
            safe_title = article["title"].lower().replace(" ", "_").replace("/", "_")

            # Skip if already exists
            if os.path.exists(f"jsonblogs/{safe_title}.json"):
                print(f"⚠️ Skipped (already scraped): {article['title']}")
                continue

            save_all_formats(article)
            print(f"✅ Success: {article['title']}")

            # Add to master index
            success_index.append(
                {
                    "title": article["title"],
                    "slug": safe_title,
                    "url": article["url"],
                    "files": {
                        "txt": f"blogs/{safe_title}.txt",
                        "json": f"jsonblogs/{safe_title}.json",
                        "md": f"markdownblogs/{safe_title}.md",
                    },
                }
            )

        except Exception as e:
            print(f"❌ Failed: {url}\n   Reason: {e}")
            failed_urls.append(url)

        time.sleep(random.uniform(*delay_range))

    # Save failed URLs
    if failed_urls:
        with open("failed_urls.txt", "w", encoding="utf-8") as f:
            for url in failed_urls:
                f.write(url + "\n")
        print(f"⚠️ Failed URLs saved to 'failed_urls.txt'")

    # Save master index
    with open("master_index.json", "w", encoding="utf-8") as f:
        json.dump(success_index, f, indent=4, ensure_ascii=False)
    print(f"📚 Master index saved to 'master_index.json'")

    print(
        f"\n🟢 Bulk scraping finished with {len(success_index)} success, {len(failed_urls)} failed."
    )

In [5]:
# Load main list of URLs
with open("myupchar_urls.txt", "r", encoding="utf-8") as f:
    all_urls = [line.strip() for line in f if line.strip()]

# Load failed URLs (if resume is enabled)
if os.path.exists("failed_urls.txt"):
    with open("failed_urls.txt", "r", encoding="utf-8") as f:
        failed = [line.strip() for line in f if line.strip()]
    all_urls.extend(failed)

# Remove duplicates
all_urls = list(set(all_urls))

# Start scraper
bulk_scrape(all_urls)

[1/85] Scraping: https://www.myupchar.com/en/disease/fatty-liver
✅ Success: Fatty Liver
[2/85] Scraping: https://www.myupchar.com/en/disease/anal-fissure
✅ Success: Anal Fissure
[3/85] Scraping: https://www.myupchar.com/en/disease/colon-infection
✅ Success: Colon infection
[4/85] Scraping: https://www.myupchar.com/en/disease/dental-caries-cavities
✅ Success: Cavities (Dental Caries)
[5/85] Scraping: https://www.myupchar.com/en/disease/teeth-infections
✅ Success: Teeth Infections
[6/85] Scraping: https://www.myupchar.com/en/disease/appendicitis
✅ Success: Appendicitis
[7/85] Scraping: https://www.myupchar.com/en/disease/hernia
✅ Success: Hernia
[8/85] Scraping: https://www.myupchar.com/en/disease/mouth-ulcer
✅ Success: Mouth ulcer
[9/85] Scraping: https://www.myupchar.com/en/disease/hepatitis-c
✅ Success: Hepatitis C
[10/85] Scraping: https://www.myupchar.com/en/disease/liver-cirrhosis
✅ Success: Liver Cirrhosis
[11/85] Scraping: https://www.myupchar.com/en/disease/black-tongue
✅ Succes