In [2]:
# Dependency setup (run this first)
import sys, subprocess, importlib, datetime

def install(pkg):
    """Install `pkg` with pip into the interpreter that is running this code.

    Args:
        pkg: Requirement string passed verbatim to ``pip install``.

    Raises:
        subprocess.CalledProcessError: If the pip process exits with a
            non-zero status.
    """
    print(f"Installing {pkg} ...")
    # `sys.executable -m pip` targets this interpreter's environment rather
    # than whichever `pip` happens to be first on PATH.
    subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])
    # A package installed after the interpreter started can be missed by the
    # import system's cached finders; refresh them so that an import issued
    # right after this call can see the new package.
    importlib.invalidate_caches()

# Base libraries, listed by the names that are handed to pip.
REQUIRED = ["requests", "beautifulsoup4", "pandas"]
for pkg in REQUIRED:
    # "beautifulsoup4" is imported as "bs4"; every other entry is imported
    # under the same name it is installed with.
    module_name = {"beautifulsoup4": "bs4"}.get(pkg, pkg)
    try:
        importlib.import_module(module_name)
    except ImportError:
        # Not importable yet, so install it.
        install(pkg)

# Parser discovery: lxml is preferred; html5lib is the fallback when lxml is
# neither importable nor installable.
PARSERS = []
try:
    importlib.import_module("lxml")
except ImportError:
    # lxml is missing: try to install it, then import it again.
    try:
        install("lxml")
        importlib.import_module("lxml")
    except Exception as err:
        print("Could not install lxml:", err)
        # Fallback path: make sure html5lib is present, installing it if needed.
        try:
            importlib.import_module("html5lib")
        except ImportError:
            install("html5lib")
        PARSERS.append("html5lib")
    else:
        PARSERS.append("lxml")
else:
    PARSERS.append("lxml")

import requests
from bs4 import BeautifulSoup
import pandas as pd
from pathlib import Path
# datetime.utcnow() is deprecated since Python 3.12. Take a timezone-aware UTC
# "now" instead and drop tzinfo so the printed timestamp keeps the same naive
# format as before (the literal "UTC" label follows it).
print(
    "Dependencies ready at",
    datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None),
    "UTC",
    "Parsers available:",
    PARSERS,
)

Dependencies ready at 2025-08-11 14:44:46.400361 UTC Parsers available: ['lxml']


# Web analysis

In [14]:
import requests
from bs4 import BeautifulSoup

# Company detail page that is fetched and inspected below.
detail_url = "https://disfold.com/company/toyota-motor-corporation/"
# The request is sent with a desktop-browser User-Agent string.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"
}

resp = requests.get(detail_url, headers=headers, timeout=30)
# Stop on 4xx/5xx responses instead of parsing an error page as if it were
# the company page.
resp.raise_for_status()

# Use whichever parser the setup cell recorded in PARSERS (lxml, or html5lib
# as its fallback); if it recorded none, use the built-in "html.parser".
parser = PARSERS[0] if PARSERS else "html.parser"
soup = BeautifulSoup(resp.text, parser)

# Collect the non-empty text of every heading element (h1-h6).
heading_texts = (
    tag.get_text(strip=True)
    for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
)
section_titles = [txt for txt in heading_texts if txt]

print("Section/Title list:")
for t in section_titles:
    print("-", t)



Section/Title list:
- Toyota
- Market Cap
- Toyota Categories
- Company Description
- Toyota market capitalization over time
- Top Competitors by Market Capitalization
- Industry: Auto Manufacturers
- Sector:
- Detailed Description
- Financials
- Stocks & Indices
- Product & Services
- Key People
- Details
- Tools
- Data
- Disfold DeepFinance
- Disfold
- Stay ahead with the Disfold Newsletter!


In [13]:
# Find the class name of the content block that corresponds to "Toyota Categories", and extract its five category texts into a list
from bs4 import BeautifulSoup

# This cell needs `soup` from the cell that fetches the Toyota detail page.
# The check goes through globals() so that IPython's `_` (last output) is not
# overwritten by a throwaway assignment.
if "soup" not in globals():
    raise RuntimeError("'soup' is not defined. Run the earlier cell that fetches the Toyota detail page first.")


def _is_categories_card(div):
    """Return True when `div` contains an <h3> whose text includes ' categories' (case-insensitive)."""
    h3 = div.find('h3')
    return h3 is not None and ' categories' in h3.get_text(strip=True).lower()


# First `div.card-content` whose <h3> heading matches, or None when there is none.
card_div = next(
    (div for div in soup.select('div.card-content') if _is_categories_card(div)),
    None,
)

if card_div is None:
    raise ValueError("Could not locate the 'Toyota Categories' block on the page. The page structure may have changed.")

# Text of every link inside the block, skipping links without text.
anchor_texts = (a.get_text(strip=True) for a in card_div.find_all('a'))
raw_categories = [text for text in anchor_texts if text]

# Normalize spacing: collapse every run of whitespace (newlines, tabs,
# repeated spaces) into a single space. Original letter case is kept.
categories = [" ".join(c.split()) for c in raw_categories]
print(categories)

# For an upper-case variant use: [c.upper() for c in categories]

categories  # Final value

['Japan', 'Consumer Discretionary', 'Auto Manufacturers', 'Japan Consumer Discretionary', 'Japan Auto Manufacturers']


['Japan',
 'Consumer Discretionary',
 'Auto Manufacturers',
 'Japan Consumer Discretionary',
 'Japan Auto Manufacturers']