In [3]:
## Individual TAX information

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, urlunparse
import time
from random import uniform

# Request headers passed to requests.get() for every page fetched in this cell.
HEADERS = {"User-Agent": "Mozilla/5.0"}

# URLs that the crawl loop has already processed (module-level, shared state).
visited = set()

def extract_table_as_markdown(table):
    """Convert an HTML table element into a Markdown pipe table.

    Every ``<tr>`` found under ``table`` becomes one Markdown row built from
    its ``<th>``/``<td>`` cells, in order. The first row that has cells is
    used as the header, and rows with fewer cells than the widest row are
    padded with empty cells so the table stays rectangular.

    Args:
        table: A BeautifulSoup ``<table>`` tag (any object whose ``find_all``
            yields rows, whose rows' ``find_all`` yields cells, and whose
            cells provide ``get_text`` will work).

    Returns:
        The Markdown table as a newline-joined string, or ``""`` when no row
        contains any cell.
    """
    rows = []
    for tr in table.find_all('tr'):
        cells = tr.find_all(['th', 'td'])
        if not cells:  # Skip rows without any cells
            continue
        row_data = []
        for cell in cells:
            # Join text fragments with a space so inline markup such as
            # "Tax <b>rate</b>" does not collapse into "Taxrate".
            text = cell.get_text(' ', strip=True).replace('\xa0', ' ')
            # Collapse runs of whitespace (including newlines): a Markdown
            # table row has to stay on a single line.
            text = ' '.join(text.split())
            # A literal pipe would otherwise be read as a column separator.
            text = text.replace('|', '\\|')
            row_data.append(text)
        rows.append(row_data)

    if not rows:
        return ""

    # Ensure all rows have the same number of columns
    max_cols = max(len(row) for row in rows)
    for row in rows:
        row.extend([''] * (max_cols - len(row)))

    # Create markdown table: header, dashed separator, then the body rows
    header, *body = rows
    markdown = [
        '| ' + ' | '.join(header) + ' |',
        '|' + '|'.join('-' * (len(col) + 2) for col in header) + '|',
    ]
    markdown.extend('| ' + ' | '.join(row) + ' |' for row in body)

    return '\n'.join(markdown)

def get_page_text_and_links(url, path_prefix="/en/individual", allowed_netloc="www.hasil.gov.my"):
    """Fetch one page and return its main-content text plus the in-scope links.

    The request is preceded by a random 1-3 second pause and is sent with the
    module-level ``HEADERS`` and a 15 second timeout. Text is taken from every
    ``div`` whose class attribute contains ``content-col``: paragraphs,
    headings and lists are emitted as plain text (exact repeats are dropped)
    and tables are converted with ``extract_table_as_markdown``.

    Args:
        url: Absolute URL of the page to download.
        path_prefix: Only links whose URL path starts with this prefix are
            returned.
        allowed_netloc: Only links pointing at this host are returned.

    Returns:
        ``(text, links)`` where ``text`` is the newline-joined content and
        ``links`` is a sorted list of absolute URLs with fragments removed.
        If anything raises (network error, HTTP error status, parsing
        failure) the error is printed and ``(None, [])`` is returned.
    """
    try:
        time.sleep(uniform(1, 3))  # Be nice to the server
        response = requests.get(url, headers=HEADERS, timeout=15)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, "html.parser")

        # Collect links first: the content pass below decomposes tables and
        # junk tags, which would silently drop any links they contain.
        links = set()
        for a in soup.find_all("a", href=True):
            try:
                parsed = urlparse(urljoin(url, a['href']))
            except ValueError:
                # A single malformed href must not discard the whole page.
                continue
            if parsed.netloc == allowed_netloc and parsed.path.startswith(path_prefix):
                # Strip the fragment (e.g. #site-content)
                links.add(urlunparse(parsed._replace(fragment="")))

        content = []
        seen_content = set()

        for div in soup.select("div[class*='content-col']"):
            # Remove junk tags
            for tag in div(['script', 'style', 'nav', 'footer', 'iframe']):
                tag.decompose()

            # Process content in its original order
            for element in div.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'table']):
                if element.name == 'table':
                    # Convert table to markdown
                    markdown_table = extract_table_as_markdown(element)
                    if markdown_table:
                        content.append(markdown_table)

                    # Remove the table from the tree afterwards, presumably so
                    # text nested inside it is not emitted again as paragraphs.
                    element.decompose()
                else:
                    # Process regular text elements, skipping exact repeats
                    text = element.get_text('\n', strip=True)
                    if text and text not in seen_content:
                        content.append(text)
                        seen_content.add(text)

        return "\n".join(content), sorted(links)

    except Exception as e:
        # Best-effort crawl: report the failure and let the caller move on.
        print(f"❌ Error scraping {url}: {e}")
        return None, []

def scrape_all_individual_pages(start_url):
    """Crawl breadth-first from ``start_url`` and collect the text of each page.

    Pages are fetched with ``get_page_text_and_links``; every link it returns
    that has not been seen yet is appended to the queue. URLs are recorded in
    the module-level ``visited`` set, so a URL already present there (for
    example from an earlier call without resetting ``visited``) is skipped.

    Args:
        start_url: URL the crawl starts from.

    Returns:
        Dict mapping each URL that produced non-empty text to that text.
        Pages that failed or had no extractable content are omitted.
    """
    from collections import deque

    to_visit = deque([start_url])
    # Every URL ever queued, so the membership test is O(1) instead of a
    # linear scan of the pending list.
    queued = {start_url}
    scraped_data = {}

    while to_visit:
        current_url = to_visit.popleft()
        if current_url in visited:
            continue

        visited.add(current_url)
        print(f"Scraping: {current_url}")

        page_text, found_links = get_page_text_and_links(current_url)
        if page_text:
            scraped_data[current_url] = page_text

        for link in found_links:
            if link not in visited and link not in queued:
                queued.add(link)
                to_visit.append(link)
    print(f"✅ Total links visited: {len(visited)}")

    return scraped_data

# Start from main individual page; `results` maps each scraped URL to the
# text extracted from it.
root_url = "https://www.hasil.gov.my/en/individual/"
results = scrape_all_individual_pages(root_url)

# Save results: each page is written as a "--- <url> ---" header line
# followed by its extracted text.
# NOTE(review): the site-wide "TEST information" cell also writes to
# "lhdn-test.txt", so whichever cell runs last overwrites the other's
# output — confirm whether this crawl should have its own file.
with open("lhdn-test.txt", "w", encoding="utf-8") as f:
    for url, content in results.items():
        f.write(f"\n--- {url} ---\n{content}\n")

Scraping: https://www.hasil.gov.my/en/individual/
Scraping: https://www.hasil.gov.my/en/individual/individual-life-cycle/
Scraping: https://www.hasil.gov.my/en/individual/individual-life-cycle/how-to-declare-income/
Scraping: https://www.hasil.gov.my/en/individual/individual-life-cycle/payment/
Scraping: https://www.hasil.gov.my/en/individual/individual-life-cycle/registration/
Scraping: https://www.hasil.gov.my/en/individual/individual-life-cycle/residence-status/
Scraping: https://www.hasil.gov.my/en/individual/introduction-individual-income-tax/
❌ Error scraping https://www.hasil.gov.my/en/individual/introduction-individual-income-tax/: HTTPSConnectionPool(host='www.hasil.gov.my', port=443): Read timed out. (read timeout=15)
Scraping: https://www.hasil.gov.my/en/individual/introduction-individual-income-tax/how-to-file-your-tax/
Scraping: https://www.hasil.gov.my/en/individual/introduction-individual-income-tax/how-to-submit-your-itrf/
Scraping: https://www.hasil.gov.my/en/individua

In [None]:
## Company TAX information

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, urlunparse
import time
from random import uniform

# Request headers passed to requests.get() for every page fetched in this cell.
HEADERS = {"User-Agent": "Mozilla/5.0"}

# URLs that the crawl loop has already processed (module-level, shared state).
visited = set()

def extract_table_as_markdown(table):
    """Convert an HTML table element into a Markdown pipe table.

    Every ``<tr>`` found under ``table`` becomes one Markdown row built from
    its ``<th>``/``<td>`` cells, in order. The first row that has cells is
    used as the header, and rows with fewer cells than the widest row are
    padded with empty cells so the table stays rectangular.

    Args:
        table: A BeautifulSoup ``<table>`` tag (any object whose ``find_all``
            yields rows, whose rows' ``find_all`` yields cells, and whose
            cells provide ``get_text`` will work).

    Returns:
        The Markdown table as a newline-joined string, or ``""`` when no row
        contains any cell.
    """
    rows = []
    for tr in table.find_all('tr'):
        cells = tr.find_all(['th', 'td'])
        if not cells:  # Skip rows without any cells
            continue
        row_data = []
        for cell in cells:
            # Join text fragments with a space so inline markup such as
            # "Tax <b>rate</b>" does not collapse into "Taxrate".
            text = cell.get_text(' ', strip=True).replace('\xa0', ' ')
            # Collapse runs of whitespace (including newlines): a Markdown
            # table row has to stay on a single line.
            text = ' '.join(text.split())
            # A literal pipe would otherwise be read as a column separator.
            text = text.replace('|', '\\|')
            row_data.append(text)
        rows.append(row_data)

    if not rows:
        return ""

    # Ensure all rows have the same number of columns
    max_cols = max(len(row) for row in rows)
    for row in rows:
        row.extend([''] * (max_cols - len(row)))

    # Create markdown table: header, dashed separator, then the body rows
    header, *body = rows
    markdown = [
        '| ' + ' | '.join(header) + ' |',
        '|' + '|'.join('-' * (len(col) + 2) for col in header) + '|',
    ]
    markdown.extend('| ' + ' | '.join(row) + ' |' for row in body)

    return '\n'.join(markdown)

def get_page_text_and_links(url, path_prefix="/en/company", allowed_netloc="www.hasil.gov.my"):
    """Fetch one page and return its main-content text plus the in-scope links.

    The request is preceded by a random 1-3 second pause and is sent with the
    module-level ``HEADERS`` and a 15 second timeout. Text is taken from every
    ``div`` whose class attribute contains ``content-col``: paragraphs,
    headings and lists are emitted as plain text (exact repeats are dropped)
    and tables are converted with ``extract_table_as_markdown``.

    Args:
        url: Absolute URL of the page to download.
        path_prefix: Only links whose URL path starts with this prefix are
            returned.
        allowed_netloc: Only links pointing at this host are returned.

    Returns:
        ``(text, links)`` where ``text`` is the newline-joined content and
        ``links`` is a sorted list of absolute URLs with fragments removed.
        If anything raises (network error, HTTP error status, parsing
        failure) the error is printed and ``(None, [])`` is returned.
    """
    try:
        time.sleep(uniform(1, 3))  # Be nice to the server
        response = requests.get(url, headers=HEADERS, timeout=15)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, "html.parser")

        # Collect links first: the content pass below decomposes tables and
        # junk tags, which would silently drop any links they contain.
        links = set()
        for a in soup.find_all("a", href=True):
            try:
                parsed = urlparse(urljoin(url, a['href']))
            except ValueError:
                # A single malformed href must not discard the whole page.
                continue
            if parsed.netloc == allowed_netloc and parsed.path.startswith(path_prefix):
                # Strip the fragment (e.g. #site-content)
                links.add(urlunparse(parsed._replace(fragment="")))

        content = []
        seen_content = set()

        for div in soup.select("div[class*='content-col']"):
            # Remove junk tags
            for tag in div(['script', 'style', 'nav', 'footer', 'iframe']):
                tag.decompose()

            # Process content in its original order
            for element in div.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'table']):
                if element.name == 'table':
                    # Convert table to markdown
                    markdown_table = extract_table_as_markdown(element)
                    if markdown_table:
                        content.append(markdown_table)

                    # Remove the table from the tree afterwards, presumably so
                    # text nested inside it is not emitted again as paragraphs.
                    element.decompose()
                else:
                    # Process regular text elements, skipping exact repeats
                    text = element.get_text('\n', strip=True)
                    if text and text not in seen_content:
                        content.append(text)
                        seen_content.add(text)

        return "\n".join(content), sorted(links)

    except Exception as e:
        # Best-effort crawl: report the failure and let the caller move on.
        print(f"❌ Error scraping {url}: {e}")
        return None, []

def scrape_all_individual_pages(start_url):
    """Crawl breadth-first from ``start_url`` and collect the text of each page.

    Despite the name, nothing in this function is specific to the
    "individual" section: which links are followed is decided entirely by
    ``get_page_text_and_links``. URLs are recorded in the module-level
    ``visited`` set, so a URL already present there (for example from an
    earlier call without resetting ``visited``) is skipped.

    Args:
        start_url: URL the crawl starts from.

    Returns:
        Dict mapping each URL that produced non-empty text to that text.
        Pages that failed or had no extractable content are omitted.
    """
    from collections import deque

    to_visit = deque([start_url])
    # Every URL ever queued, so the membership test is O(1) instead of a
    # linear scan of the pending list.
    queued = {start_url}
    scraped_data = {}

    while to_visit:
        current_url = to_visit.popleft()
        if current_url in visited:
            continue

        visited.add(current_url)
        print(f"Scraping: {current_url}")

        page_text, found_links = get_page_text_and_links(current_url)
        if page_text:
            scraped_data[current_url] = page_text

        for link in found_links:
            if link not in visited and link not in queued:
                queued.add(link)
                to_visit.append(link)
    print(f"✅ Total links visited: {len(visited)}")

    return scraped_data

# Start from the main company page; `results` maps each scraped URL to the
# text extracted from it.
root_url = "https://www.hasil.gov.my/en/company/"
results = scrape_all_individual_pages(root_url)

# Save results: each page is written as a "--- <url> ---" header line
# followed by its extracted text.
with open("lhdn-company.txt", "w", encoding="utf-8") as f:
    for url, content in results.items():
        f.write(f"\n--- {url} ---\n{content}\n")

Scraping: https://www.hasil.gov.my/en/company/
{'https://www.hasil.gov.my/en/company/corporate-tax/', 'https://www.hasil.gov.my/en/company/digital-business/', 'https://www.hasil.gov.my/en/company/appeal/', 'https://www.hasil.gov.my/en/company/', 'https://www.hasil.gov.my/en/company/cooporative-tax/', 'https://www.hasil.gov.my/en/company/frequently-asked-question-company/', 'https://www.hasil.gov.my/en/company/tax-rate-of-company/', 'https://www.hasil.gov.my/en/company/basis-period-for-company/', 'https://www.hasil.gov.my/en/company/company-resident-status/', 'https://www.hasil.gov.my/en/company/amending-the-income-tax-return-form/', 'https://www.hasil.gov.my/en/company/certificate-of-resident/', 'https://www.hasil.gov.my/en/company/tax-estimation/', 'https://www.hasil.gov.my/en/company/update-company-information/', 'https://www.hasil.gov.my/en/company/non-resident-company/', 'https://www.hasil.gov.my/en/company/other-situation/', 'https://www.hasil.gov.my/en/company/sme/', 'https://www

In [4]:
## TEST information

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, urlunparse
import time
from random import uniform

# Request headers passed to requests.get() for every page fetched in this cell.
HEADERS = {"User-Agent": "Mozilla/5.0"}

# URLs that the crawl loop has already processed (module-level, shared state).
visited = set()

def extract_table_as_markdown(table):
    """Convert an HTML table element into a Markdown pipe table.

    Every ``<tr>`` found under ``table`` becomes one Markdown row built from
    its ``<th>``/``<td>`` cells, in order. The first row that has cells is
    used as the header, and rows with fewer cells than the widest row are
    padded with empty cells so the table stays rectangular.

    Args:
        table: A BeautifulSoup ``<table>`` tag (any object whose ``find_all``
            yields rows, whose rows' ``find_all`` yields cells, and whose
            cells provide ``get_text`` will work).

    Returns:
        The Markdown table as a newline-joined string, or ``""`` when no row
        contains any cell.
    """
    rows = []
    for tr in table.find_all('tr'):
        cells = tr.find_all(['th', 'td'])
        if not cells:  # Skip rows without any cells
            continue
        row_data = []
        for cell in cells:
            # Join text fragments with a space so inline markup such as
            # "Tax <b>rate</b>" does not collapse into "Taxrate".
            text = cell.get_text(' ', strip=True).replace('\xa0', ' ')
            # Collapse runs of whitespace (including newlines): a Markdown
            # table row has to stay on a single line.
            text = ' '.join(text.split())
            # A literal pipe would otherwise be read as a column separator.
            text = text.replace('|', '\\|')
            row_data.append(text)
        rows.append(row_data)

    if not rows:
        return ""

    # Ensure all rows have the same number of columns
    max_cols = max(len(row) for row in rows)
    for row in rows:
        row.extend([''] * (max_cols - len(row)))

    # Create markdown table: header, dashed separator, then the body rows
    header, *body = rows
    markdown = [
        '| ' + ' | '.join(header) + ' |',
        '|' + '|'.join('-' * (len(col) + 2) for col in header) + '|',
    ]
    markdown.extend('| ' + ' | '.join(row) + ' |' for row in body)

    return '\n'.join(markdown)

def get_page_text_and_links(url, path_prefix="/en", allowed_netloc="www.hasil.gov.my"):
    """Fetch one page and return its main-content text plus the in-scope links.

    The request is preceded by a random 1-3 second pause and is sent with the
    module-level ``HEADERS`` and a 15 second timeout. Text is taken from every
    ``div`` whose class attribute contains ``content-col``: paragraphs,
    headings and lists are emitted as plain text (exact repeats are dropped)
    and tables are converted with ``extract_table_as_markdown``.

    Args:
        url: Absolute URL of the page to download.
        path_prefix: Only links whose URL path starts with this prefix are
            returned.
        allowed_netloc: Only links pointing at this host are returned.

    Returns:
        ``(text, links)`` where ``text`` is the newline-joined content and
        ``links`` is a sorted list of absolute URLs with fragments removed.
        If anything raises (network error, HTTP error status, parsing
        failure) the error is printed and ``(None, [])`` is returned.
    """
    try:
        time.sleep(uniform(1, 3))  # Be nice to the server
        response = requests.get(url, headers=HEADERS, timeout=15)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, "html.parser")

        # Collect links first: the content pass below decomposes tables and
        # junk tags, which would silently drop any links they contain.
        links = set()
        for a in soup.find_all("a", href=True):
            try:
                parsed = urlparse(urljoin(url, a['href']))
            except ValueError:
                # A single malformed href must not discard the whole page.
                continue
            if parsed.netloc == allowed_netloc and parsed.path.startswith(path_prefix):
                # Strip the fragment (e.g. #site-content)
                links.add(urlunparse(parsed._replace(fragment="")))

        content = []
        seen_content = set()

        for div in soup.select("div[class*='content-col']"):
            # Remove junk tags
            for tag in div(['script', 'style', 'nav', 'footer', 'iframe']):
                tag.decompose()

            # Process content in its original order
            for element in div.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'table']):
                if element.name == 'table':
                    # Convert table to markdown
                    markdown_table = extract_table_as_markdown(element)
                    if markdown_table:
                        content.append(markdown_table)

                    # Remove the table from the tree afterwards, presumably so
                    # text nested inside it is not emitted again as paragraphs.
                    element.decompose()
                else:
                    # Process regular text elements, skipping exact repeats
                    text = element.get_text('\n', strip=True)
                    if text and text not in seen_content:
                        content.append(text)
                        seen_content.add(text)

        return "\n".join(content), sorted(links)

    except Exception as e:
        # Best-effort crawl: report the failure and let the caller move on.
        print(f"❌ Error scraping {url}: {e}")
        return None, []

def scrape_all_individual_pages(start_url):
    """Crawl breadth-first from ``start_url`` and collect the text of each page.

    Despite the name, nothing in this function is specific to the
    "individual" section: which links are followed is decided entirely by
    ``get_page_text_and_links``. URLs are recorded in the module-level
    ``visited`` set, so a URL already present there (for example from an
    earlier call without resetting ``visited``) is skipped.

    Args:
        start_url: URL the crawl starts from.

    Returns:
        Dict mapping each URL that produced non-empty text to that text.
        Pages that failed or had no extractable content are omitted.
    """
    from collections import deque

    to_visit = deque([start_url])
    # Every URL ever queued, so the membership test is O(1) instead of a
    # linear scan of the pending list.
    queued = {start_url}
    scraped_data = {}

    while to_visit:
        current_url = to_visit.popleft()
        if current_url in visited:
            continue

        visited.add(current_url)
        print(f"Scraping: {current_url}")

        page_text, found_links = get_page_text_and_links(current_url)
        if page_text:
            scraped_data[current_url] = page_text

        for link in found_links:
            if link not in visited and link not in queued:
                queued.add(link)
                to_visit.append(link)
    print(f"✅ Total links visited: {len(visited)}")

    return scraped_data

# Start from the English site root; `results` maps each scraped URL to the
# text extracted from it.
root_url = "https://www.hasil.gov.my/en/"
results = scrape_all_individual_pages(root_url)

# Save results: each page is written as a "--- <url> ---" header line
# followed by its extracted text.
# NOTE(review): the "Individual TAX information" cell also writes to
# "lhdn-test.txt", so whichever cell runs last overwrites the other's
# output — confirm this is intended.
with open("lhdn-test.txt", "w", encoding="utf-8") as f:
    for url, content in results.items():
        f.write(f"\n--- {url} ---\n{content}\n")

Scraping: https://www.hasil.gov.my/en/
Scraping: https://www.hasil.gov.my/en/about-hasil/
Scraping: https://www.hasil.gov.my/en/about-hasil/academy/
Scraping: https://www.hasil.gov.my/en/about-hasil/best-tax-payer-award/
Scraping: https://www.hasil.gov.my/en/about-hasil/corporate-culture/
Scraping: https://www.hasil.gov.my/en/about-hasil/corporate-profile/
Scraping: https://www.hasil.gov.my/en/about-hasil/hasil-directory/
Scraping: https://www.hasil.gov.my/en/about-hasil/hasil-integrity/
Scraping: https://www.hasil.gov.my/en/announcement/?tajuk=
Scraping: https://www.hasil.gov.my/en/beware-of-tax-scams/
Scraping: https://www.hasil.gov.my/en/career/
Scraping: https://www.hasil.gov.my/en/company/
Scraping: https://www.hasil.gov.my/en/company/amending-the-income-tax-return-form/
Scraping: https://www.hasil.gov.my/en/company/appeal/
Scraping: https://www.hasil.gov.my/en/company/basis-period-for-company/
Scraping: https://www.hasil.gov.my/en/company/certificate-of-resident/
Scraping: https: