In [1]:
import requests
from bs4 import BeautifulSoup
import json
import re
import os
import time

def scrape_month_year_links(base_url):
    """Return the hrefs of the month/year index links found on *base_url*.

    The page is fetched with a browser-like User-Agent and parsed with
    BeautifulSoup.  Only anchors inside the element whose id is
    ``container_date`` and whose href contains ``'docmonth'`` are kept.

    Args:
        base_url: URL of the index page to fetch.

    Returns:
        A list of href strings exactly as they appear in the page (they may
        be relative).  The list is empty when the container is missing or
        when the page could not be fetched.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(base_url, headers=headers, timeout=30)
    except requests.exceptions.RequestException as exc:
        print(f"Failed to retrieve {base_url}: {exc}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    container = soup.find(id="container_date")
    if container is None:
        return []

    return [
        a['href']
        for a in container.find_all('a', href=True)
        if 'docmonth' in a['href']
    ]

def scrape_document_links(doc_page_url):
    """Return the document hrefs listed on a month/year page.

    Within the element whose id is ``container_title``, the first anchor
    carrying an href inside each ``<li>`` is collected.

    Args:
        doc_page_url: URL of the listing page to fetch.

    Returns:
        A list of href strings exactly as they appear in the page (they may
        be relative).  The list is empty when the container is missing or
        when the page could not be fetched.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(doc_page_url, headers=headers, timeout=30)
    except requests.exceptions.RequestException as exc:
        print(f"Failed to retrieve {doc_page_url}: {exc}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    container = soup.find(id="container_title")
    if container is None:
        return []

    # One link per list item: the first anchor that actually has an href.
    anchors = (li.find('a', href=True) for li in container.find_all('li'))
    return [a['href'] for a in anchors if a is not None]

def _fetch_with_retries(url, headers, retries=3, timeout=30):
    """Return a 200 response for *url*, or None once every attempt has failed."""
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.exceptions.RequestException:
            response = None
        if response is not None and response.status_code == 200:
            return response
        if attempt < retries - 1:
            # Exponential backoff (1s, 5s, ...); no point sleeping after the last try.
            time.sleep(5 ** attempt)
    return None


def _safe_filename(title, url):
    """Build a filesystem-safe ``.json`` file name from *title*, falling back to *url*."""
    url_tail = url.rstrip('/').rsplit('/', 1)[-1]
    for candidate in (title, url_tail):
        # Drop characters that are invalid in file names and collapse any
        # whitespace run (including newlines) into a single space.
        stem = re.sub(r'[\\/:*?"<>|]', '', candidate)
        stem = re.sub(r'\s+', ' ', stem).strip()
        if stem:
            return stem + '.json'
    return 'untitled.json'


def scrape_and_save(url, output_folder):
    """Download one document page and save its extracted fields as JSON.

    The saved object has the keys ``title`` (text of the first ``<h2>`` with
    square brackets removed), ``text`` (text of the ``<div align="JUSTIFY">``
    with ``<sup>`` elements removed), ``approval_date`` (taken from an
    ``Approved, <date>.`` phrase in the text, if any) and ``url``.  Fields
    that cannot be found are left as empty strings.

    Args:
        url: URL of the document page.
        output_folder: Existing directory in which the JSON file is written.

    Returns:
        None.  Nothing is written when the page cannot be fetched with a
        200 status after all retries; a failure message is printed instead.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'}
    response = _fetch_with_retries(url, headers, retries=3)
    if response is None:
        # Covers both network errors and persistent non-200 answers, so an
        # error page is never parsed and saved as if it were a document.
        print("Failed to retrieve the webpage after multiple attempts")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    data = {"title": "", "text": "", "approval_date": "", "url": url}

    title_element = soup.find('h2')
    if title_element:
        data["title"] = re.sub(r"[\[\]]", "", title_element.text).strip()
    # Always defined, even when the page has no <h2> heading.
    filename = _safe_filename(data["title"], url)

    full_text_div = soup.find('div', {'align': 'JUSTIFY'})
    if full_text_div:
        # Remove superscripts before extracting the text.
        for sup in full_text_div.find_all('sup'):
            sup.decompose()
        # One entry per non-empty text fragment, joined with blank lines.
        raw_text = full_text_div.get_text(separator="\n", strip=True)
        text_parts = [part.strip() for part in raw_text.split("\n") if part.strip()]
        data["text"] = "\n\n".join(text_parts)

    approval_date_match = re.search(r"Approved, ([\w\s,]+)\.", data["text"])
    if approval_date_match:
        data["approval_date"] = approval_date_match.group(1)

    with open(os.path.join(output_folder, filename), 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=4, ensure_ascii=False)
    print(f"Data has been saved to {filename}")

def main():
    """Crawl the bookshelf index and save every linked document as JSON.

    Fetches the month/year links from the index page, then the document
    links on each month/year page, and passes every document URL to
    ``scrape_and_save``.  Output goes to the ``Acts`` folder, which is
    created if it does not exist.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import urljoin

    base_url = "https://elibrary.judiciary.gov.ph/thebookshelf/28"
    output_folder = 'Acts'
    os.makedirs(output_folder, exist_ok=True)

    for month_link in scrape_month_year_links(base_url):
        # urljoin leaves absolute links untouched and resolves root-relative
        # or page-relative hrefs against the page they were found on.
        month_url = urljoin(base_url, month_link)
        for doc_link in scrape_document_links(month_url):
            scrape_and_save(urljoin(month_url, doc_link), output_folder)
#             time.sleep(5)  # Sleep for 5 seconds after each document scrape to avoid overwhelming the server

# Start the crawl only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()


Data has been saved to BATAS PAMBANSA BLG. 866, January 31, 1985.json
Data has been saved to BATAS PAMBANSA BLG. 868, February 20, 1985.json
Data has been saved to BATAS PAMBANSA BLG. 869, April 18, 1985.json
Data has been saved to BATAS PAMBANSA BLG. 870, April 18, 1985.json
Data has been saved to BATAS PAMBANSA BLG. 871, May 29, 1985.json
Data has been saved to BATAS PAMBANSA BLG. 873, June 12, 1985.json
Data has been saved to BATAS PAMBANSA BLG. 874, June 12, 1985.json
Data has been saved to BATAS PAMBANSA BLG. 875, June 12, 1985.json
Data has been saved to BATAS PAMBANSA BLG. 876, June 12, 1985.json
Data has been saved to BATAS PAMBANSA BLG. 877, June 12, 1985.json
Data has been saved to BATAS PAMBANSA BLG. 872, June 10, 1985.json
Data has been saved to BATAS PAMBANSA BLG. 878, July 09, 1985.json
Data has been saved to BATAS PAMBANSA BLG. 879, October 22, 1985.json
Data has been saved to BATAS PAMBANSA BLG. 880, October 22, 1985.json
Data has been saved to BATAS PAMBANSA BLG. 882, 

Data has been saved to BATAS PAMBANSA BLG. 827, April 27, 1984.json
Data has been saved to BATAS PAMBANSA BLG. 828, April 27, 1984.json
Data has been saved to BATAS PAMBANSA BLG. 829, April 27, 1984.json
Data has been saved to BATAS PAMBANSA BLG. 830, April 27, 1984.json
Data has been saved to BATAS PAMBANSA BLG. 831, April 27, 1984.json
Data has been saved to BATAS PAMBANSA BLG. 832, April 27, 1984.json
Data has been saved to BATAS PAMBANSA BLG. 833, April 27, 1984.json
Data has been saved to BATAS PAMBANSA BLG. 834, April 27, 1984.json
Data has been saved to BATAS PAMBANSA BLG. 835, April 27, 1984.json
Data has been saved to BATAS PAMBANSA BLG. 836, April 27, 1984.json
Data has been saved to BATAS PAMBANSA BLG. 837, April 27, 1984.json
Data has been saved to BATAS PAMBANSA BLG. 838, April 27, 1984.json
Data has been saved to BATAS PAMBANSA BLG. 839, April 27, 1984.json
Data has been saved to BATAS PAMBANSA BLG. 840, April 27, 1984.json
Data has been saved to BATAS PAMBANSA BLG. 841, 

KeyboardInterrupt: 