In [1]:
import requests
from bs4 import BeautifulSoup
import json
import re
import os
import time  # Import the time module

# Function to scrape the links for each month and year
def scrape_month_year_links(base_url):
    response = requests.get(base_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    container = soup.find(id="container_date")
    links = []

    if container:
        for a in container.find_all('a', href=True):
            if 'docmonth' in a['href']:
                links.append(a['href'])
    return links

# Function to scrape the links to the documents
def scrape_document_links(doc_page_url):
    response = requests.get(doc_page_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    container = soup.find(id="container_title")
    links = []

    if container:
        for li in container.find_all('li'):
            a = li.find('a', href=True)
            if a:
                links.append(a['href'])
    return links

# Function to scrape content from a document's URL and save as JSON
def scrape_and_save(url, output_folder):
    response = requests.get(url)
    if response.status_code != 200:
        print("Failed to retrieve the webpage")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    data = {"title": "", "text": "", "approval_date": "", "url": url}

    title_element = soup.find('h2')
    if title_element:
        cleaned_title = re.sub(r"[\[\]]", "", title_element.text).strip()
        data["title"] = cleaned_title
        filename = re.sub(r'[\/:*?"<>|]', '', cleaned_title) + '.json'

    full_text_div = soup.find('div', {'align': 'JUSTIFY'})
    if full_text_div:
        for sup in full_text_div.find_all('sup'):
            sup.decompose()
        data["text"] = full_text_div.get_text(separator="\n", strip=True)

    approval_date_match = re.search(r"Approved, ([\w\s,]+)\.", data["text"])
    if approval_date_match:
        data["approval_date"] = approval_date_match.group(1)

    with open(os.path.join(output_folder, filename), 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=4, ensure_ascii=False)
    print(f"Data has been saved to {filename}")

# Main execution function
def main():
    base_url = "https://elibrary.judiciary.gov.ph/thebookshelf/28"
    output_folder = 'Acts'
    os.makedirs(output_folder, exist_ok=True)

    month_year_links = scrape_month_year_links(base_url)
    for link in month_year_links:
        full_url = 'https://elibrary.judiciary.gov.ph' + link if not link.startswith('http') else link
        document_links = scrape_document_links(full_url)
        for doc_link in document_links:
            full_doc_url = 'https://elibrary.judiciary.gov.ph' + doc_link if not doc_link.startswith('http') else doc_link
            scrape_and_save(full_doc_url, output_folder)
            time.sleep(2)  # Sleep for 10 seconds after each document scrape

if __name__ == "__main__":
    main()


Data has been saved to Acts No. 4197, February 12, 1935.json
Data has been saved to Acts No. 4196, February 05, 1935.json
Data has been saved to Acts No. 4195, February 01, 1935.json
Data has been saved to Acts No. 4199, March 16, 1935.json
Data has been saved to Acts No. 4198, March 04, 1935.json
Data has been saved to Acts No. 4200, April 08, 1935.json
Data has been saved to Acts No. 4202, June 20, 1935.json
Data has been saved to Acts No. 4201, June 13, 1935.json
Data has been saved to Acts No. 4206, July 30, 1935.json
Data has been saved to Acts No. 4205, July 25, 1935.json
Data has been saved to Acts No. 4204, July 23, 1935.json
Data has been saved to Acts No. 4203, July 23, 1935.json
Data has been saved to Acts No. 4242, August 26, 1935.json
Data has been saved to Act No. 4241, August 23, 1935.json
Data has been saved to Acts No. 4240, August 22, 1935.json
Data has been saved to Acts No. 4239, August 22, 1935.json
Data has been saved to Acts No. 4238, August 21, 1935.json
Data ha

Data has been saved to Acts No. 4168, December 03, 1934.json
Data has been saved to Acts No. 4169, December 03, 1934.json
Data has been saved to Acts No. 4165, December 03, 1934.json
Data has been saved to Acts No. 4167, December 03, 1934.json
Data has been saved to Acts No. 4162, December 01, 1934.json
Data has been saved to Acts No. 4163, December 01, 1934.json
Data has been saved to Acts No. 4164, December 01, 1934.json
Data has been saved to Acts No. 4161, December 01, 1934.json
Data has been saved to Acts No. 4158, December 01, 1934.json
Data has been saved to Acts No. 4159, December 01, 1934.json
Data has been saved to Acts No. 4160, December 01, 1934.json
Data has been saved to Acts No. 4154, December 01, 1934.json
Data has been saved to Acts No. 4155, December 01, 1934.json
Data has been saved to Acts No. 4156, December 01, 1934.json
Data has been saved to Acts No. 4157, December 01, 1934.json
Data has been saved to Acts No. 4041, January 21, 1933.json
Data has been saved to Ac

Data has been saved to Act No. 4037, December 17, 1932.json
Data has been saved to Acts No. 4033, December 09, 1932.json
Data has been saved to Act No. 4032, December 09, 1932.json
Data has been saved to Acts No. 4031, December 08, 1932.json
Data has been saved to Acts No. 4030, December 08, 1932.json
Data has been saved to Acts No. 4027, December 08, 1932.json
Data has been saved to Acts No. 4028, December 08, 1932.json
Data has been saved to Acts No. 4029, December 08, 1932.json
Data has been saved to Acts No. 4024, December 08, 1932.json
Data has been saved to Acts No. 4025, December 08, 1932.json
Data has been saved to Acts No. 4026, December 08, 1932.json
Data has been saved to Acts No. 4023, December 08, 1932.json
Data has been saved to Acts No. 4022, December 08, 1932.json
Data has been saved to Acts No. 4021, December 08, 1932.json
Data has been saved to Acts No. 4019, December 07, 1932.json
Data has been saved to Acts No. 4018, December 07, 1932.json
Data has been saved to Act

Data has been saved to Act No. 3880, November 14, 1931.json
Data has been saved to Act No. 3870, November 13, 1931.json
Data has been saved to Act No. 3879, November 13, 1931.json
Data has been saved to Act No. 3876, November 13, 1931.json
Data has been saved to Act No. 3877, November 13, 1931.json
Data has been saved to Act No. 3878, November 13, 1931.json
Data has been saved to Act No. 3872, November 13, 1931.json
Data has been saved to Act No. 3873, November 13, 1931.json
Data has been saved to Act No. 3874, November 13, 1931.json
Data has been saved to Act No. 3875, November 13, 1931.json
Data has been saved to Act No. 3871, November 13, 1931.json
Data has been saved to Act No. 3866, November 13, 1931.json
Data has been saved to Act No. 3867, November 13, 1931.json
Data has been saved to Act No. 3868, November 13, 1931.json
Data has been saved to Act No. 3869, November 13, 1931.json
Data has been saved to Act No. 3864, November 13, 1931.json
Data has been saved to Act No. 3865, Nov

Data has been saved to Act No. 3692, November 20, 1930.json
Data has been saved to Act No. 3690, November 20, 1930.json
Data has been saved to Act No. 3691, November 20, 1930.json
Data has been saved to Act No. 3706, November 20, 1930.json
Data has been saved to Act No. 3707, November 20, 1930.json
Data has been saved to Act No. 3708, November 20, 1930.json
Data has been saved to Act No. 3709, November 20, 1930.json
Data has been saved to Act No. 3710, November 20, 1930.json
Data has been saved to Act No. 3711, November 20, 1930.json
Data has been saved to Act No. 3712, November 20, 1930.json
Data has been saved to Act No. 3713, November 20, 1930.json
Data has been saved to Act No. 3703, November 20, 1930.json
Data has been saved to Act No. 3689, November 11, 1930.json
Data has been saved to Act No. 3687, November 07, 1930.json
Data has been saved to Act No. 3688, November 07, 1930.json
Data has been saved to Act No. 3686, November 06, 1930.json
Data has been saved to Act No. 3685, Nov

Data has been saved to Act No. 3537, November 02, 1929.json
Data has been saved to Act No. 3536, November 01, 1929.json
Data has been saved to Act No. 3669, December 08, 1929.json
Data has been saved to Act No. 3668, December 08, 1929.json
Data has been saved to Act No. 3667, December 08, 1929.json
Data has been saved to Act No. 3670, December 08, 1929.json
Data has been saved to Act No. 3658, December 07, 1929.json
Data has been saved to Act No. 3666, December 07, 1929.json
Data has been saved to Act No. 3665, December 07, 1929.json
Data has been saved to Act No. 3664, December 07, 1929.json
Data has been saved to Act No. 3663, December 07, 1929.json
Data has been saved to Act No. 3662, December 07, 1929.json
Data has been saved to Act No. 3661, December 07, 1929.json
Data has been saved to Act No. 3660, December 07, 1929.json
Data has been saved to Act No. 3659, December 07, 1929.json
Data has been saved to Act No. 3657, December 07, 1929.json
Data has been saved to Act No. 3656, Dec

ConnectionError: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))