In [1]:
import requests
from bs4 import BeautifulSoup
import json
import re
import os
import time

def scrape_month_year_links(base_url):
    """Collect the month/year index links from a bookshelf landing page.

    Fetches ``base_url``, looks up the element whose id is
    ``container_date`` and gathers the ``href`` of every anchor inside it
    whose ``href`` contains the substring ``docmonth``.

    Args:
        base_url: URL of the page to fetch.

    Returns:
        A list of ``href`` strings exactly as they appear in the page (they
        are not resolved against ``base_url``). The list is empty when the
        container element is not found.

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    # requests has no default timeout; without one a stalled connection
    # would block the whole crawl indefinitely.
    response = requests.get(base_url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    container = soup.find(id="container_date")

    if not container:
        return []
    return [
        a['href']
        for a in container.find_all('a', href=True)
        if 'docmonth' in a['href']
    ]

def scrape_document_links(doc_page_url):
    """Collect the document links listed on a month/year index page.

    Fetches ``doc_page_url``, looks up the element whose id is
    ``container_title`` and, for each ``<li>`` inside it, takes the ``href``
    of the first anchor that has one.

    Args:
        doc_page_url: URL of the index page to fetch.

    Returns:
        A list of ``href`` strings exactly as they appear in the page (they
        are not resolved against ``doc_page_url``). The list is empty when
        the container element is not found.

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    # requests has no default timeout; without one a stalled connection
    # would block the whole crawl indefinitely.
    response = requests.get(doc_page_url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    container = soup.find(id="container_title")

    if not container:
        return []

    links = []
    for li in container.find_all('li'):
        a = li.find('a', href=True)
        if a:
            links.append(a['href'])
    return links

def scrape_and_save(url, output_folder):
    """Download one document page and store its contents as a JSON file.

    The page is fetched with up to three attempts, sleeping ``5 ** attempt``
    seconds between attempts. From the parsed HTML the function extracts:

    * ``title``: text of the first ``<h2>``, with square brackets removed;
    * ``text``: text of every ``<div align="justify">`` (with ``<sup>``
      elements removed), joined by blank lines;
    * ``approval_date``: the part of ``text`` matched after ``"Approved, "``
      up to the next period, if any;
    * ``url``: the URL that was requested.

    The JSON file is named after the title with the characters
    ``\\ / : * ? " < > |`` removed. When the page yields no usable title the
    last path segment of ``url`` is used instead, so the document is still
    saved rather than the function crashing on an undefined filename.

    Args:
        url: Absolute URL of the document page.
        output_folder: Existing directory in which the JSON file is written.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'}
    retries = 3
    unsafe_filename_chars = r'[\\/:*?"<>|]'

    response = None
    for attempt in range(retries):
        try:
            # A timeout keeps a stalled connection from hanging the crawl.
            candidate = requests.get(url, headers=headers, timeout=30)
        except requests.exceptions.RequestException:
            candidate = None
        if candidate is not None and candidate.status_code == 200:
            response = candidate
            break
        # Back off only when another attempt will follow.
        if attempt < retries - 1:
            time.sleep(5 ** attempt)  # Exponential backoff: 1s, 5s

    # Covers both network errors and persistent non-200 answers, so an
    # error page is never parsed and saved as if it were a document.
    if response is None:
        print("Failed to retrieve the webpage after multiple attempts")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    data = {"title": "", "text": "", "approval_date": "", "url": url}

    title_element = soup.find('h2')
    if title_element:
        data["title"] = re.sub(r"[\[\]]", "", title_element.text).strip()

    filename_stem = re.sub(unsafe_filename_chars, '', data["title"])
    if not filename_stem:
        # No <h2> (or an empty one): fall back to the URL's last segment.
        url_tail = url.rstrip('/').rsplit('/', 1)[-1]
        filename_stem = re.sub(unsafe_filename_chars, '', url_tail) or 'untitled'
    filename = filename_stem + '.json'

    # The body text lives in the justified <div> elements.
    text_parts = []
    for div in soup.find_all('div', {'align': 'justify'}):
        # Drop <sup> tags, which are usually references or footnotes.
        for sup in div.find_all('sup'):
            sup.decompose()
        text_parts.append(div.get_text(separator="\n", strip=True))

    data["text"] = "\n\n".join(text_parts)

    approval_date_match = re.search(r"Approved, ([\w\s,]+)\.", data["text"])
    if approval_date_match:
        data["approval_date"] = approval_date_match.group(1)

    with open(os.path.join(output_folder, filename), 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=4, ensure_ascii=False)
    print(f"Data has been saved to {filename}")


def main():
    """Crawl the bookshelf index and save every linked document as JSON.

    Creates the output folder if needed, then walks two levels of links:
    the month/year links found on the landing page, and the document links
    found on each month/year page. Each document is handed to
    ``scrape_and_save``.

    Links are resolved with ``urljoin`` against the page they were found on,
    so absolute URLs, root-relative paths and page-relative paths are all
    handled the way a browser would handle them.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urljoin

    base_url = "https://elibrary.judiciary.gov.ph/thebookshelf/28"
    output_folder = 'Republic Acts'
    os.makedirs(output_folder, exist_ok=True)

    for link in scrape_month_year_links(base_url):
        month_url = urljoin(base_url, link)
        for doc_link in scrape_document_links(month_url):
            scrape_and_save(urljoin(month_url, doc_link), output_folder)
            # NOTE: there is deliberately no pause between documents here;
            # add a time.sleep(...) if the server starts throttling.

# Start the crawl only when this file is run as the main program,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Data has been saved to Acts No. 4197, February 12, 1935.json
Data has been saved to Acts No. 4196, February 05, 1935.json
Data has been saved to Acts No. 4195, February 01, 1935.json
Data has been saved to Acts No. 4199, March 16, 1935.json
Data has been saved to Acts No. 4198, March 04, 1935.json
Data has been saved to Acts No. 4200, April 08, 1935.json
Data has been saved to Acts No. 4202, June 20, 1935.json
Data has been saved to Acts No. 4201, June 13, 1935.json
Data has been saved to Acts No. 4206, July 30, 1935.json
Data has been saved to Acts No. 4205, July 25, 1935.json
Data has been saved to Acts No. 4204, July 23, 1935.json
Data has been saved to Acts No. 4203, July 23, 1935.json
Data has been saved to Acts No. 4242, August 26, 1935.json
Data has been saved to Act No. 4241, August 23, 1935.json
Data has been saved to Acts No. 4240, August 22, 1935.json
Data has been saved to Acts No. 4239, August 22, 1935.json
Data has been saved to Acts No. 4238, August 21, 1935.json
Data ha

Data has been saved to Acts No. 4169, December 03, 1934.json
Data has been saved to Acts No. 4165, December 03, 1934.json
Data has been saved to Acts No. 4167, December 03, 1934.json
Data has been saved to Acts No. 4162, December 01, 1934.json
Data has been saved to Acts No. 4163, December 01, 1934.json
Data has been saved to Acts No. 4164, December 01, 1934.json
Data has been saved to Acts No. 4161, December 01, 1934.json
Data has been saved to Acts No. 4158, December 01, 1934.json
Data has been saved to Acts No. 4159, December 01, 1934.json
Data has been saved to Acts No. 4160, December 01, 1934.json
Data has been saved to Acts No. 4154, December 01, 1934.json
Data has been saved to Acts No. 4155, December 01, 1934.json
Data has been saved to Acts No. 4156, December 01, 1934.json
Data has been saved to Acts No. 4157, December 01, 1934.json
Data has been saved to Acts No. 4041, January 21, 1933.json
Data has been saved to Acts No. 4039, January 06, 1933.json
Data has been saved to Act

Data has been saved to Acts No. 4033, December 09, 1932.json
Data has been saved to Act No. 4032, December 09, 1932.json
Data has been saved to Acts No. 4031, December 08, 1932.json
Data has been saved to Acts No. 4030, December 08, 1932.json
Data has been saved to Acts No. 4027, December 08, 1932.json
Data has been saved to Acts No. 4028, December 08, 1932.json
Data has been saved to Acts No. 4029, December 08, 1932.json
Data has been saved to Acts No. 4024, December 08, 1932.json
Data has been saved to Acts No. 4025, December 08, 1932.json
Data has been saved to Acts No. 4026, December 08, 1932.json
Data has been saved to Acts No. 4023, December 08, 1932.json
Data has been saved to Acts No. 4022, December 08, 1932.json
Data has been saved to Acts No. 4021, December 08, 1932.json
Data has been saved to Acts No. 4019, December 07, 1932.json
Data has been saved to Acts No. 4018, December 07, 1932.json
Data has been saved to Acts No. 4020, December 07, 1932.json
Data has been saved to Ac

Data has been saved to Act No. 3870, November 13, 1931.json
Data has been saved to Act No. 3879, November 13, 1931.json
Data has been saved to Act No. 3876, November 13, 1931.json
Data has been saved to Act No. 3877, November 13, 1931.json
Data has been saved to Act No. 3878, November 13, 1931.json
Data has been saved to Act No. 3872, November 13, 1931.json
Data has been saved to Act No. 3873, November 13, 1931.json
Data has been saved to Act No. 3874, November 13, 1931.json
Data has been saved to Act No. 3875, November 13, 1931.json
Data has been saved to Act No. 3871, November 13, 1931.json
Data has been saved to Act No. 3866, November 13, 1931.json
Data has been saved to Act No. 3867, November 13, 1931.json
Data has been saved to Act No. 3868, November 13, 1931.json
Data has been saved to Act No. 3869, November 13, 1931.json
Data has been saved to Act No. 3864, November 13, 1931.json
Data has been saved to Act No. 3865, November 13, 1931.json
Data has been saved to Act No. 3863, Nov

Data has been saved to Act No. 3699, November 20, 1930.json
Data has been saved to Act No. 3700, November 20, 1930.json
Data has been saved to Act No. 3701, November 20, 1930.json
Data has been saved to Act No. 3702, November 20, 1930.json
Data has been saved to Act No. 3696, November 20, 1930.json
Data has been saved to Act No. 3693, November 20, 1930.json
Data has been saved to Act No. 3694, November 20, 1930.json
Data has been saved to Act No. 3692, November 20, 1930.json
Data has been saved to Act No. 3690, November 20, 1930.json
Data has been saved to Act No. 3691, November 20, 1930.json
Data has been saved to Act No. 3706, November 20, 1930.json
Data has been saved to Act No. 3707, November 20, 1930.json
Data has been saved to Act No. 3708, November 20, 1930.json
Data has been saved to Act No. 3709, November 20, 1930.json
Data has been saved to Act No. 3710, November 20, 1930.json
Data has been saved to Act No. 3711, November 20, 1930.json
Data has been saved to Act No. 3712, Nov

Data has been saved to Act No. 3544, November 22, 1929.json
Data has been saved to Act No. 3545, November 22, 1929.json
Data has been saved to Act No. 3546, November 22, 1929.json
Data has been saved to Act No. 3547, November 22, 1929.json
Data has been saved to Act No. 3548, November 22, 1929.json
Data has been saved to Act No. 3538, November 13, 1929.json
Data has been saved to Act No. 3537, November 02, 1929.json
Data has been saved to Act No. 3536, November 01, 1929.json
Data has been saved to Act No. 3669, December 08, 1929.json
Data has been saved to Act No. 3668, December 08, 1929.json
Data has been saved to Act No. 3667, December 08, 1929.json
Data has been saved to Act No. 3670, December 08, 1929.json
Data has been saved to Act No. 3658, December 07, 1929.json
Data has been saved to Act No. 3666, December 07, 1929.json
Data has been saved to Act No. 3665, December 07, 1929.json
Data has been saved to Act No. 3664, December 07, 1929.json
Data has been saved to Act No. 3663, Dec

Data has been saved to Act No. 3469, December 07, 1928.json
Data has been saved to Act No. 3470, December 07, 1928.json
Data has been saved to Act No. 3471, December 07, 1928.json
Data has been saved to Act No. 3472, December 07, 1928.json
Data has been saved to Act No. 3473, December 07, 1928.json
Data has been saved to Act No. 3474, December 07, 1928.json
Data has been saved to Act No. 3475, December 07, 1928.json
Data has been saved to Act No. 3476, December 07, 1928.json
Data has been saved to Act No. 3478, December 07, 1928.json
Data has been saved to Act No. 3479, December 07, 1928.json
Data has been saved to Act No. 3480, December 07, 1928.json
Data has been saved to Act No. 3481, December 07, 1928.json
Data has been saved to Act No. 3482, December 07, 1928.json
Data has been saved to Act No. 3483, December 07, 1928.json
Data has been saved to Act No. 3484, December 07, 1928.json
Data has been saved to Act No. 3477, December 07, 1928.json
Data has been saved to Act No. 3459, Dec

Data has been saved to Act No. 3334, December 07, 1926.json
Data has been saved to Act No. 3330, December 07, 1926.json
Data has been saved to Act No. 3329, December 07, 1926.json
Data has been saved to Act No. 3328, December 07, 1926.json
Data has been saved to Act No. 3340, December 07, 1926.json
Data has been saved to Act No. 3341, December 07, 1926.json
Data has been saved to Act No. 3327, December 06, 1926.json
Data has been saved to Act No. 3326, December 04, 1926.json
Data has been saved to Act No. 3325, December 04, 1926.json
Data has been saved to Act No. 3324, December 04, 1926.json
Data has been saved to Act No. 3323, December 04, 1926.json
Data has been saved to Act No. 3322, December 04, 1926.json
Data has been saved to Act No. 3321, December 04, 1926.json
Data has been saved to Act No. 3320, December 04, 1926.json
Data has been saved to Act No. 3319, December 04, 1926.json
Data has been saved to Act No. 3318, December 04, 1926.json
Data has been saved to Act No. 3317, Dec

KeyboardInterrupt: 