In [13]:
import hashlib
import os

import requests
from bs4 import BeautifulSoup

In [14]:
# Home page of the site being scraped.
# NOTE(review): no code visible in this section references this constant;
# presumably it is the start URL passed to scrape_urls() -- confirm.
ICIRR_URL = 'https://www.icirr.org/'

In [15]:
def scrape_urls(url, depth):
    """Collect the valid links reachable from ``url`` within ``depth`` hops.

    The crawl proceeds level by level: the links found on ``url`` are hop 1,
    the links found on those pages are hop 2, and so on. Every page is
    requested at most once, so pages that are linked from many places are
    not downloaded and parsed again for each path that leads to them.

    Args:
        url: Address of the page to start from.
        depth: How many levels of links to follow. With ``depth`` of 1 only
            the links on ``url`` itself are returned. A ``depth`` of 0 or
            less returns an empty set without issuing any request.

    Returns:
        A set with every ``href`` accepted by ``is_valid_url`` that was seen
        on a successfully retrieved page. Pages answering with a status other
        than 200 are reported on stdout and contribute no links.
    """
    found = set()
    fetched = set()
    frontier = [url]
    remaining = depth

    # Checking the depth before requesting anything avoids a useless download
    # when the caller asks for zero levels.
    while remaining > 0 and frontier:
        next_frontier = []
        for page_url in frontier:
            # The same link usually appears on many pages; fetch it only once.
            if page_url in fetched:
                continue
            fetched.add(page_url)

            response = requests.get(page_url)
            if response.status_code != 200:
                print("Failed to retrieve page. Status code:", response.status_code)
                continue

            soup = BeautifulSoup(response.text, 'html.parser')
            for link in soup.find_all('a'):
                href = link.get('href')
                # Anchors without an href yield None; skip them instead of
                # turning them into the literal string "None".
                if href is None:
                    continue
                href = str(href)
                # NOTE: hrefs are used exactly as written in the page. A
                # relative link such as "/about" does not contain the domain
                # text is_valid_url looks for and is therefore skipped.
                if not is_valid_url(href):
                    continue
                found.add(href)
                if href not in fetched:
                    next_frontier.append(href)

        frontier = next_frontier
        remaining -= 1

    return found

def is_valid_url(url):
    """Tell whether ``url`` is a link the crawler should keep.

    Args:
        url: Candidate link text; ``None`` and the empty string are rejected.

    Returns:
        ``True`` when ``url`` contains ``"icirr.org"`` and contains neither
        ``"mailto:"`` nor ``"ICIRR-Board"``; ``False`` otherwise.
    """
    if not url:
        return False
    if "icirr.org" not in url:
        return False
    if "mailto:" in url:
        return False
    if "ICIRR-Board" in url:
        return False
    # Without this explicit return the function fell off the end and yielded
    # None for accepted URLs, so callers testing `not is_valid_url(...)`
    # rejected every link.
    return True
    
def save_urls(urls):
    """Write ``urls`` to ``raw_files/urls.txt``, one URL per line.

    Args:
        urls: Iterable of URL strings. They are written in iteration order,
            each followed by a newline, replacing any previous file content.
    """
    # The output directory may not exist on a fresh checkout; without this
    # the open() below fails with FileNotFoundError.
    os.makedirs('raw_files', exist_ok=True)
    with open(os.path.join('raw_files', 'urls.txt'), 'w', encoding='utf-8') as f:
        f.writelines(url + '\n' for url in urls)
    
# File extensions of the document formats handled besides HTML.
# NOTE(review): nothing visible in this section reads this list; save_page()
# decides the extension from the response's Content-Type header instead.
document_types = ['pdf', 'docx', 'doc', 'ppt', 'pptx']

# Media type reported by the server -> extension of the file written for it.
_DOCUMENT_EXTENSIONS = {
    'application/pdf': 'pdf',
    'application/msword': 'doc',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
    'application/vnd.ms-powerpoint': 'ppt',
    'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
}


def save_page(url):
    """Download ``url`` and store it under ``raw_files/``.

    The file name is ``positive_hash(url)`` followed by an extension chosen
    from the response's ``Content-Type``: PDF, Word and PowerPoint responses
    are written as raw bytes with the matching extension; anything else is
    written as UTF-8 text with the ``.html`` extension.

    Args:
        url: Address of the resource to download.

    Returns:
        None. When the server answers with a status other than 200 a message
        is printed and nothing is written.
    """
    response = requests.get(url)
    if response.status_code != 200:
        print("Failed to retrieve page. Status code:", response.status_code)
        return

    # The header may be absent, may carry parameters ("application/pdf;
    # charset=binary") and is case-insensitive, so compare only the
    # normalised media type rather than the raw header value.
    content_type = response.headers.get('Content-Type', '')
    media_type = content_type.split(';', 1)[0].strip().lower()

    # Create the output directory before any branch writes into it; it used
    # to be created only on the HTML path, after the document branches.
    os.makedirs('raw_files', exist_ok=True)

    title = positive_hash(url)
    extension = _DOCUMENT_EXTENSIONS.get(media_type)

    if extension is not None:
        # Binary document: keep the payload byte-for-byte.
        with open(os.path.join('raw_files', f"{title}.{extension}"), 'wb') as f:
            f.write(response.content)
        return

    # Everything else is stored as decoded text.
    with open(os.path.join('raw_files', f"{title}.html"), 'w', encoding='utf-8') as f:
        f.write(response.text)

def load_urls(path='urls.txt'):
    """Read a list of URLs from a text file, one per line.

    Args:
        path: File to read. Defaults to ``urls.txt`` in the current working
            directory. Note that ``save_urls`` writes to
            ``raw_files/urls.txt``; pass that path to read its output back.

    Returns:
        A set of the non-blank lines of the file with surrounding whitespace
        removed.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    urls = set()
    # Read as UTF-8 to match the encoding save_urls() writes with, instead of
    # depending on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            entry = line.strip()
            # Blank lines (e.g. a trailing newline) would otherwise become an
            # empty-string "URL" that cannot be requested.
            if entry:
                urls.add(entry)
    return urls

def positive_hash(obj):
    """Return a non-negative integer hash of ``obj`` in the range 0..2**64-1.

    Text and bytes are hashed with SHA-256 (first 8 bytes of the digest), so
    the same URL maps to the same number in every Python process. The
    built-in ``hash()`` is salted per process for ``str`` and ``bytes``,
    which would give the same URL a different file name after each kernel
    restart.

    Args:
        obj: Value to hash. ``str`` is encoded as UTF-8 first; any other
            non-bytes object must be hashable and falls back to ``hash()``.

    Returns:
        A non-negative ``int``.
    """
    if isinstance(obj, str):
        obj = obj.encode('utf-8')
    if isinstance(obj, bytes):
        digest = hashlib.sha256(obj).digest()
        return int.from_bytes(digest[:8], 'big')

    h = hash(obj)

    # Built-in hashes may be negative; masking with 64 bits maps them onto
    # the non-negative range while leaving non-negative values untouched.
    return h & 0xFFFFFFFFFFFFFFFF if h < 0 else h

In [16]:
# Download every URL returned by load_urls() with save_page(), which stores
# each response under raw_files/. Failed requests are reported by save_page()
# on stdout and skipped; the loop carries on with the next URL.
urls = load_urls()
for url in urls:
    save_page(url)
print("Done scraping pages")

Failed to retrieve page. Status code: 404
Done scraping pages
