In [14]:
%pip install -qU langchain-community beautifulsoup4
%pip install markdownify
%pip install psycopg2 

from langchain_community.document_loaders import RecursiveUrlLoader # Loads documents recrusively from URLS
from langchain_community.document_transformers import MarkdownifyTransformer # Converts documents to Markdown
import asyncio # Library for writing concurrent code using async/await syntax
from bs4 import BeautifulSoup # Library for parsing HTML and XML docs
import re # Regular expression matching operations
import psycopg2
import urllib.robotparser
import logging # Library for configuring and implementing logging

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [15]:
# Configure logging to track what happens during execution of code
# and discover issues. 

logging.basicConfig(
    level=logging.INFO,   # Set minimum logging level to INFO (logs this level and above will be captured)
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',  # Set the log message format
    filename='app.log',  # Define filename where logs written
    filemode='a'  # Set mode to append ('w' for overwrite)
)

# Get the logger named 'ExampleLogger' for tracking application events.
logger = logging.getLogger('ExampleLogger')

# Log an example informational message.
logger.info('This is an informational message.')

# Read and print the log file to verify the logging output.
with open('app.log', 'r') as file:
    content = file.read()
    print(content)


2024-09-06 14:40:11,420 - ExampleLogger - INFO - This is an informational message.
2024-09-06 15:24:37,397 - ExampleLogger - INFO - This is an informational message.



In [16]:
rp = urllib.robotparser.RobotFileParser()
rp.set_url("https://wigan.gov.uk/robots.txt")
rp.read()

url = "https://wigan.gov.uk/some-page"
user_agent = 'MyWebScraper'
can_fetch = rp.can_fetch(user_agent, url)

print("Can fetch:", can_fetch)

Can fetch: True


In [8]:
# Initialize a RecursiveUrlLoader to fetch and process web pages recursively.
loader = RecursiveUrlLoader(
    url="https://wigan.gov.uk",  # Set the starting URL for recursive loading
    base_url="https://wigan.gov.uk",  # Set the base URL to resolve relative links
    max_depth=2,  # Set maximum number of levels to recurse into URL structure
    use_async=True,  # Enable asynchronous operations to improve performance
    timeout=300,  # Timeout after 300 seconds for each request
    check_response_status=True,  # Only process requests that return successful HTTP statuses
    prevent_outside=True,  # Restrict navigation to URLs outside the base domain.
    # extractor=None,  # Define a custom function to extract content from pages (not used).
    # metadata_extractor=None,  # Define a function to extract metadata (not used).
    # exclude_dirs=(),  # Specify directory paths to exclude from processing (not used).
    # continue_on_failure=True,  # Continue processing other URLs if one fails (not used).
    # base_url=None,  # Optionally override the base URL for relative link resolution (not used).
)

# Load webpage
docs = await loader.aload()

# Open a file to write the processed documents
with open('docs.txt', 'w') as f:
    # Iterate through each document
    for doc in docs:
        # Write each document to the file followed by a separator line
        f.writelines(str(doc))
        f.writelines('\n--------------------------------------------------------------\n')

In [9]:
# Initialize the Markdown transformer
md = MarkdownifyTransformer()

# Transform loaded documents to Markdown
md_docs = md.transform_documents(docs)

# Open a file to append the processed documents
with open('docs.txt', 'a') as f:
    # Iterate through each document
    for doc in md_docs:
        # Write each document to the file followed by a separator line
        f.writelines(str(doc))
        f.writelines('\n--------------------------------------------------------------\n')


In [10]:
md = MarkdownifyTransformer(strip=["h1", "a"])

# Transform loaded documents to Markdown
md_docs_2 = md.transform_documents(docs)

# Open a file to append the processed documents
with open('docs.txt', 'a') as f:
    # Iterate through each document
    for doc in md_docs_2:
        # Write each document to the file followed by a separator line
        f.writelines(str(doc))
        f.writelines('\n--------------------------------------------------------------\n')

In [19]:
def bs4_extractor(html: str) -> str:
    """
    Extracts and cleans text from HTML content.
    
    This function uses BeautifulSoup to parse HTML content, removes elements that don't 
    contain relevant text, then extracts and cleans the text to improve readability for downstream LLMs.
    
    Parameters:
        html (str): A string containing HTML content.
        
    Returns:
        str: The cleaned and extracted text from the HTML.
    """
    
    # Initialise a BeautifulSoup object with lxml parser
    soup = BeautifulSoup(html, "lxml")

    # Remove unnecessary elements
    for tag in soup(["header", "nav", "footer", "aside", "script", "style", "noscript"]):
        tag.decompose()  # Remove the tag from the soup object

    # Extract text from the modified HTML tree, using newline as a separator
    text = soup.get_text(separator="\n")
    
    # Clean up the text by replacing multiple newlines with a single pair of newlines
    cleaned_text = re.sub(r"\n\n+", "\n\n", text).strip()
    
    return cleaned_text

In [None]:
def bs4_extractor_2(html: str) -> str:
    """
    Extracts and cleans text from HTML content more extensively.
    
    This function uses BeautifulSoup to parse HTML content, removes elements that 
    don't contain relevant text, extracts attributes from multimedia elements, and
    cleans the text to improve readability for downstream LLMs.
    
    Parameters:
        html (str): A string containing HTML content.
        
    Returns:
        str: The cleaned, extracted, and summarized text from the HTML.
    """
    
    # Initialize a BeautifulSoup object with lxml parser
    soup = BeautifulSoup(html, "lxml")

    # Remove script, style, and other irrelevant elements
    for tag in soup(["script", "style", "noscript", "iframe", "footer", "header", "nav", "aside"]):
        tag.decompose()
    
    # Extract useful attributes from images, videos, and audios if present
    for img in soup.find_all("img"):
        alt_text = img.get('alt')
        if alt_text:
            img.replace_with(f"Image: {alt_text}\n")
        else:
            img.decompose()

    # Remove unnecessary attributes from all elements
    for tag in soup.find_all(True):
        tag.attrs = {}
    
    # Extract text, ensuring logical separation of sections and headers
    text = []
    for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'article', 'section']):
        elem_text = elem.get_text(separator="\n", strip=True)
        if elem_text:
            if elem.name.startswith('h'):
                elem_text = f"\n{elem_text}\n"  # Add newlines around headers for clarity
            text.append(elem_text)
    
    # Combine text and replace multiple newlines and whitespace
    combined_text = "\n".join(text)
    cleaned_text = re.sub(r'\s*\n+\s*', '\n', combined_text).strip()

    return cleaned_text

In [26]:
async def main():
    """
    Main asynchronous function to process and convert documents 
    to Markdown format, then save to a file.
    """
    # Initialize the Markdown transformer
    md = MarkdownifyTransformer()
    
    # Asynchronously load documents
    docs = await loader.aload()
    
    # Transform loaded documents to Markdown
    docs = md.transform_documents(docs)
    
    # Print the number of documents processed
    print("Number of documents processed:", len(docs))
    # Print the second document as a sample output
    print("Sample document:", '\n', docs[0])
    
    # Open a file to write the processed documents
    with open('docs.txt', 'w') as f:
        # Iterate through each document
        for doc in docs:
            # Write each document to the file followed by a separator line
            f.writelines(str(doc))
            f.writelines('\n--------------------------------------------------------------\n')


In [27]:
await main()


Number of documents processed: 1
Sample document: 
 page_content='Wigan Council \- Residents

Browser does not support script.

Browser does not support script.

[Skip to content](#L3_MainContentPlaceholder)

  [Login](https://forms.wigan.gov.uk) 

[Wigan Council](/index.aspx)

Search

Search

* [Resident](/index.aspx)
* [Business](/Business/index.aspx "index")
* [Council](/Council/index.aspx "index")
* [MyAccount](/MyAccount/My-Account.aspx)
* [Visit Us](https://www.visitwigan.com/index.aspx "Visit Wigan")

[Emergency contact](https://www.wigan.gov.uk/Resident/Crime-Emergencies/Emergencies.aspx "Emergency contact")

# Wigan Council \- Residents

* [Bins and recycling](#tabs-services-1)
* [Planning and building control](#tabs-services-2)
* [Council Tax](#tabs-services-3)
* [Jobs and skills](#tabs-services-4)

* [Bin collection dates](https://apps.wigan.gov.uk/MyNeighbourhood/ "Bin collection dates")
* [Report a missed bin](/Resident/Bins-Recycling/Missed-bins.aspx "Report a missed bin"

In [9]:
# Entry point of the script
if __name__ == '__main__':
    # Run the asynchronous main function
    asyncio.run(main())

RuntimeError: asyncio.run() cannot be called from a running event loop