In [15]:
!pip install -qU langchain-community beautifulsoup4
!pip install markdownify
%pip install psycopg2 

from langchain_community.document_loaders import RecursiveUrlLoader # Loads documents recrusively from URLS
from langchain_community.document_transformers import MarkdownifyTransformer # Converts documents to Markdown
import asyncio # Library for writing concurrent code using async/await syntax
from bs4 import BeautifulSoup # Library for parsing HTML and XML docs
import re # Regular expression matching operations
import psycopg2

import logging # Library for configuring and implementing logging

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Configure logging to track what happens during execution of the following code
# and discover issues. 

logging.basicConfig(
    level=logging.INFO,   # Set minimum logging level to INFO (logs this level and above will be captured)
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',  # Set the log message format
    filename='app.log',  # Define filename where logs written
    filemode='a'  # Set mode to append ('w' for overwrite)
)

# Get the logger named 'ExampleLogger' for tracking application events.
logger = logging.getLogger('ExampleLogger')

# Log an example informational message.
logger.info('This is an informational message.')

# Read and print the log file to verify the logging output.
with open('app.log', 'r') as file:
    content = file.read()
    print(content)


2024-09-04 16:14:17,732 - ExampleLogger - INFO - This is an informational message.
2024-09-04 16:18:18,874 - ExampleLogger - INFO - This is an informational message.
2024-09-04 19:04:21,000 - ExampleLogger - INFO - This is an informational message.
2024-09-04 19:49:25,168 - ExampleLogger - INFO - This is an informational message.



In [7]:
# Initialize a RecursiveUrlLoader to fetch and process web pages recursively.
loader = RecursiveUrlLoader(
    url="https://wigan.gov.uk",  # Set the starting URL for recursive loading
    base_url="https://wigan.gov.uk",  # Set the base URL to resolve relative links
    max_depth=2,  # Set maximum number of levels to recurse into URL structure
    use_async=True,  # Enable asynchronous operations to improve performance
    timeout=300,  # Timeout after 300 seconds for each request
    check_response_status=True,  # Only process requests that return successful HTTP statuses
    prevent_outside=True,  # Restrict navigation to URLs outside the base domain.
    # extractor=None,  # Define a custom function to extract content from pages (not used).
    # metadata_extractor=None,  # Define a function to extract metadata (not used).
    # exclude_dirs=(),  # Specify directory paths to exclude from processing (not used).
    # continue_on_failure=True,  # Continue processing other URLs if one fails (not used).
    # base_url=None,  # Optionally override the base URL for relative link resolution (not used).
)

In [5]:
def bs4_extractor(html: str) -> str:
    """
    Extracts and cleans text from HTML content.
    
    This function uses BeautifulSoup to parse HTML content, removes elements that don't 
    contain relevant text, then extracts and cleans the text to improve readability for downstream LLMs.
    
    Parameters:
        html (str): A string containing HTML content.
        
    Returns:
        str: The cleaned and extracted text from the HTML.
    """
    
    # Initialise a BeautifulSoup object with lxml parser
    soup = BeautifulSoup(html, "lxml")

    # Remove unnecessary elements
    for tag in soup(["header", "nav", "footer", "aside", "script", "style", "noscript"]):
        tag.decompose()  # Remove the tag from the soup object

    # Extract text from the modified HTML tree, using newline as a separator
    text = soup.get_text(separator="\n")
    
    # Clean up the text by replacing multiple newlines with a single pair of newlines
    cleaned_text = re.sub(r"\n\n+", "\n\n", text).strip()
    
    return cleaned_text

In [8]:
async def main():
    """
    Main asynchronous function to process and convert documents 
    to Markdown format, then save to a file.
    """
    # Initialize the Markdown transformer
    md = MarkdownifyTransformer()
    
    # Asynchronously load documents
    docs = await loader.aload()
    
    # Transform loaded documents to Markdown
    docs = md.transform_documents(docs)
    
    # Print the number of documents processed
    print("Number of documents processed:", len(docs))
    # Print the second document as a sample output
    print("Sample document:", docs[1])
    
    # Open a file to write the processed documents
    with open('docs.txt', 'w') as f:
        # Iterate through each document
        for doc in docs:
            # Write each document to the file followed by a separator line
            f.writelines(str(doc))
            f.writelines('\n--------------------------------------------------------------\n')


In [12]:
await main()


  for attr in list(attrs.keys()):


Number of documents processed: 121
Sample document: page_content='Adult Social Care and Health

Browser does not support script.

Browser does not support script.

[Skip to content](#L3_MainContentPlaceholder)

  [Login](https://forms.wigan.gov.uk) 

[Wigan Council](/index.aspx)

Search

Search

* [Resident](/index.aspx)
* [Business](/Business/index.aspx "index")
* [Council](/Council/index.aspx "index")
* [MyAccount](/MyAccount/My-Account.aspx)
* [Visit Us](https://www.visitwigan.com/index.aspx "Visit Wigan")

[Emergency contact](https://www.wigan.gov.uk/Resident/Crime-Emergencies/Emergencies.aspx "Emergency contact")

[Home](/index.aspx "Go to Home from here")/[Resident](/Resident/index.aspx "Go to Resident from here")/[Health and social care](/Resident/Health-Social-Care/index.aspx "Go to Health and social care from here")/Adults

# Adult Social Care and Health

![What is adult social care?](/Images/Resident/Health-Social-Care/Adults/Adult-Social-Care-and-Health-banner.png)## Banner


In [9]:
# Entry point of the script
if __name__ == '__main__':
    # Run the asynchronous main function
    asyncio.run(main())

RuntimeError: asyncio.run() cannot be called from a running event loop