In [318]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import json
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import BeautifulSoupTransformer, Html2TextTransformer
import re
from tqdm import tqdm
import pandas as pd
import os
import pandas as pd
import boto3

In [320]:
def extract_urls(base_url, text):
    """Return the unique http(s) URLs that appear inside parentheses in ``text``.

    Every ``(...)`` group in ``text`` is treated as a candidate link. Candidates
    that start with ``/`` are resolved against ``base_url``; anything that does
    not start with ``http`` after that step is discarded.

    Args:
        base_url: URL used to resolve candidates that start with ``/``.
        text: String to scan. The scrape functions in this notebook pass the
            output of ``BeautifulSoupTransformer.extract_tags`` (presumably
            rendering each anchor as ``"label (href)"`` -- TODO confirm).

    Returns:
        A list of unique URLs in first-seen order.
    """
    # Non-greedy: capture whatever sits between "(" and the next ")".
    candidates = re.findall(r'\((.*?)\)', text)

    urls = []
    for candidate in candidates:
        # Root-relative links ("/path") need the site prefix from base_url.
        if candidate.startswith('/'):
            candidate = urljoin(base_url, candidate)

        # Drop parenthesised prose, mailto:, anchors and other non-http content.
        if candidate.startswith('http'):
            urls.append(candidate)

    # dict.fromkeys de-duplicates while keeping first-seen order, so repeated
    # runs over the same page give the same list (set() ordering is arbitrary).
    return list(dict.fromkeys(urls))


def check_if_link_in_base_domain(base_url, link):
    """Check whether ``link`` lives under ``base_url``.

    Args:
        base_url: URL prefix that in-domain links must start with.
        link: Candidate link, absolute or relative.

    Returns:
        ``link`` itself when it starts with ``base_url`` and points at the same
        host; ``base_url`` concatenated with ``link`` when ``link`` does not
        start with ``http``; otherwise ``False``.
    """
    if link.startswith(base_url):
        # A plain prefix match would also accept look-alike hosts such as
        # "https://site.org.evil.com/" or "https://site.org@evil.com/" for the
        # base "https://site.org", so compare the parsed hostnames as well.
        try:
            base_host = urlparse(base_url).hostname
            link_host = urlparse(link).hostname
        except ValueError:
            # Malformed URL (e.g. unbalanced IPv6 brackets): treat as foreign.
            return False

        # base_host is None when base_url carries no scheme/netloc; in that
        # case the prefix match is the only check available.
        if base_host and link_host != base_host:
            return False
        return link

    if not link.startswith("http"):
        # Links without an http prefix are prefixed with base_url as-is.
        return f"{base_url}{link}"

    return False

### New functions

In [321]:
import html2text

In [322]:
# NEW FUNCTION - SINGLE PAGE

def scrape_specific_page(url):
    """Fetch a single page and return its main-section links and markdown.

    Args:
        url: Address of the page to load.

    Returns:
        A ``(links_list, site_to_html)`` tuple: the URLs found by
        ``extract_urls`` in the anchors of the main section, and the main
        section converted to text by ``html2text``.
    """
    # Only the first loaded document is used.
    fetched_docs = AsyncHtmlLoader(url).load()
    # NOTE(review): no parser is named here, so bs4 chooses one itself.
    parsed_page = BeautifulSoup(fetched_docs[0].page_content)

    # Main section lookup: a <div> matched by id first, then by class, and
    # finally the whole document when neither is present.
    main_names = ["main-content", "cads-main-content"]
    content_root = (
        parsed_page.find("div", id=main_names)
        or parsed_page.find("div", class_=main_names)
        or parsed_page
    )
    content_html = str(content_root)

    # Anchor tags only, then pull the parenthesised URLs out of that text.
    anchor_text = BeautifulSoupTransformer().extract_tags(content_html, ['a'])
    found_links = extract_urls(url, anchor_text)

    page_markdown = html2text.html2text(content_html)

    return found_links, page_markdown

In [323]:
# NEW FUNCTION - MULTIPLE PAGES

def scrape_url_list(base_url, url_list, cookies=None):
    """Scrape several pages and collect their markdown plus in-domain links.

    Args:
        base_url: URL prefix used to keep only links accepted by
            ``check_if_link_in_base_domain``.
        url_list: URLs to load; passed straight to ``AsyncHtmlLoader``. When it
            is exactly ``[base_url]`` (or the bare ``base_url`` string) the
            whole page is used instead of only the main section.
        cookies: Optional dict passed to ``AsyncHtmlLoader`` as
            ``header_template``.

    Returns:
        A ``(unique_pages, links)`` tuple. ``unique_pages`` is a DataFrame with
        columns ``source_url`` and ``markdown``, one row per distinct
        ``source_url``. ``links`` is a list of unique in-domain links in
        first-seen order.
    """
    bs_transformer = BeautifulSoupTransformer()

    if cookies:
        loader = AsyncHtmlLoader(url_list, header_template=cookies)
    else:
        loader = AsyncHtmlLoader(url_list)
    docs = loader.load()

    # For the base url (homepage) the whole page is used to get all links.
    # Loop-invariant, so it is evaluated once up front.
    homepage_only = url_list == [base_url] or url_list == base_url
    main_names = ["main-content", "cads-main-content"]

    pages = []
    links = []

    for page in tqdm(docs):

        current_url = page.metadata['source']

        soup = BeautifulSoup(page.page_content)

        if homepage_only:
            main_section_html = soup
        else:
            # <div> matched by id first, then by class, else the whole page.
            main_section_html = (
                soup.find("div", id=main_names)
                or soup.find("div", class_=main_names)
                or soup
            )
        main_section_str = str(main_section_html)

        # Links on the main section of the page, resolved against this page.
        extracted_links = bs_transformer.extract_tags(main_section_str, ['a'])
        current_page_links = extract_urls(current_url, extracted_links)

        # Keep only links that fall under base_url.
        links.extend(
            link for link in current_page_links
            if check_if_link_in_base_domain(base_url, link)
        )

        pages.append({
            "source_url": current_url,
            "markdown": html2text.html2text(main_section_str),
        })

    # De-duplicate once, after the loop, keeping first-seen order so the result
    # is stable between runs.
    links = list(dict.fromkeys(links))

    # Explicit columns keep the frame well-formed when nothing was loaded;
    # otherwise drop_duplicates(subset=['source_url']) raises KeyError.
    document_df = pd.DataFrame(pages, columns=["source_url", "markdown"])

    unique_pages = document_df.drop_duplicates(subset=['source_url']).reset_index(drop=True)

    # Reports the number of loaded documents, before de-duplication.
    print(f"Number of pages scraped: {len(pages)}")

    return unique_pages, links

In [324]:
# Scrape two public pages; no cookies are sent.
df, links = scrape_url_list(
    base_url="https://www.citizensadvice.org.uk",
    url_list=[
        "https://www.citizensadvice.org.uk/benefits/benefits-introduction/what-benefits-can-i-get/",
        "https://www.citizensadvice.org.uk/decision-trees/scams/",
    ],
    cookies=None,
)

Fetching pages: 100%|##########| 2/2 [00:00<00:00,  6.45it/s]
100%|██████████| 2/2 [00:00<00:00, 25.50it/s]

Number of pages scraped: 2





In [325]:
url = "https://www.citizensadvice.org.uk/advisernet/consumer/buying-or-repairing-a-car/buying-a-used-car/"

# The login cookie is a credential, so it must not be committed with the
# notebook. Export CITIZENS_ADVICE_LOGIN_COOKIE before starting the kernel; a
# missing variable raises KeyError here instead of sending an empty cookie.
# Any cookie value that was previously saved in this notebook should be
# treated as leaked and invalidated.
cookie_args = {'Cookie': '.CitizensAdviceLogin=' + os.environ["CITIZENS_ADVICE_LOGIN_COOKIE"]}

df, links = scrape_url_list(base_url="https://www.citizensadvice.org.uk/advisernet/", url_list=url, cookies=cookie_args)

Fetching pages: 100%|##########| 1/1 [00:00<00:00,  3.17it/s]
100%|██████████| 1/1 [00:00<00:00, 17.46it/s]

Number of pages scraped: 1





In [326]:
# Display the links returned by the most recent scrape_url_list call.
links

['https://www.citizensadvice.org.uk/advisernet/consumer/buying-or-repairing-a-car/the-car-you-bought-is-still-on-hire-purchase/',
 'https://www.citizensadvice.org.uk/advisernet/consumer/#h-buying-or-repairing-a-car',
 'https://www.citizensadvice.org.uk/advisernet/consumer/somethings-gone-wrong-with-a-purchase/getting-your-money-back-if-you-paid-by-card-or-paypal/',
 'https://www.citizensadvice.org.uk/advisernet/consumer/get-more-help/if-you-need-more-help/',
 'https://www.citizensadvice.org.uk/advisernet/consumer/buying-or-repairing-a-car/motor-industry-trade-associations-and-useful-contacts/',
 'https://www.citizensadvice.org.uk/advisernet/consumer/buying-or-repairing-a-car/problems-with-a-used-car/',
 'https://www.citizensadvice.org.uk/advisernet/consumer/buying-or-repairing-a-car/problems-with-a-car-repair/']