# Load web content from a url for clean up

This notebook takes a starting URL for a section page, looks for the summary links on that page and then crawls the content they point to.

In [1]:
# import necessary libraries 
from bs4 import BeautifulSoup
import requests
import re 
from pathlib import Path

## Function to load the start page

In [2]:
def get_soup_from_url(url: str, timeout: float = 30) -> BeautifulSoup:
    """Download a page and parse it into a BeautifulSoup tree.

    Prints the text of the page's first <h1> as a progress indicator, or a
    placeholder naming the URL when the page has no <h1>.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The parsed document.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # soup.h1 is None when the page has no <h1>, so guard before reading .text.
    heading = soup.h1
    print(heading.text if heading is not None else f'(no h1 found) {url}')
    return soup

## Function to get the summary links web part

In [3]:
def get_summary_links(soup: BeautifulSoup) -> list:
    """Return the summary-link lists found in a parsed page.

    Looks inside the first <div class="slm-layout-main groupmarker"> for
    every <ul class="list list-nav">.

    Args:
        soup: Parsed page to search.

    Returns:
        The matching <ul> tags, or an empty list when the page has no
        summary-links container.
    """
    # Get the outer DIV tag (first match only, as before)
    summary_links_container = soup.find('div', class_='slm-layout-main groupmarker')
    if summary_links_container is None:
        # No container on this page: return nothing instead of raising IndexError.
        return []

    # Find each of the unordered lists
    return summary_links_container.find_all('ul', class_='list list-nav')

## Process all the content
Look for section pages but, for now, skip them (controlled by the `follow_section` flag).

In [4]:
# Flag for following second-level section pages; currently switched off
follow_section = False
# Landing page of the root section where the crawl starts
url_root = 'https://www.leeds.gov.uk/antisocial-behaviour-and-crime'

### Function to write scraped HTML to file

In [14]:
def save_content(soup: BeautifulSoup, html_source_route_dir: Path, web_page_path: str) -> None:
    """Write a page's prettified HTML beneath a local directory.

    The file location mirrors the page's URL path: 'section/page' is written
    to <html_source_route_dir>/section/page.html. Missing parent folders are
    created and an existing file is left untouched. Failures are printed
    rather than raised.

    Args:
        soup: Parsed page whose HTML is saved.
        html_source_route_dir: Directory under which the file tree is created.
        web_page_path: URL path of the page, using '/' separators.
    """
    # Bound before the try block so the error report below can always use it.
    full_path = None
    try:
        # pathlib accepts '/' separators on every platform, so they are kept
        # as-is (swapping in backslashes only works on Windows). Leading
        # slashes are dropped because an absolute right-hand side would
        # discard the base directory when joined.
        relative_path = web_page_path.lstrip('/') + '.html'

        # create a Path object
        full_path = html_source_route_dir.resolve() / relative_path

        # create the file's parents if they don't exist
        full_path.parent.mkdir(parents=True, exist_ok=True)

        # save the file, never overwriting an earlier download
        if not full_path.exists():
            full_path.write_text(soup.prettify(), encoding='utf-8')

    except Exception as ex:
        # Fall back to the raw URL path when the failure came before the
        # target path could be built.
        target = full_path if full_path is not None else web_page_path
        print(f'something went wrong saving file {target}', ex)
        print('--------------')


In [None]:
# Manual check of save_content against a single known page.

# relative folder to save html source code.
html_source_root_dir = Path(Path().parent.resolve()) / '..' / 'test'
wpp = '/antisocial-behaviour-and-crime/making-a-noise-complaint'

# save_content needs the parsed page as its first argument, so fetch it first.
test_soup = get_soup_from_url('https://www.leeds.gov.uk' + wpp)

# strip the leading slash so the path stays inside html_source_root_dir
save_content(test_soup, html_source_root_dir, wpp.replace('/', '', 1))

### Process the summary links list

In [None]:
from urllib.parse import urljoin, urlparse

# Get the summary links from the landing page
soup = get_soup_from_url(url_root)
summary_links = get_summary_links(soup)

# relative folder to save html source code.
html_source_root_dir = Path(Path().parent.resolve()) / '..' / 'test'

# Only pages on the same host as the landing page are crawled.
root_host = urlparse(url_root).netloc

# For each unordered list
for list_of_links in summary_links:

    # href=True skips anchors that carry no href attribute at all
    for a in list_of_links.find_all('a', href=True):
        link = a['href']

        # urljoin handles root-relative, page-relative and absolute hrefs,
        # where plain string concatenation only works for root-relative ones.
        full_url = urljoin(url_root, link)
        parsed_url = urlparse(full_url)

        # mailto:, javascript: and off-site links are not pages to crawl
        if parsed_url.scheme not in ('http', 'https') or parsed_url.netloc != root_host:
            print(f'Not a page on {root_host} - skipping {link}')
            continue

        # scrape the content; one unreachable page should not end the crawl
        try:
            new_soup = get_soup_from_url(full_url)
        except requests.RequestException as ex:
            print(f'Could not load {full_url} - skipping', ex)
            continue

        # process the summary pages differently: they contain the
        # summary-links container that content pages lack
        is_summary = bool(new_soup.find_all('div', class_='slm-layout-main groupmarker'))

        if is_summary:
            if follow_section:
                print('section - doing nothing for now')
            else:
                print('Not processing sections - skipping')
            continue

        # use the URL path as the basis of the folder structure; the leading
        # slash is stripped or Path would resolve to the drive/filesystem root
        fpath = parsed_url.path.lstrip('/')
        save_content(new_soup, html_source_root_dir, fpath)
