# Load web content from a url for clean up

This notebook takes a starting URL that points to a section page, finds the summary links on that page, and then crawls the content they link to.

In [10]:
# import necessary libraries
import os
import re
from os import path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

## Function to load the start page

In [11]:
def get_soup_from_url(url: str, timeout: float = 30.0):
    """Fetch ``url`` and return its HTML parsed as a BeautifulSoup document.

    The text of the page's first ``<h1>`` is printed as a progress indicator.
    If the page has no ``<h1>``, a placeholder containing the URL is printed
    instead of failing.

    Args:
        url: Absolute URL of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled server.

    Returns:
        The parsed page as a ``BeautifulSoup`` object (``html.parser`` backend).

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error responses rather than parsing an error page as content.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # soup.h1 is None when the document has no <h1>; guard before reading .text.
    heading = soup.h1
    if heading is not None:
        print(heading.text)
    else:
        print('(no <h1> found: ' + url + ')')
    return soup

## Function to get the summary links web part

In [12]:
def get_summary_links(soup: BeautifulSoup) :
    """Return the summary-link lists found on a section page.

    Looks for the first ``<div class="slm-layout-main groupmarker">`` in the
    page and collects every ``<ul class="list list-nav">`` inside it.

    Args:
        soup: Parsed page to search.

    Returns:
        The matching ``<ul>`` tags, in document order. An empty list is
        returned when the page has no summary-links container, so callers can
        iterate over the result without a separate existence check.
    """
    # Get the outer DIV tag (None when the page is not a section page)
    summary_links_container = soup.find('div', class_='slm-layout-main groupmarker')
    if summary_links_container is None:
        return []
    # Find each of the unordered lists
    return summary_links_container.find_all('ul', class_='list list-nav')

## Process all the content
Visit every summary link. Links that lead to another section page are detected but, for now, skipped (controlled by the `follow_section` flag).

In [13]:
# Root section page: the crawl starts here and collects this page's summary links
url_root = 'https://www.leeds.gov.uk/antisocial-behaviour-and-crime'
# Whether to descend into second-level section pages found among the links.
# NOTE: following sections is not implemented yet; with either value the crawl
# loop only prints a message for section pages and does not save them.
follow_section = False

In [24]:
def save_content(output_dir: str, href: str, url: str, timeout: float = 30.0):
    """Download ``url`` and save its HTML as ``../<output_dir>/<name>.html``.

    ``<name>`` is the last non-empty path segment of ``href``; a trailing
    slash is ignored, and ``index`` is used when ``href`` has no segment at
    all (for example ``'/'``). The destination path is printed before the
    file is written.

    Args:
        output_dir: Directory, relative to the parent of the working
            directory, that receives the file. It is created if missing.
        href: Link as it appeared in the page; only used to derive the
            filename.
        url: Absolute URL to download.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status, so
            that an error page is never stored as content.
        requests.RequestException: On connection errors or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    html = response.text

    # Strip trailing slashes first so '/a/b/' yields 'b' rather than ''.
    filename = href.rstrip('/').rsplit('/', 1)[-1] or 'index'

    target_dir = path.join('..', output_dir)
    # open() does not create missing directories; make sure the target exists.
    os.makedirs(target_dir, exist_ok=True)

    destination = path.join(target_dir, filename + '.html')
    print(destination)
    with open(destination, "w", encoding='utf-8') as file:
        file.write(html)

In [25]:
# Crawl the root section: fetch it, locate its summary-link lists, then visit
# every link and save the pages that are plain content (not sub-sections).
soup = get_soup_from_url(url_root)
summary_links = get_summary_links(soup)

if not summary_links:
    print('No summary links found at ' + url_root)

# For each unordered list
for list_of_links in summary_links:

    print('Gathering content')
    print('---------------------------')

    # href=True skips anchors that have no href attribute, which would
    # otherwise raise a KeyError when the attribute is read.
    for a in list_of_links.find_all('a', href=True):
        link = a['href']
        # urljoin resolves site-relative links against the root URL and leaves
        # already-absolute links intact; plain concatenation would mangle them.
        full_url = urljoin(url_root, link)

        new_soup = get_soup_from_url(full_url)
        # A page that itself contains the summary-links container is a section page.
        is_summary = new_soup.find('div', class_='slm-layout-main groupmarker') is not None
        if is_summary:
            if follow_section:
                print('section - doing nothing for now')
            else:
                print('Not processing sections - skipping')
        else:
            save_content('test', link, full_url)


Antisocial behaviour and crime
Gathering content
---------------------------
Making a noise complaint
..\test\making-a-noise-complaint.html
Report antisocial behaviour
..\test\report-antisocial-behaviour.html
Report fly tipping
..\test\report-fly-tipping.html
Domestic violence and abuse
Not processing sections - skipping
Report graffiti
..\test\report-graffiti.html
Report a road, path, or cycle lane that needs cleaning
..\test\report-a-road-that-needs-cleaning.html
Report a hate crime or incident
..\test\report-a-hate-crime-or-incident.html
Report discarded needles or drug related waste
..\test\report-discarded-needles-or-drug-related-waste.html
Requesting CCTV
..\test\requesting-cctv.html
Report antisocial or dangerous driving
..\test\report-antisocial-or-dangerous-driving.html
Public Spaces Protection Orders
..\test\public-spaces-protection-orders.html
Report an unauthorised Gypsy or Traveller site
..\test\report-an-unauthorised-gypsy-or-traveller-site.html
Safer Leeds partnership
..