In [1]:
from bs4 import BeautifulSoup
import requests
from collections import OrderedDict
from urllib.parse import urljoin, urlparse
import datetime
from tika import parser
import uuid
import os
from readability import Document
import re
import pandas as pd

In [9]:
start_url = "https://www.england.nhs.uk/coronavirus/"

whitelist = [".gov.uk", ".nhs.uk"]
# only fetch URLs from these domains

depth = 2

In [10]:
encountered = set()

In [11]:
def sstrip(raw):
    if not isinstance(raw, str):
        return ""
    st = re.sub("\s\s+", "\n", raw)
    return st.strip()
    

In [12]:
def get_file(url, out_path):
    r = requests.get(url, stream=True) 
    with open(out_path, "wb") as f: 
        for chunk in r.iter_content(chunk_size=1024): 
             if chunk: 
                 f.write(chunk)

In [13]:
def is_html(parsed):
    try:
        assert parsed['metadata']['Content-Type'].startswith('text/html')
        return True
    except:
        return False

In [14]:
dataset = []

stack = set([(start_url, "NHS Coronavirus Startpage", 0)])

while stack:
        
    url, link_title, level = stack.pop()    
    print(url)    
    stem, ext = os.path.splitext(url)    
    filename = f"{uuid.uuid4()}{ext}"    
    get_file(url, os.path.join('answers', filename))    
    parsed = parser.from_file(os.path.join('answers', filename))
        
    dataset.append(OrderedDict([("date", datetime.datetime.now().strftime("%Y-%m-%d")),
                                ("url", url),
                                ("title", parsed.get('title', '')),
                                ("format", parsed['metadata']['Content-Type']),
                                ("text", sstrip(parsed['content']))]))
    
    
    if is_html(parsed):
        
        with open(os.path.join('answers',filename), 'r') as f:
            soup = BeautifulSoup(f)
        links = soup.find_all('a')


        for link in links:

            href = link.get('href')            
            href = urljoin(url, href) # calculate relative references

            domain = urlparse(href).netloc
            if not any((domain.endswith(urlend) for urlend in whitelist)):
                continue


            if href is None:
                continue
            elif href in encountered:
                continue

            encountered.add(href)



            if href.startswith("#"):
                continue
            else:            
                if level < depth:
                    stack.add((href, link.get('text'), level+1))

#         # save the text

#         parsed = parser.from_buffer(raw_html)
            
        
#         dataset.append(OrderedDict([("date", datetime.datetime.now().strftime("%Y-%m-%d")),
#                                     ("url", url),
#                                     ("title", sstrip(parsed.get('title', ''))),
#                                     ("format", "html"),
#                                     ("text", sstrip(parsed['content']))]))


https://www.england.nhs.uk/coronavirus/
https://www.gov.uk/government/publications/covid-19-stay-at-home-guidance/stay-at-home-guidance-for-people-with-confirmed-or-possible-coronavirus-covid-19-infection
https://www.england.nhs.uk/privacy-policy/
https://www.gov.uk/browse/education
https://www.gov.uk/help
https://www.gov.uk/search/services
https://www.england.nhs.uk/coronavirus/publication/letter-cancer-alliance-information-on-managing-cancer-referrals/
https://www.england.nhs.uk/coronavirus/feedback/
https://www.england.nhs.uk/coronavirus/publication/?filter-publication=letter
https://www.england.nhs.uk/ourwork/commissioning2/
https://www.gov.uk/search/news-and-communications
https://www.gov.uk/government/publications/covid-19-stay-at-home-guidance/stay-at-home-guidance-for-people-with-confirmed-or-possible-coronavirus-covid-19-infection#attachment-4077140-accessibility-request
http://www.england.nhs.uk/coronavirus/
https://www.england.nhs.uk/
https://www.england.nhs.uk/privacy-polic

https://www.nhs.uk/conditions/pregnancy-and-baby/
https://improvement.nhs.uk/home/
https://www.nhs.uk/live-well/exercise/couch-to-5k-week-by-week/
https://www.nhs.uk/live-well/
https://www.gov.uk/search/policy-papers-and-consultations?content_store_document_type%5B%5D=open_consultations&amp;content_store_document_type%5B%5D=closed_consultations
https://www.gov.uk/browse/births-deaths-marriages
https://www.nhs.uk/live-well/eat-well/
https://www.england.nhs.uk/coronavirus/publication/?filter-publication=guidance
https://www.nhs.uk/accessibility/
https://www.gov.uk/government/publications/covid-19-stay-at-home-guidance/stay-at-home-guidance-for-people-with-confirmed-or-possible-coronavirus-covid-19-infection#attachment-4077133-accessibility-request
https://www.england.nhs.uk/coronavirus/wp-content/uploads/sites/52/2020/03/visitor-guidance-16-march-2020.pdf
https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/874002/Stay_at_home_guidance_for_house

https://www.england.nhs.uk/coronavirus/wp-content/uploads/sites/52/2020/03/letter-from-prof-powis-to-ros-and-mds-19-march-2020.pdf
https://www.gov.uk/guidance/wuhan-novel-coronavirus-information-for-the-public#what-to-do-if-you-have-symptoms
https://www.nhs.uk/our-policies/governance-of-the-nhs-website/
https://www.health-ni.gov.uk/Covid-19-returning-professionals
https://www.england.nhs.uk/coronavirus/primary-care/community-pharmacy/
https://www.nhs.uk/oneyou/every-mind-matters/your-mind-plan-quiz/
http://www.nationalarchives.gov.uk/doc/open-government-licence/open-government-licence.htm
https://www.nhs.uk/our-policies/requests-for-research-support/
https://www.gov.uk/guidance/wuhan-novel-coronavirus-information-for-the-public#history
https://www.gov.uk/guidance/wuhan-novel-coronavirus-information-for-the-public#contents
https://www.nhs.uk/our-policies/linking-from-the-nhs-website/
https://www.england.nhs.uk/coronavirus/wp-content/uploads/sites/52/2020/03/Adaptations-to-the-NHS-Diabet

In [15]:
df = pd.DataFrame(dataset)

In [None]:
df.to_csv('data/')

In [16]:
df.format.values.tolist()

['text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'application/pdf',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'text/html; charset=UTF-8',
 'application/pdf',
 'text/html; charset=UTF-8',
 'application/pdf',
 'application/pdf',
 'application/pdf',
 

In [None]:
import tika


In [34]:
is_html(parsed )

True

In [8]:
parsed.
    

{'metadata': {'Content-Encoding': 'UTF-8',
  'Content-Language': 'en-US',
  'Content-Type': 'text/html; charset=UTF-8',
  'X-Parsed-By': ['org.apache.tika.parser.DefaultParser',
   'org.apache.tika.parser.html.HtmlParser'],
  'X-TIKA:content_handler': 'ToTextContentHandler',
  'X-TIKA:embedded_depth': '0',
  'X-TIKA:parse_time_millis': '2',
  'author': 'Coronavirus',
  'cleartype': 'on',
  'dc:title': 'Coronavirus',
  'description': 'Health and high quality care for all, now and for future generations',
  'resourceName': "b'c72bb3ae-fff4-4ecc-8d08-6649903ec4ca'",
  'title': 'Coronavirus',
  'viewport': 'width=device-width, initial-scale=1.0'},
 'content': "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCoronavirus\n\n\n    \n        Skip to main content\n    \n\n\n    \n\n    \n        Cookies on the NHS England and NHS Improvement website\n\n\n        \n                        We’ve put some small files called cookies on your device to make our site work.\n\nWe’d also like to

In [11]:
is_html(parsed)

True