In [1]:
from bs4 import BeautifulSoup
import requests
from collections import OrderedDict
from urllib.parse import urljoin, urlparse
import datetime
from tika import parser
import uuid
import os
from readability import Document
import re

In [2]:
start_url = "https://www.england.nhs.uk/coronavirus/"

whitelist = [".gov.uk", ".nhs.uk"]
# only fetch URLs from these domains

depth = 2

In [3]:
encountered = set()

In [4]:
def sstrip(raw):
    st = re.sub("\s\s+", "\n", raw)
    return st.strip()
    

In [7]:
def get_file(url, out_path):
    r = requests.get(url, stream=True) 
    with open(out_path, "wb") as f: 
        for chunk in r.iter_content(chunk_size=1024): 
             if chunk: 
                 f.write(chunk)

In [5]:
def get_pdf(pdf_url, out_path):
    r = requests.get(pdf_url, stream = True) 
    with open(out_path,"wb") as pdf: 
        for chunk in r.iter_content(chunk_size=1024): 
             if chunk: 
                 pdf.write(chunk)


In [6]:
dataset = []

stack = set([(start_url, "NHS Coronavirus Startpage", 0)])

while stack:
    
    
    url, link_title, level = stack.pop()
    
    print(url)

    if url.endswith('.pptx'):        
        continue


    if url.endswith('.pdf'):
        
        filename = f"{uuid.uuid4()}.pdf"
        get_pdf(url, os.path.join('answers', filename))
        
        parsed = parser.from_file(os.path.join('answers', filename))

        dataset.append(OrderedDict([("date", datetime.datetime.now().strftime("%Y-%m-%d")),
                                    ("url", url),
                                    ("title", link_title),
                                    ("format", "pdf"),
                                    ("text", parsed['content'])]))
    else:
        try:
            raw_html = requests.get(url).text
        except (requests.exceptions.InvalidSchema, requests.exceptions.MissingSchema):
            print("unable to obtain url {url}")

        readable_html = Document(raw_html).content()
        soup = BeautifulSoup(readable_html, 'lxml')
        links = soup.find_all('a')

        # first parse out links

        for link in links:

            href = link.get('href')            
            href = urljoin(url, href) # calculate relative references

            domain = urlparse(href).netloc
            if not any((domain.endswith(urlend) for urlend in whitelist)):
                continue


            if href is None:
                continue
            elif href in encountered:
                continue

            encountered.add(href)



            if href.startswith("#"):
                continue
            else:            
                if level < depth:
                    stack.add((href, link.get('text'), level+1))

        # save the text

        parsed = parser.from_buffer(raw_html)
            
        
        dataset.append(OrderedDict([("date", datetime.datetime.now().strftime("%Y-%m-%d")),
                                    ("url", url),
                                    ("title", sstrip(parsed.get('title', ''))),
                                    ("format", "html"),
                                    ("text", sstrip(parsed['content']))]))


https://www.england.nhs.uk/coronavirus/


  raw_html = str_(tostring(doc.body or doc))


https://www.nhs.uk/
https://www.england.nhs.uk/coronavirus/returning-clinicians/
https://www.nhs.uk/apps-library/hoop/
https://www.england.nhs.uk/coronavirus/publication/visitor-guidance/
https://www.nhs.uk/live-well/
https://www.nhs.uk/apps-library/my-house-memories/
https://www.nhs.uk/conditions/cancer/
https://www.england.nhs.uk/coronavirus/publication/letter-cancer-alliance-information-on-managing-cancer-referrals/
https://www.nhs.uk/oneyou/every-mind-matters/your-mind-plan-quiz/


TypeError: expected string or bytes-like object

In [None]:
import pandas as pd

In [None]:
df = pd.DataFrame(dataset)

In [None]:
df

In [None]:
import tika


In [147]:
parsed = parser.from_buffer(raw_html)

"\nRegister your details - NHS Organ Donation\nNHSBT uses cookies which are essential for the site to work.\nWe also use non-essential cookies to help us improve our services, any data collected is anonymised.\nBy continuing to use this website you agree to our use of cookies. Read more about our cookies\nOK\nHi there, we see you're using OS, why not try our app?\nDownload\nAuxiliary nav\nWho we are\nWhat we do\nHow we help\nHow you can help\nCareers\nNews\nMenu\nRegister\nSearch\nSearch\nSkip to main content\nHome\nRegister your decision\ntoggle next navigation level\nRegister your decision\nRegister to donate\nAmend your details\nRefuse to donate\nWithdraw your details\nHelping you to decide\ntoggle next navigation level\nHelping you to decide\nAbout organ donation\nWhat can you donate?\nWho can donate?\nGet the facts\nLiving donation\nTissue donation\nStatistics\nFAQ\nAbout your choices\nUK laws\nYour faith and beliefs\nWhy ethnicity matters\nReal life stories\nTell your family and 

In [146]:
df

Unnamed: 0,date,url,title,format,text
0,2020-03-20,https://www.england.nhs.uk/coronavirus/,,html,"[b', \n , \n , Skip to main content,..."
1,2020-03-20,https://www.nhs.uk/conditions/coronavirus-covi...,,html,"[b', \n \n\n \n , Skip to main cont..."
2,2020-03-20,https://www.england.nhs.uk/coronavirus/primary...,,html,"[b', \n , \n , Skip to main content,..."
3,2020-03-20,https://www.nhs.uk/using-the-nhs/nhs-services/...,,html,"[b', \n \n\n \n , Skip to main cont..."
4,2020-03-20,https://www.england.nhs.uk/coronavirus/primary...,,html,"[b', \n , \n , Skip to main content,..."
...,...,...,...,...,...
226,2020-03-20,https://www.gov.uk/guidance/wuhan-novel-corona...,,html,"[b', \n \n\n \n\n , \n , \n ..."
227,2020-03-20,https://www.gov.uk/government/news/health-secr...,,html,"[b', \n \n\n \n\n , \n , \n ..."
228,2020-03-20,https://www.nhs.uk/NHSEngland/Healthcareabroad...,,html,"[b', \n \n\n \n , Skip to main cont..."
229,2020-03-20,https://www.england.nhs.uk/accessibility/#main...,,html,"[b', \n , \n , Skip to main content,..."
