In [40]:
from bs4 import BeautifulSoup
import requests
from collections import OrderedDict
from urllib.parse import urljoin, urlparse
import datetime
from tika import parser
import uuid
import os

In [41]:
start_url = "https://www.england.nhs.uk/coronavirus/"

whitelist = [".gov.uk", ".nhs.uk"]
# only fetch URLs from these domains

depth = 2

In [42]:
encountered = set()

In [43]:
def get_pdf(pdf_url, out_path):
    r = requests.get(pdf_url, stream = True) 
    with open(out_path,"wb") as pdf: 
        for chunk in r.iter_content(chunk_size=1024): 
             if chunk: 
                 pdf.write(chunk)


In [None]:
dataset = []

stack = set([(start_url, "NHS Coronavirus Startpage", 0)])

while stack:
    
    url, link_title, level = stack.pop()

    if url.endswith('.pdf'):
        
        filename = f"{uuid.uuid4()}.pdf"
        get_pdf(url, os.path.join('answers', filename))
                
        dataset.append(OrderedDict([("date", datetime.datetime.now().strftime("%Y-%m-%d")),
                                    ("url", url),
                                    ("title", link_title),
                                    ("format", "pdf"),
                                    ("text", text)]))
    else:
        try:
            raw_html = requests.get(url).text
        except (requests.exceptions.InvalidSchema, requests.exceptions.MissingSchema):
            print("unable to obtain url {url}")

        soup = BeautifulSoup(raw_html)
        links = soup.find_all('a')

        # first parse out links

        for link in links:

            href = link.get('href')            
            href = urljoin(url, href) # calculate relative references

            domain = urlparse(href).netloc
            if not any((domain.endswith(urlend) for urlend in whitelist)):
                continue


            if href is None:
                continue
            elif href in encountered:
                continue

            encountered.add(href)



            if href.startswith("#"):
                continue
            else:            
                if level < depth:
                    stack.add((href, link.get('text'), level+1))

        # save the text

        text = soup.find_all(text=True)
        title = soup.title
        
        if title is None or title.string is None:
            title = ""
        else:
            title = title.string.strip()
        
        dataset.append(OrderedDict([("date", datetime.datetime.now().strftime("%Y-%m-%d")),
                                    ("url", url),
                                    ("title", title),
                                    ("format", "html"),
                                    ("text", text)]))


In [21]:
import pandas as pd

In [23]:
pd.DataFrame(dataset)

Unnamed: 0,date,url,title,format,text
0,2020-03-20,https://www.england.nhs.uk/coronavirus/,Coronavirus,html,"[HTML, [if lt IE 7]><html class=""no-js lt-ie9 ..."
1,2020-03-20,https://www.england.nhs.uk/coronavirus/publica...,Coronavirus » COVID-19 prioritisation within c...,html,"[HTML, [if lt IE 7]><html class=""no-js lt-ie9 ..."
2,2020-03-20,https://www.england.nhs.uk/coronavirus/publica...,Coronavirus » Updates and guidance for general...,html,"[HTML, [if lt IE 7]><html class=""no-js lt-ie9 ..."
3,2020-03-20,https://www.gov.uk/guidance/wuhan-novel-corona...,\n Number of coronavirus (COVID-19) cases...,html,"[html, [if lt IE 9]><html class=""lte-ie8"" lang..."
4,2020-03-20,https://www.gov.uk/guidance/wuhan-novel-corona...,\n Number of coronavirus (COVID-19) cases...,html,"[html, [if lt IE 9]><html class=""lte-ie8"" lang..."
...,...,...,...,...,...
253,2020-03-20,https://www.nhs.uk/Service-Search/Dentists/Loc...,Find a dentist - NHS,html,"[html, \n, \n, \n, \n, \n, \n, \n, Find a dent..."
254,2020-03-20,https://www.nhs.uk/our-policies/cookies-policy...,Choose which cookies we use - NHS,html,"[﻿, html, \n, [if lt IE 9]><html class=""ie8"" l..."
255,2020-03-20,https://www.nhs.uk/live-well/healthy-weight/st...,Start the NHS weight loss plan - NHS,html,"[﻿, html, \n, [if lt IE 9]><html class=""ie8"" l..."
256,2020-03-20,https://www.nhs.uk/service-search/find-a-pharm...,Find a pharmacy - NHS,html,"[html, \n, \n, \n, \n, \n, \n, \n, Find a phar..."


In [1]:
import tika


In [2]:
parsed = parser.from_file('/Users/iain/Downloads/NOTICE 16_March.docx')
