In [None]:
!pip3 install lxml requests numpy tqdm

## 1. Crawling

Let's start by downloading some relevant documents. For this challenge we will be using the medical guidelines published by the UK [National Institute for Health and Care Excellence (NICE)](https://www.nice.org.uk/), specifically the [Clinical Knowledge Summaries (CKS)](https://cks.nice.org.uk/).

Start by having a look at the website to familiarise yourself with the content. 


>**IMPORTANT:** The NICE CKS are only served to IP ranges from the UK and its territories, so to access this content from outside the UK you will need [a VPN](https://www.google.com/search?q=free%20vpn).

**Task:**
- identify and download every single guideline page from NICE CKS. Save it as plain HTML in the `data/cks/` folder. Here is one example guideline: https://cks.nice.org.uk/acute-kidney-injury
- extra bonus points if you make sure to only download files that were last modified at a later date than the last version of the file in the `data/cks/` folder

*Tips:*
- Although when rendered by a web browser the user needs to click on each section to see it, in fact each of these guidelines is served as a single HTML page, which makes them perfect for this exercise.
- in order to download them all, you need to find links to every single page
- the front CKS page is an AJAX site: it is dynamically rendered. The whole list of all guidelines is actually stored on the server as a text file and retrieved by the client to render. Identifying and retrieving this file would make your task much easier. Chrome dev tools is your friend :)
- make sure to save everything locally from the start. Restarting and redownloading is bad crawling practice: it may trigger some blocking!


In [None]:
import os
from os import listdir
from os.path import isfile, join, abspath

import requests # recommended library to use for crawling, but feel free to use any other

DATA_DIR = '../data'
CKS_DIR = abspath(join(DATA_DIR,'cks'))

def get_cks_urls():
    """Return the URLs of the CKS guideline pages to download.

    Placeholder: the discovery logic is not implemented yet, so this currently
    returns an empty list and `ROOT_URL` is unused.
    """
    ROOT_URL = 'https://cks.nice.org.uk/'
    
    urls = []
    # your code here
    
    return urls

def crawl_cks(data_dir):
    """Download every CKS guideline page into `data_dir` as an HTML file.

    Each URL returned by `get_cks_urls()` is saved as `<last-path-segment>.html`.
    When a local copy already exists, its modification time is sent as an
    `If-Modified-Since` header so that a server honouring conditional requests
    can answer 304 and the file is left untouched.

    Args:
        data_dir: directory to write the HTML files to (created if missing).

    Returns:
        The list of URLs that were downloaded or confirmed up to date.
        Failures are printed and skipped; they do not abort the crawl.
    """
    from email.utils import formatdate  # stdlib helper for HTTP-date formatting

    os.makedirs(data_dir, exist_ok=True)  # create the data directory if it doesn't exist
    urls = get_cks_urls()
    successful_urls = []

    for url in urls:
        # The last path segment identifies the guideline; the .html suffix is
        # what scrape_cks_files() later filters on.
        slug = url.rstrip('/').rsplit('/', 1)[-1]
        filename = join(data_dir, slug + '.html')

        headers = {}
        if isfile(filename):
            headers['If-Modified-Since'] = formatdate(os.path.getmtime(filename), usegmt=True)

        try:
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
        except requests.RequestException as exc:
            print('Failed to download %s: %s' % (url, exc))
            continue

        # 304 Not Modified: the local copy is still current, nothing to write.
        if response.status_code != 304:
            with open(filename, 'w', encoding='utf8') as f:
                f.write(response.text)
        successful_urls.append(url)

    return successful_urls
    
crawl_cks(CKS_DIR)

## 2. Scraping

Each paragraph (or first-level bullet-point) on these pages roughly equates to one possible answer to a clinician's query, so we will treat them as single paragraphs.

**Your task is to:**
- extract the document title
- extract the description meta tag, if present
- extract last revision date
- split the document into smaller chunks that we'll call "paragraphs"
    - each first-level bullet point under &lt;section&gt; counts as one paragraph of text, including the text contents of all of its children elements. We need to extract the text from all HTML elements under the &lt;li&gt; and group it as one paragraph
    - each top-level &lt;p&gt; under &lt;section&gt; is also a paragraph
- clean the text. For the purposes of this challenge, we want to remove all inline HTML tags but preserve the text.

**Tips**
- You will see that even after all this processing and cleaning, the data is still a mess. For example, in the CKS format, the lowest level of headers is rendered as &lt;p&gt;&lt;strong&gt;text&lt;/strong&gt;&lt;/p&gt;. This is a headache for building scrapers but for this exercise it doesn't matter much: we are not trying to retrieve perfectly formatted text, just to clean it a bit before indexing and to group it into vaguely useful chunks.


In [None]:
# The following utility methods are provided for convenience and needed for the scraping below
from lxml import html

IGNORE_SECTIONS = ['references',
                   'how this topic was developed',
                   'how up-to-date is this topic?',
                   'supporting evidence']

def stringify_children(node):
    """Return the inner HTML of an element in the parse tree as a cleaned string.

    The node's leading text, the serialised markup of each child and the
    node's own tail text are concatenated; HTML entities are unescaped,
    non-breaking spaces become plain spaces, and the result is passed through
    `clean_text`.
    """
    # Imported locally so the standard-library `html` module does not clash
    # with the module-level `from lxml import html`.
    from html import unescape
    from lxml.etree import tostring

    # tostring() already serialises a child's own text and, with the default
    # with_tail=True, its tail. Adding c.text / c.tail separately (as the
    # previous version did) emitted that text twice.
    parts = [node.text]
    parts.extend(tostring(child).decode("utf-8") for child in node)
    parts.append(node.tail)

    text = unescape(''.join(filter(None, parts)))
    text = text.replace('\u00a0', ' ')  # non-breaking space -> regular space

    return clean_text(text)

def join_headers(paragraphs, min_len=150):
    """Join orphan header lines into the paragraph that follows them.

    A header is any entry shorter than `min_len` characters. It is merged with
    the next entry as ``header + ': ' + next``; if the merged text is still
    shorter than `min_len` it keeps absorbing the following entries. The last
    entry has nothing to merge into and is kept as is, even when short.

    Args:
        paragraphs: list of paragraph strings (not modified).
        min_len: length below which an entry counts as a header.

    Returns:
        A new list with headers folded into their following paragraph.
    """
    merged = []
    pending = None  # short text still waiting for a paragraph to attach to

    # Single left-to-right pass; equivalent to repeatedly merging the first
    # short entry and rescanning, without the quadratic restarts and re-slicing.
    for paragraph in paragraphs:
        if pending is not None:
            paragraph = pending + ': ' + paragraph
            pending = None
        if len(paragraph) < min_len:
            pending = paragraph
        else:
            merged.append(paragraph)

    if pending is not None:
        merged.append(pending)
    return merged

def yield_paragraphs(doc):
    """Generate every paragraph of `doc`, section by section, in document order.

    Sections without a 'paragraphs' key and documents without a 'sections'
    key simply contribute nothing.
    """
    sections = doc.get('sections', [])
    for current in sections:
        yield from current.get('paragraphs', [])

In [None]:
# Time for some regular expressions! I recommend https://regex101.com/ for live checking the 
# results of your code
import re

def clean_text(text):
    """Remove links and HTML formatting code from a string and normalise the text.

    Steps, in order:
      1. drop bibliography-reference links together with their link text;
      2. drop <strong> elements together with their text;
      3. replace every opening <li> tag with ' - ';
      4. strip all remaining tags (and HTML comments), keeping their text;
      5. collapse doubled square brackets, then drop any '[ ... ]' span.

    Args:
        text: an HTML fragment as a string.

    Returns:
        The cleaned text.
    """
    # NOTE(review): assumes bibliography links carry "bib" or "reference" in
    # one of their attributes (class/href) -- confirm against the downloaded pages.
    text = re.sub(r'<a\b[^>]*\b(?:bib|reference)[^>]*>.*?</a>', '', text,
                  flags=re.IGNORECASE | re.DOTALL)

    text = re.sub(r'<strong\b[^>]*>.*?</strong>', '', text,
                  flags=re.IGNORECASE | re.DOTALL)
    text = re.sub(r'<li\b[^>]*>', ' - ', text, flags=re.IGNORECASE)

    # Require a tag name after '<' so plain text such as "x < 5 and y > 3"
    # is not mistaken for markup.
    text = re.sub(r'<!--.*?-->|</?[A-Za-z][^>]*>', '', text, flags=re.DOTALL)

    # Raw strings: '\[' and '\s' are not valid escapes in a normal str literal.
    text = re.sub(r'\[\s*\.?\s*\[', '[', text)
    text = re.sub(r'\]\s*\.?\s*\]', ']', text)

    text = re.sub(r'\[[^\[\]]*\]', '', text)

    return text

In [None]:
from tqdm import tqdm

def scrape_cks_section(section):
    """Extract the title and paragraphs of one <section> element, recursively.

    Each direct <p> child and each first-level <li> of a direct <ul> child
    becomes one paragraph (via `stringify_children`). Nested <section>
    elements are scraped recursively and flattened into this section: their
    title (if any) is added as its own paragraph, followed by their paragraphs.
    Short header-like paragraphs are then merged by `join_headers`.

    Args:
        section: an lxml HTML element for a <section>.

    Returns:
        A dict ``{'title': str, 'paragraphs': list of str}``, or None when the
        section title is listed in IGNORE_SECTIONS.
    """
    title_nodes = section.xpath('header/*[self::h1 or self::h2 or self::h3 or self::h4]')
    # text_content() also covers headings whose text sits inside a child
    # element, where `.text` would be None.
    sec_title = title_nodes[0].text_content().strip() if title_nodes else ''

    if sec_title.lower() in IGNORE_SECTIONS:
        return None

    paragraphs = []

    def add_paragraph(node):
        # Skip nodes that are empty once the markup has been cleaned away.
        cleaned = stringify_children(node)
        if cleaned and cleaned.strip():
            paragraphs.append(cleaned)

    for item in section.xpath('*[self::p or self::ul or self::section]'):
        if item.tag == 'ul':
            # Only first-level bullets; nested lists stay inside their parent <li>.
            for bullet in item.xpath('li'):
                add_paragraph(bullet)
        elif item.tag == 'p':
            add_paragraph(item)
        elif item.tag == 'section':
            sec_dict = scrape_cks_section(item)

            if sec_dict is None or len(sec_dict.get('paragraphs', [])) == 0:
                continue

            if sec_dict.get('title'):
                paragraphs.append(sec_dict['title'])

            paragraphs.extend(sec_dict['paragraphs'])

    res = {
        'title': sec_title,
        'paragraphs': join_headers(paragraphs)
    }
    return res


def scrape_cks(text):
    """Parse one CKS guideline page into a document dictionary.

    Args:
        text: the full HTML of the page.

    Returns:
        A dict with keys 'title', 'description', 'last_revision' (each an
        empty string when not found) and 'sections' (the list of dicts
        produced by `scrape_cks_section`, ignored sections left out).
    """
    tree = html.fromstring(text)

    # NOTE(review): assumes the document title is the first <h1> inside
    # <article> -- confirm against the downloaded pages.
    title_nodes = tree.xpath('//article//h1')
    doc_title = title_nodes[0].text_content().strip() if title_nodes else ''

    descriptions = tree.xpath('//head/meta[@name="description"]/@content')
    doc_description = descriptions[0].strip() if descriptions else ''

    # NOTE(review): assumes the revision date is in a text node containing
    # "Last revised" somewhere inside <article> -- confirm against the pages.
    revision_texts = tree.xpath('//article//text()[contains(., "Last revised")]')
    last_revision = revision_texts[0].strip() if revision_texts else ''

    sections = []

    # Only outermost sections: scrape_cks_section() recurses into nested ones,
    # so selecting every <section> would process nested content twice.
    for section in tree.xpath('//section[not(ancestor::section)]'):
        new_section = scrape_cks_section(section)
        if new_section:
            sections.append(new_section)

    doc = {
        'title': doc_title,
        'description': doc_description,
        'last_revision': last_revision,
        'sections': sections
    }
    return doc

def scrape_cks_files(cks_dir):
    """Scrape every saved HTML file in `cks_dir`.

    Args:
        cks_dir: directory holding the downloaded guideline pages; only
            regular files whose extension starts with '.htm' are read.

    Returns:
        A list of document dicts as returned by `scrape_cks`, each with an
        extra 'filename' key holding the source file name without directory.
        Files are processed in sorted name order so the result is reproducible.
    """
    filenames = sorted(
        join(cks_dir, f) for f in listdir(cks_dir)
        if isfile(join(cks_dir, f)) and os.path.splitext(f)[1].startswith('.htm')
    )

    docs = []

    for filename in tqdm(filenames):
        # The crawler writes these files as utf8; read them back the same way
        # instead of relying on the platform default encoding.
        with open(filename, 'r', encoding='utf8') as f:
            text = f.read()
        doc = scrape_cks(text)
        doc['filename'] = os.path.basename(filename)
        docs.append(doc)

    return docs

In [None]:
docs = scrape_cks_files(CKS_DIR)

## 3. Indexing

Once we have extracted and cleaned the text, we can now create useful representations of it for retrieval / question answering. There are infinitely many ways of doing this, but for this exercise, we are going to use dense vector representations generated by Google's deep learning USE v4 model.

This is a near-state-of-the-art model that generates multilingual sentence embeddings, that is, turns a sequence of characters in a natural language into a multidimensional vector of real numbers. In theory, the closer the meaning of the sentences, the closer these vectors of real numbers will be in the 512-dimensional space.

In this model, the dimensions of the vector representing the document don't hold any specific meaning, they are all real numbers and the vector is of a predefined size: it does not grow with the growth of the vocabulary.

Our approach will be to embed each paragraph and each query with USE and find the nearest answer to each question by exhaustively computing the inner product of the query vector and the paragraph vector.

**Task:**
- generate an index of the documents by:
    - creating a large list of all the paragraphs in all the documents: `all_paragraphs`
    - compute the embedding vector for each paragraph: `vectors`
    - keep a list of dictionaries containing metadata for each paragraph: `doc_info`, including 'filename' (the original filename) and 'doc_par_num' (paragraph number inside the document)


In [None]:
# Run the following code to set up the environment
!pip3 install --upgrade tensorflow
!pip3 install  tensorflow-hub

In [None]:
# See https://tfhub.dev/google/universal-sentence-encoder/4 for details
import tensorflow as tf
import tensorflow_hub as hub

module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url) # this will take a good while the first time, it's a 1 gb download
print ("module %s loaded" % module_url)

def embed(input):
    """Run the USE model loaded above on `input` and return the model's output.

    NOTE(review): presumably `input` is a list of strings and the result holds
    one 512-dimensional vector per string -- confirm against the TF Hub page.
    """
    return model(input)

In [None]:
def create_index(docs, batch_size=256):
    """ 
    This function returns 3 sequences of the same length (total number of paragraphs):
        - doc_info: a list of dictionaries, one for each paragraph. 
           Each dict contains the keys 
           - 'filename': the source file that the paragraph comes from
           - 'doc_par_num': the index of that paragraph inside the flattened
           list of paragraphs of its doc (i.e. restarting at 0 for each doc in docs)
              
        - all_paragraphs: a list of all paragraphs, flattened from each doc
        
        - vectors: a 2-D numpy array with one row per paragraph, generated
        using the USE model loaded above 

    `batch_size` is the number of paragraphs embedded per model call; batching
    keeps memory bounded on large collections.
    """
    # Local import: this function is called before the notebook's own
    # `import numpy as np` cell has run.
    import numpy as np

    doc_info = []
    all_paragraphs = []

    for doc in docs:
        for par_num, paragraph in enumerate(yield_paragraphs(doc)):
            doc_info.append({'filename': doc.get('filename'), 'doc_par_num': par_num})
            all_paragraphs.append(paragraph)

    batches = [
        np.asarray(embed(all_paragraphs[start:start + batch_size]))
        for start in range(0, len(all_paragraphs), batch_size)
    ]
    # The notebook text describes USE embeddings as 512-dimensional; that
    # width is only used for the empty-collection case.
    vectors = np.vstack(batches) if batches else np.empty((0, 512))

    return doc_info, all_paragraphs, vectors

In [None]:
doc_info, all_paragraphs, vectors = create_index(docs)

## 4. Retrieval / Question Answering

We are now going to try to use our index to answer a list of clinical questions from [a family medicine website](https://www.mdedge.com/familymedicine/clinical-inquiries). This is a way to informally evaluate how well we can answer these questions by simply retrieving the most relevant paragraph found in our collection.

We will compute the similarity using a very naive approach: the distance in space between the vectors generated by USE for the query and each paragraph.

**Task:**
- load questions from '../data/ebm_questions.txt'
- for each question, retrieve the best match from the collection of paragraphs
- output results to a CSV 
    - CSV should have these columns: question, answer, score, filename
    - results should be sorted by score in decreasing order (i.e. question with highest answer score first)
    - filename is the name of the CKS file you have saved locally
    - score is the inner product or cosine similarity between the query and document vectors
- bonus points for exporting columns *in this order*: question, answer, score, filename
- bonus points for any suggestion on how to improve on this extremely naive approach
    
**Tips:**
- the easiest way to compute the similarity/score between query and document is the inner product. Numpy is your friend. So is Google :)
- if you find that the answers coming out are terrible, don't worry, this is expected!

In [None]:
Q_FILE = '../data/ebm_questions.txt'

# NOTE(review): assumes the file holds one question per line -- confirm
# against the data file. Blank lines are skipped.
with open(Q_FILE, 'r', encoding='utf8') as q_file:
    questions = [line.strip() for line in q_file if line.strip()]

In [None]:
import numpy as np

def retrieval(query):
    """Return the best-matching paragraph for `query`.

    The query is embedded with `embed` and scored against every row of the
    global `vectors` by inner product; the highest-scoring paragraph wins.

    Args:
        query: the question text.

    Returns:
        A dict with 'filename' (source file of the paragraph), 'score' (the
        inner product, as a float) and 'answer' (the paragraph text).

    Raises:
        ValueError: if the index holds no paragraphs.
    """
    if len(all_paragraphs) == 0:
        raise ValueError('the paragraph index is empty; run create_index() first')

    # embed() works on a batch, so wrap the query and take the single row back.
    q_vector = np.asarray(embed([query]))[0]

    scores = np.inner(np.asarray(vectors), q_vector)

    # Only the top hit is needed, so argmax replaces sorting every score.
    best_index = int(np.argmax(scores))

    top_result = {
        'filename': doc_info[best_index]['filename'], 
        'score': float(scores[best_index]),
        'answer': all_paragraphs[best_index],
    }

    return top_result

In [None]:
import pandas as pd # you may find it useful for CSV export

def answer_questions(questions):
    """Retrieve the top answer for each question.

    Returns a list with one dict per question, in input order: the dict
    produced by `retrieval` with the question itself added under 'question'.
    """
    def _answer(question):
        top = retrieval(question)
        top['question'] = question
        return top

    return [_answer(question) for question in questions]

def results_to_csv(results, filename):
    """Write the answers to a CSV file, best score first.

    Args:
        results: list of dicts with the keys 'question', 'answer', 'score'
            and 'filename' (as produced by `answer_questions`).
        filename: path of the CSV file to write.

    The columns are exported in the order question, answer, score, filename;
    rows are sorted by score in decreasing order.
    """
    columns = ['question', 'answer', 'score', 'filename']
    table = pd.DataFrame(results, columns=columns)
    table = table.sort_values('score', ascending=False)
    table.to_csv(filename, index=False, encoding='utf8')

In [None]:
# compute answers and export to a CSV
results_to_csv(answer_questions(questions), 'answers.csv')

## 5. Submitting results

**Task:**  Please save, zip and send this `.ipynb` file and the `answers.csv` file (if you have generated it) to [dan@medwise.ai](mailto:dan@medwise.ai). 