In [4]:
import torch
import numpy as np
import pandas as pd

In [5]:
df = pd.read_csv("data/nhs_corona.csv")

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,date,url,title,format,text
0,0,2020-03-20,https://www.england.nhs.uk/coronavirus/,,text/html; charset=UTF-8,Coronavirus\nSkip to main content\nCookies on ...
1,1,2020-03-20,https://www.england.nhs.uk/accessibility/,,text/html; charset=UTF-8,NHS England » About accessibility\nSkip to mai...
2,2,2020-03-20,https://www.england.nhs.uk/coronavirus/publica...,,text/html; charset=UTF-8,Coronavirus » Letter: Covid-19 and professiona...
3,3,2020-03-20,https://www.gov.uk/government/publications/cov...,,text/html; charset=UTF-8,COVID-19: guidance for households with possibl...
4,4,2020-03-20,https://www.gov.uk/government/organisations,,text/html; charset=UTF-8,"Departments, agencies and public bodies - GOV...."


### Helper for text

In [7]:
from bs4 import BeautifulSoup 

import spacy
nlp = spacy.load("en_core_web_sm")

In [8]:
def get_sentences(html):
    soup = BeautifulSoup(html) 
    spacy_text =  nlp(soup.get_text())
    return spacy_text.sents

sentences = list(get_sentences(df['text'].values[0]))

In [32]:
sentences[0].string

'Coronavirus\nSkip to main content\nCookies on the NHS England and NHS Improvement website\n'

In [33]:
# need to revisit the unit of retrieval -- this often yields 
# very short "sentences" (one word) -- should probably drop at the
# very least. maybe chunk docs into equal sized blocks? or something?
all_docs = []
for d in df['text'].values:
    try:
        all_docs.append(list(get_sentences(d)))
    except:
        print("ah! DANGER")
        print(d)

ah! DANGER
nan


### BERT

In [14]:
# https://github.com/UKPLab/sentence-transformers
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer('bert-base-nli-mean-tokens')
#corpus_embeddings = embedder.encode([s.string for s in all_sentences])

In [58]:
def embed_doc(doc_sents, get_mean=True):
    doc_vecs = embedder.encode([s.string for s in doc_sents])
    if get_mean:
        return np.mean(doc_vecs, axis=0)
    return doc_vecs

In [48]:
def best_match_in_doc(q_v, doc_embeddings, k=5):
    distances = scipy.spatial.distance.cdist(q_v, doc_embeddings, "cosine")[0]
    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])
    return results[:k]

In [57]:
# this takes a bit but idea is only have to do at start up.
corpus_embeddings = [embed_doc(doc) for doc in all_docs]

In [45]:
import scipy
query = "testing"
query_embedding = embedder.encode([query])
distances = scipy.spatial.distance.cdist(query_embedding, corpus_embeddings, "cosine")[0]

In [46]:
len(distances)

287

In [49]:
results = zip(range(len(distances)), distances)
results = sorted(results, key=lambda x: x[1])

print("\n\n======================\n\n")
print("Query:", query)
print("\nTop 5 most similar docs in corpus:")

for idx, distance in results[0:1]:
    print("\n\n")
    print(all_docs[idx], "(Score: %.4f)" % (1-distance))





Query: testing

Top 5 most similar docs in corpus:



[letter
Page 1 of 1
19 March 2020
, To:
CEOs of NHS and Foundation Trusts
CEOs of Clinical Commissioning Groups
Directors of Public Health
CEOs of Community Health Providers
CEOs of private and not-for-profit community providers
CEOs for community interest companies
, Cc:
NHS England and NHS Improvement Regional Directors
Chief Executives of Councils
, COVID-19 Prioritisation within Community Health Services
Following on from Sir Simon Stevens’ and, Amanda Pritchard’s letter of 17 March 2020, this letter
and annex set out how providers of community services can release capacity to support the COVID-
19 preparedness and response., These arrangements will apply until 31 July 2020 in the first in-
stance.
, The current priorities for providers of community services during this pandemic are:
1., Support home discharge today of patients from acute and community beds, as mandated in
the new Hospital Discharge Service Requirements, and

In [66]:
all_docs[idx][1].string


'To:\nCEOs of NHS and Foundation Trusts\nCEOs of Clinical Commissioning Groups\nDirectors of Public Health\nCEOs of Community Health Providers\nCEOs of private and not-for-profit community providers\nCEOs for community interest companies\n'

In [73]:
doc_sent_embeddings = embed_doc(all_docs[idx], get_mean=False)
res = best_match_in_doc(query_embedding,  doc_sent_embeddings)

'Screening Programme\n'

In [72]:
query

'testing'

In [None]:
all_docs[idx][res[1][0]].string

In [56]:
 corpus_embeddings[idx].shape

(768,)

#### ^^^ use above

In [7]:
from transformers import *

pretrained_weights = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
BERT = BertModel.from_pretrained(pretrained_weights)

In [85]:
def embed_text(text):
    input_ids = torch.tensor([tokenizer.encode(text, max_length=512, add_special_tokens=True)])
    with torch.no_grad():
        last_hidden_states = BERT(input_ids)[0]  
        return np.mean(last_hidden_states.detach().numpy(), axis=1).squeeze()

In [86]:
embed_text("hello world").shape

(768,)

In [87]:
embed_text(sentences[0].string).shape

(768,)

In [88]:
from scipy.spatial.distance import cosine

In [89]:
def find_best(q, sents):
    embeds = [embed_text(s.string) for s in sents]
    q_v = embed_text(q)
    scores = [cosine(q_v, v) for v in embeds]
    return sents[np.argmax(scores)], np.max(scores)

In [90]:
best_sent, best_score = find_best("hello", sentences)

In [96]:
# this is hopefully inefficient
def find_best_over_docs(q, docs):
    best_score, best_sent = -1, None
    for d in docs:
        #
        cur_sent, cur_score = find_best(q, d)
        if cur_score > best_score:
            best_score = cur_score
            best_sent = cur_sent
    return best_sent 

ah! DANGER
nan


In [93]:
len(all_docs)

287

In [101]:
a_doc = all_docs[0]
test = [torch.tensor([tokenizer.encode(s_i, max_length=512, add_special_tokens=True)]) for s_i in a_doc]
#test = embed_text(all_docs[0])

ValueError: Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.

In [94]:
df.shape

(288, 6)

In [95]:
find_best_over_docs("test available?", all_docs)

Token indices sequence length is longer than the specified maximum sequence length for this model (618 > 512). Running this sequence through the model will result in indexing errors


）
 如果您与他人一起居住，并且您或其中有人患有冠状病毒症状，则所有家
庭成员均必须呆在家里且不得离家 14 天。这一 14 天的期限从家里第一
个人生病的那一天开始
 居住在一个家庭中的人可能会相互感染或已经被感染。在家呆 14 天将大
大减少家庭可能传染给社区其他人的总感染量
 对于开始出现症状的家庭中的任何人，他们都需要从出现症状开始在家里
呆 7 天，而无论他们当时处于最初 14 天隔离期的哪一天。（有关更多信
息，请参见下面的结束隔离章节）
 如果可以的话，请将任何弱势群体人员（例如老人和具有基础健康状况的
人）转出您的家，在家庭隔离期间与朋友或家人呆在一起
 如果您不能将弱势群体人员转出家，请尽可能远离他们
 如果您有冠状病毒症状：
 不要前往全科医生诊所、药房或医院
