In [1]:
import torch
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("data/nhs_corona.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,date,url,title,format,text
0,0,2020-03-20,https://www.england.nhs.uk/coronavirus/,,text/html; charset=UTF-8,Coronavirus\nSkip to main content\nCookies on ...
1,1,2020-03-20,https://www.england.nhs.uk/accessibility/,,text/html; charset=UTF-8,NHS England » About accessibility\nSkip to mai...
2,2,2020-03-20,https://www.england.nhs.uk/coronavirus/publica...,,text/html; charset=UTF-8,Coronavirus » Letter: Covid-19 and professiona...
3,3,2020-03-20,https://www.gov.uk/government/publications/cov...,,text/html; charset=UTF-8,COVID-19: guidance for households with possibl...
4,4,2020-03-20,https://www.gov.uk/government/organisations,,text/html; charset=UTF-8,"Departments, agencies and public bodies - GOV...."


### Helper for text

In [4]:
from bs4 import BeautifulSoup 

import spacy
nlp = spacy.load("en_core_web_sm")

In [5]:
def get_sentences(html):
    soup = BeautifulSoup(html) 
    spacy_text =  nlp(soup.get_text())
    return spacy_text.sents

sentences = list(get_sentences(df['text'].values[0]))

In [33]:
sentences[0].string

'Coronavirus\nSkip to main content\nCookies on the NHS England and NHS Improvement website\n'

### BERT

In [7]:
from transformers import *

pretrained_weights = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
BERT = BertModel.from_pretrained(pretrained_weights)

In [77]:
def embed_text(text):
    input_ids = torch.tensor([tokenizer.encode(text, add_special_tokens=True)])
    with torch.no_grad():
        last_hidden_states = BERT(input_ids[:512])[0]  
        return np.mean(last_hidden_states.detach().numpy(), axis=1).squeeze()

In [78]:
embed_text("hello world").shape

(768,)

In [51]:
embed_text(sentences[0].string).shape

(768,)

In [21]:
from scipy.spatial.distance import cosine

In [79]:
def find_best(q, sents):
    embeds = [embed_text(s.string) for s in sents]
    q_v = embed_text(q)
    scores = [cosine(q_v, v) for v in embeds]
    return sents[np.argmax(scores)], np.max(scores)

In [None]:
best_sent, best_score = find_best("hello", sentences)

In [74]:
# this is hopefully inefficient
def find_best_over_docs(q, docs):
    best_score, best_sent = -1, None
    for d in docs:
        #import pdb; pdb.set_trace()
        cur_sent, cur_score = find_best(q, d)
        if cur_score > best_score:
            best_score = cur_score
            best_sent = cur_sent
    return best_sent 

In [75]:
all_docs = []
for d in df['text'].values:
    try:
        all_docs.append(list(get_sentences(d)))
    except:
        print("ah! DANGER")
        print(d)

ah! DANGER
nan


In [69]:
len(all_docs)

287

In [64]:
df.shape

(288, 6)

In [76]:
find_best_over_docs("test available?", all_docs)

Token indices sequence length is longer than the specified maximum sequence length for this model (618 > 512). Running this sequence through the model will result in indexing errors


RuntimeError: index out of range: Tried to access index 512 out of table with 511 rows. at /Users/distiller/project/conda/conda-bld/pytorch_1579022061893/work/aten/src/TH/generic/THTensorEvenMoreMath.cpp:418