# nlp time
Tests time to run spacy on article hits (~30 per claim). In this case, we look at the time taken for 100 claims, which is about 3000 articles.

In [1]:
import pickle
import spacy
from tqdm import tqdm

In [2]:
# Load the en_core_web_lg spaCy model; every timing cell below reuses this `nlp`.
nlp = spacy.load("en_core_web_lg")

# Load the pre-computed responses (one entry per claim, each with its article hits).
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only point this at a file produced by a trusted pipeline.
with open("data/phase2-validation/processed/responses.pkl", "rb") as responses_file:
    responses = pickle.load(responses_file)

In [3]:
# Baseline timing: run spaCy over every article hit of the first 100 claims.
# "textcat", "tagger" and "ner" are disabled, but "parser" is NOT, so the
# dependency parse (which also provides sentence boundaries) still runs.
# A recorded run of this cell took about 4 min 53 s (~2.9 s per claim), not
# the ~20 seconds noted in an earlier version of this comment.
for res in tqdm(responses[:100]):
    for hit in res["res"]["hits"]["hits"]:
        # The Doc is discarded immediately: only the elapsed time matters here,
        # so there is no need to keep a list of Docs alive or to rebind `_`
        # (IPython's "last output" variable).
        nlp(hit["article"].content, disable=["textcat", "tagger", "ner"])

100%|██████████| 100/100 [04:53<00:00,  2.94s/it]


# similarity time
Now we're going to test how long it takes to find the n most relevant sentences for a claim, given its top two related articles.

In [4]:
import heapq
from valerie.preprocessing import clean_text, extract_words_from_url

In [5]:
def create_text_a(claim):
    """Build the claim-side text: "<claim> <claimant> <date>", passed through clean_text.

    Args:
        claim: object exposing ``claim``, ``claimant`` and ``date`` attributes.

    Returns:
        Whatever ``clean_text`` returns for the space-joined string. A falsy
        claimant becomes "no claimant" and a falsy date becomes "no date".
    """
    pieces = [claim.claim]
    pieces.append(claim.claimant if claim.claimant else "no claimant")
    if claim.date:
        # Keep only the calendar-date part: first whitespace-separated token,
        # then everything before a "T" (e.g. "2020-03-19T00:00:00Z" -> "2020-03-19").
        pieces.append(claim.date.split()[0].split("T")[0])
    else:
        pieces.append("no date")
    return clean_text(" ".join(pieces))

def create_text_b(article):
    """Build the article-side text from source, title, URL words and content.

    Each of source, title and URL words is followed by ". " when present;
    the content is appended as-is. Falsy fields are skipped.

    Args:
        article: object exposing ``source``, ``title``, ``url`` and ``content``.

    Returns:
        Whatever ``clean_text`` returns for the concatenated string.
    """
    segments = []
    if article.source:
        segments.append(article.source + ". ")
    if article.title:
        segments.append(article.title + ". ")
    if article.url:
        words = extract_words_from_url(article.url)
        # The URL may yield no usable words; skip the segment in that case.
        if words:
            segments.append(" ".join(words) + ". ")
    if article.content:
        segments.append(article.content)
    return clean_text("".join(segments))

In [6]:
# Lookup tables filled while timing, keyed by each object's `.index`.
articles_dict = {}
claims_dict = {}

# claim.index -> list of the 5 best-scoring sentence records for that claim.
results = {}

for res in tqdm(responses[:100]):
    # Claim side: the parser is disabled too, since only the Doc is needed for similarity.
    claim = res["claim"]
    claim.nlp = nlp(create_text_a(claim), disable=['parser', 'textcat', 'tagger', 'ner'])
    claims_dict[claim.index] = claim

    # Article side: only the first two hits; the parser stays enabled because
    # `.sents` is iterated below.
    top2_articles = [hit["article"] for hit in res["res"]["hits"]["hits"][:2]]
    for article in top2_articles:
        article.nlp = nlp(create_text_b(article), disable=['textcat', 'tagger', 'ner'])
        articles_dict[article.index] = article

    # Score every sentence of both articles against the claim. Each sentence
    # is re-run through `nlp` as a standalone Doc before calling similarity.
    sentences = [
        {
            "article_index": article.index,
            "sentence": sent.text,
            "score": nlp(
                sent.text, disable=["textcat", "tagger", "parser", "ner"]
            ).similarity(claim.nlp),
        }
        for article in top2_articles
        for sent in article.nlp.sents
    ]

    # Keep only the five highest-scoring sentences, best first.
    sentences = heapq.nlargest(5, sentences, key=lambda record: record["score"])
    results[claim.index] = sentences

100%|██████████| 100/100 [00:27<00:00,  3.62it/s]


In [17]:
# Pick the first claim that was processed (dicts preserve insertion order).
claim_idx = list(results)[0]

In [18]:
# Drop the attached spaCy Doc before displaying the claim.
# NOTE(review): presumably the claim's repr cannot render a Doc -- confirm.
# This mutates the stored claim, so its `.nlp` is gone afterwards.
first_claim = claims_dict[claim_idx]
first_claim.nlp = None
first_claim

{
  "id": 383,
  "claim": "\"Huge! Results From Breaking Chloroquine Study Show 100% Cure Rate For Patients Infected With The Coronavirus.\"",
  "claimant": "Facebook post",
  "label": 0,
  "date": "2020-03-19T00:00:00Z",
  "related_articles": {
    "Phase2ValidationDataset/3682.html": "https://www.teaparty.org/huge-results-from-breaking-chloroquine-study-show-100-cure-rate-for-patients-infected-with-the-coronavirus-432559/",
    "Phase2ValidationDataset/3684.html": "https://www.foxnews.com/transcript/trump-administration-not-ruling-out-domestic-travel-restrictions-amid-coronavirus-pandemic",
    "Phase2ValidationDataset/3687.html": "https://www.wired.com/story/an-old-malaria-drug-may-fight-covid-19-and-silicon-valleys-into-it/",
    "Phase2ValidationDataset/3688.html": "https://www.nytimes.com/2020/03/19/health/coronavirus-drugs-chloroquine.html",
    "Phase2ValidationDataset/3689.html": "https://aidsinfo.nih.gov/drugs/569/chloroquine/0/patient"
  },
  "explanation": null,
  "support"

In [19]:
# Print the top sentences for the chosen claim: article index, score and
# sentence on separate lines, followed by a blank line between entries.
for record in results[claim_idx]:
    print(record["article_index"], record["score"], record["sentence"], "", sep="\n")

https://www.theguardian.com/world/2020/apr/06/hydroxychloroquine-trump-coronavirus-drug
0.900891937071296
this is how an experiment in which 15% of the treatment group and 0% of the control had poor clinical outcomes could end up being reported as showing a “100% cure rate”.

https://www.teaparty.org/huge-results-from-breaking-chloroquine-study-show-100-cure-rate-for-patients-infected-with-the-coronavirus-432559/
0.8817048878904338
school of medicine advisor announced a 100% cure rate in a controlled study done in france of 40 people with the #chinacoronavirus with a malaria drug called #hydroxychloroquine.

https://www.teaparty.org/huge-results-from-breaking-chloroquine-study-show-100-cure-rate-for-patients-infected-with-the-coronavirus-432559/
0.8746766339001871
results from breaking chloroquine study show 100% cure rate for patients infected with the coronavirus | tea party (gateway pundit) – on monday dr. anthony fauci, director of the national institute of allergy and infectious d