# Data
---

In [1]:
import os
import json
import heapq
import pickle
import random
import multiprocessing

import spacy
from tqdm.notebook import tqdm

from valerie.utils import get_logger
from valerie.preprocessing import extract_words_from_url, clean_text
from valerie.scoring import validate_predictions_phase2, compute_score_phase2
from valerie.modeling import SequenceClassificationModel, SequenceClassificationDataset, SequenceClassificationExample

In [2]:
nlp = spacy.load("en_core_web_lg")

In [None]:
_logger = get_logger()

In [None]:
# Load the pre-computed responses; later cells read a "claim" object and a "res"
# search result (with res["hits"]["hits"]) from each entry.
# NOTE(review): pickle.load can execute arbitrary code while deserializing, so
# this file must come from a trusted source.
with open("data/phase2-validation-100/processed/responses.pkl", "rb") as fi:
    responses = pickle.load(fi)

In [None]:
len(responses)

In [None]:
def compute_responses_score(results, claims_dict):
    """Score a ranking of retrieved articles against each claim's related articles.

    Args:
        results: mapping of claim key -> list of ``(article_id, score)`` tuples.
        claims_dict: mapping of the same claim keys -> claim objects exposing
            ``id``, ``label``, ``related_articles`` (a dict) and ``to_dict()``.

    Returns:
        dict with ``api_score``/``api_error`` computed from the two highest-scored
        articles of every claim, and ``perfect_rerank_score``/``perfect_rerank_error``
        computed from the two highest-scored articles that also appear in
        ``claim.related_articles``.
    """

    def _prediction(claim, article_ids):
        # The claim's own label is copied into every prediction; at most two
        # articles are kept, keyed by 1-based rank.
        return {
            "label": claim.label,
            "explanation": "",
            "related_articles": {
                rank: article_id
                for rank, article_id in enumerate(article_ids[:2], start=1)
            }
        }

    predictions = {}
    perfect_predictions = {}
    labels = {}

    for key, hits in results.items():
        claim = claims_dict[key]
        labels[claim.id] = claim.to_dict()

        # Built once per claim so the membership filter below is O(1) per hit
        # instead of re-scanning the dict values for every article.
        gold_articles = set(claim.related_articles.values())

        # Article ids ordered by descending score; `results` itself is not modified.
        ranked_ids = [hit[0] for hit in sorted(hits, key=lambda x: x[1], reverse=True)]

        predictions[claim.id] = _prediction(claim, ranked_ids)
        perfect_predictions[claim.id] = _prediction(
            claim, [article_id for article_id in ranked_ids if article_id in gold_articles]
        )

    validate_predictions_phase2(predictions)
    score = compute_score_phase2(labels, predictions)
    validate_predictions_phase2(perfect_predictions)
    perfect_score = compute_score_phase2(labels, perfect_predictions)
    return {
        "perfect_rerank_score": perfect_score["score"],
        "perfect_rerank_error": perfect_score["error"],
        "api_score": score["score"],
        "api_error": score["error"],
    }

In [None]:
def create_text_a(claim):
    """Build the claim-side text: claim, claimant and date joined by single spaces.

    Missing claimant/date are replaced by the placeholders "no claimant" and
    "no date"; the joined string is passed through clean_text.
    """
    parts = [claim.claim]
    parts.append(claim.claimant or "no claimant")
    if claim.date:
        # Keep only the leading date portion: first whitespace-separated token,
        # cut at a "T" separator if one is present.
        parts.append(claim.date.split()[0].split("T")[0])
    else:
        parts.append("no date")
    return clean_text(" ".join(parts))

def create_text_b_content(article):
    """Build the article-side text from source, title, URL words and content.

    Each of source, title and URL words is included only when non-empty and is
    terminated with ". "; the content (if any) is appended last. The result is
    passed through clean_text.
    """
    pieces = []
    if article.source:
        pieces.append(article.source + ". ")
    if article.title:
        pieces.append(article.title + ". ")
    if article.url:
        words = extract_words_from_url(article.url)
        if words:
            pieces.append(" ".join(words) + ". ")
    if article.content:
        pieces.append(article.content)
    return clean_text("".join(pieces))

# Run Spacy on Data

### Claims

In [None]:
# Keep only the claims whose response carries a search result, attaching the
# claim-side text, the raw response and an empty `support` dict to each claim.
# Responses without a result are only counted in `misses`.
claims_list = []
misses = 0
for res in tqdm(responses):
    if res["res"]:
        claim = res["claim"]
        claim.text_a = create_text_a(claim)
        claim.res = res
        claim.support = {}
        claims_list.append(claim)
    else:
        misses += 1

In [None]:
misses

In [None]:
claims_texts = [claim.text_a for claim in claims_list]

In [None]:
# Run spaCy over the claim texts with the tagger, parser, NER and textcat
# components disabled; the resulting docs are later used via claim.doc.similarity.
claims_docs = [doc for doc in tqdm(nlp.pipe(claims_texts, n_process=16, disable=["textcat", "tagger", "parser", "ner"]), total=len(claims_texts))]

In [None]:
# Attach each spaCy doc to its claim and index the claims by claim.index.
claims_dict = {}
# zip() has no length, so total is passed explicitly; without it tqdm can only
# show an indeterminate bar.
for claim, doc in tqdm(zip(claims_list, claims_docs), total=len(claims_list)):
    claim.doc = doc
    claims_dict[claim.index] = claim

In [13]:
len(claims_list)

100

In [14]:
len(set(claims_list))

100

### Articles

In [15]:
# Gather the article of every hit of every response (duplicates included).
# text_b is not set here; it is computed for the de-duplicated articles in a
# later cell.
articles_list = []
misses = 0
for res in tqdm(responses):
    search_result = res["res"]
    if search_result:
        articles_list.extend(hit["article"] for hit in search_result["hits"]["hits"])
    else:
        misses += 1

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




In [16]:
misses

0

In [17]:
len(articles_list)

2822

In [18]:
len(set(articles_list))

1973

In [19]:
# De-duplicate while keeping first-seen order. dict.fromkeys relies on the same
# hash/equality as set(), so the same articles survive, but the resulting order
# is deterministic instead of depending on set iteration order.
articles_list = list(dict.fromkeys(articles_list))

In [20]:
def _text_b_text(article):
    """Pool worker: return the article together with its content-based text_b.

    The article is returned with the text because imap_unordered yields results
    in completion order, so the caller needs it to know which text belongs where.
    """
    return article, create_text_b_content(article)

articles_texts = {}
# Use the pool as a context manager so the 16 worker processes are shut down
# once every result has been consumed, instead of lingering in the kernel.
with multiprocessing.Pool(16) as pool:
    for article, text_b in tqdm(pool.imap_unordered(_text_b_text, articles_list), total=len(articles_list)):
        articles_texts[article.index] = text_b

HBox(children=(FloatProgress(value=0.0, max=1973.0), HTML(value='')))




In [21]:
# Store the text computed by the pool workers on the article objects in
# articles_list, looked up by article.index.
for article in articles_list:
    article.text_b = articles_texts[article.index]

In [22]:
# NOTE: this rebinds articles_texts from the index -> text dict built earlier to
# a list of texts in the same order as articles_list.
articles_texts = [article.text_b for article in tqdm(articles_list)]

HBox(children=(FloatProgress(value=0.0, max=1973.0), HTML(value='')))




In [23]:
# Unlike the claims pipeline, "parser" is NOT disabled here: create_text_b_curated
# iterates article.doc.sents, and the parser is presumably what provides those
# sentence boundaries (TODO confirm for the installed spaCy version).
articles_docs = [doc for doc in tqdm(nlp.pipe(articles_texts, n_process=16, disable=["textcat", "tagger", "ner"]), total=len(articles_texts))]

HBox(children=(FloatProgress(value=0.0, max=1973.0), HTML(value='')))




In [24]:
# Attach each spaCy doc to its article and index the articles by article.index.
articles_dict = {}
# zip() has no length, so total is passed explicitly; without it tqdm can only
# show an indeterminate bar.
for article, doc in tqdm(zip(articles_list, articles_docs), total=len(articles_list)):
    article.doc = doc
    articles_dict[article.index] = article

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




# Examples
---

In [25]:
def create_text_b_curated(article, claim, max_sentences=32):
    """Build article text from the sentences most similar to the claim.

    Every sentence of ``article.doc`` is scored with ``claim.doc.similarity`` and
    the ``max_sentences`` best ones are kept. The kept sentences are joined in
    descending-score order (not document order) and passed through clean_text.

    Side effect: the selected ``{"text", "score"}`` records are stored in
    ``claim.support[article.index]``.

    Args:
        article: article object with a spaCy ``doc`` and an ``index``.
        claim: claim object with a spaCy ``doc`` and a ``support`` dict.
        max_sentences: maximum number of sentences to keep (default 32, the
            value that was previously hard-coded).

    Returns:
        The cleaned, space-joined text of the selected sentences.
    """
    scored_sentences = [
        {"text": sent.text, "score": claim.doc.similarity(sent)}
        for sent in article.doc.sents
    ]
    # nlargest returns the records sorted from highest to lowest score.
    support = heapq.nlargest(max_sentences, scored_sentences, key=lambda x: x["score"])
    claim.support[article.index] = support
    return clean_text(" ".join([s["text"] for s in support]))

In [26]:
# Build one (claim, article) example per retrieved hit. The label is 1 when the
# article's URL is one of the values of claim.related_articles, else 0.
examples = []
for claim in tqdm(claims_dict.values()):
    # NOTE(review): articles_dict is keyed by article.index but looked up with
    # hit["url"] here — this assumes the two are the same string; confirm.
    hits_indices = [hit["url"] for hit in claim.res["res"]["hits"]["hits"]]
    hits = [articles_dict[idx] for idx in hits_indices]

    related_articles_url_set = set(claim.related_articles.values())

    for article in hits:
        # Keep the claim-specific text in a local variable. Articles are shared
        # between claims, so storing it on article.text_b would overwrite the
        # content-based text set earlier and leave whichever claim ran last.
        text_b = create_text_b_curated(article, claim)

        examples.append(SequenceClassificationExample(
            guid=claim.index,
            text_a=claim.text_a,
            text_b=text_b,
            label=1 if article.url in related_articles_url_set else 0,
            art_id=article.index
        ))

HBox(children=(FloatProgress(value=0.0), HTML(value='')))

  





In [27]:
print(len(claims_dict))
print(len(articles_dict))
print()
print(len(claims_dict)*30)
print(len(examples))

100
1973

3000
2822


In [28]:
examples[0]

{
  "guid": "Phase2Validation100Dataset/52",
  "text_a": "U.S. President Donald Trump has voted via mailed absentee ballot in United States elections. no claimant 2020-05-21",
  "text_b": "the state authorized no-reason absentee voting in a referendum in 2018. trump threatens to stop funding for michigan if absentee ballot forms sent to voters secretary of state jocelyn benson speaks outside absentee ballot counting on march, 10, 2020, at the tcf center in detroit. washington \u2013 president donald trump on wednesday threatened funding for michigan amid a global health pandemic if state officials move ahead with plans to send absentee ballot applications to every state voter. the president said if secretary of state jocelyn benson sends out absentee ballot applications to voters he will withhold funding, suggesting its illegal. free story news politics elections donald trump michigan absentee ballot applications. \" breaking: michigan sends absentee ballots to 7.7 million people ahead

In [83]:
list(claims_dict.values())[0].claim

'U.S. President Donald Trump has voted via mailed absentee ballot in United States elections.'

In [84]:
# print(json.dumps(list(claims_dict.values())[0].support, indent=2))

In [30]:
print(len(responses)*16)
print(len(examples))

1600
2822


# Predict
---

In [31]:
# Intended to switch off Weights & Biases logging for the prediction run.
# NOTE(review): the prediction log further down still prints "Automatic Weights &
# Biases logging enabled", so these variables may be read before this cell runs
# (e.g. at import time in the first cell) — confirm, and if so set them earlier.
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "dryrun"
os.environ["WANDB_WATCH"] = "false"

In [32]:
# "castorini/monot5-base-msmarco"
# "castorini/monobert-large-msmarco"
# "nboost/pt-bert-large-msmarco"]:
pretrained_model_name_or_path = "castorini/monobert-large-msmarco"

In [33]:
model = SequenceClassificationModel.from_pretrained(pretrained_model_name_or_path)

[2020-07-14 01:03:34,870] INFO:transformers.configuration_utils: loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/castorini/monobert-large-msmarco/config.json from cache at /home/jay/.cache/torch/transformers/643500d870067d59f219f7b5652919267c01bfa98024e2e74f53b28c1b6aff2b.4c88e2dec8f8b017f319f6db2b157fee632c0860d9422e4851bd0d6999f9ce38
[2020-07-14 01:03:34,872] INFO:transformers.configuration_utils: Model config BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

[2020-07-14 01:03:34,872] INFO:transformers.tokenization_utils_base: Model name 'castorini/monobert-large-msmarco' not f

In [34]:
examples_dataset = model.create_dataset(examples, nproc=16)

[2020-07-14 01:03:44,735] INFO:valerie.modeling: ... converting examples to features ...


HBox(children=(FloatProgress(value=0.0, description='converting examples to features', max=2822.0, style=Progr…




In [36]:
os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3,4,5,6,7"
!echo $CUDA_VISIBLE_DEVICES

0,1,2,3,4,5,6,7


In [37]:
# NOTE(review): the log reports "Batch size = 2048" for predict_batch_size=256 —
# presumably 256 per device across the 8 GPUs listed in CUDA_VISIBLE_DEVICES; confirm.
predict_output = model.predict(examples_dataset, predict_batch_size=256)

[2020-07-14 01:04:45,529] INFO:transformers.training_args: PyTorch: setting up devices
[2020-07-14 01:04:49,207] INFO:transformers.trainer: Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[2020-07-14 01:04:49,210] INFO:transformers.trainer: ***** Running Prediction *****
[2020-07-14 01:04:49,210] INFO:transformers.trainer:   Num examples = 2822
[2020-07-14 01:04:49,210] INFO:transformers.trainer:   Batch size = 2048


HBox(children=(FloatProgress(value=0.0, description='Prediction', max=2.0, style=ProgressStyle(description_wid…






In [38]:
# Re-index the claims and the search-API scores directly from the responses,
# skipping responses that have no search result.
claims_dict = {res["claim"].index: res["claim"] for res in responses if res["res"]}
api_scores_dict = {
    res["claim"].index: {
        hit["article"].index: hit["score"] for hit in res["res"]["hits"]["hits"]
    }
    for res in responses
    if res["res"]
}

# Three candidate rankings per claim, each a list of (article index, score):
#   api   - the search-API score only
#   trans - the model output only
#   both  - the sum of the two
rerank_just_api_responses = {
    res["claim"].index: [
        (hit["article"].index, hit["score"]) for hit in res["res"]["hits"]["hits"]
    ]
    for res in responses
    if res["res"]
}

rerank_just_trans_responses = {res["claim"].index: [] for res in responses if res["res"]}

rerank_both_responses = {
    res["claim"].index: []
    for res in responses
    if res["res"]
}

# zip() silently stops at the shorter input, which would hide missing predictions.
assert len(examples) == len(predict_output.predictions), "expected one prediction row per example"

for example, prediction in tqdm(zip(examples, predict_output.predictions), total=len(examples)):
    # Model output at index 1, used as the "article is related" score. The values
    # shown in the manual-inspection cells fall outside [0, 1], so this is an
    # unnormalized score (presumably a logit), not a probability.
    related_score = float(prediction[1])

    rerank_just_trans_responses[example.guid].append((example.art_id, related_score))
    # NOTE(review): the API scores shown below are around 60-70 while the model
    # scores are single digits, so the plain sum is dominated by the API score.
    rerank_both_responses[example.guid].append((example.art_id, related_score + api_scores_dict[example.guid][example.art_id]))

for name, reranked in (
    ("api", rerank_just_api_responses),
    ("trans", rerank_just_trans_responses),
    ("both", rerank_both_responses),
):
    print(name)
    print(json.dumps(compute_responses_score(reranked, claims_dict), indent=2))
    print()
print()
print()

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


api
{
  "perfect_rerank_score": 0.9617898441427853,
  "perfect_rerank_error": "'None'",
  "api_score": 0.5780794369029664,
  "api_error": "'None'"
}

trans
{
  "perfect_rerank_score": 0.9617898441427853,
  "perfect_rerank_error": "'None'",
  "api_score": 0.5558069381598794,
  "api_error": "'None'"
}

both
{
  "perfect_rerank_score": 0.9617898441427853,
  "perfect_rerank_error": "'None'",
  "api_score": 0.6082453494218201,
  "api_error": "'None'"
}





# Manual Inspection

In [75]:
# Replace every claim's candidate list with a copy sorted by descending score,
# for each of the three ranking variants.
for responses_by_claim in (
    rerank_just_api_responses,
    rerank_just_trans_responses,
    rerank_both_responses,
):
    for k, hits in responses_by_claim.items():
        responses_by_claim[k] = sorted(hits, key=lambda x: x[1], reverse=True)

In [107]:
example_idx = list(rerank_just_api_responses.keys())[4]

In [108]:
print(claims_dict[example_idx].logstr())

{
  "id": 59,
  "claim": "Racial comparisons on coronavirus statistics in Monroe County, N.Y., show that the impact on minorities is \u201cbasically on par, a little bit up, from the population numbers,\u201d meaning that \u201cwe don\u2019t see that disparity as much here.\u201d",
  "claimant": "Lovely Warren",
  "date": "2020-04-08 00:00:00"
}


In [109]:
print(claims_dict[example_idx].label)

1


In [110]:
print(json.dumps(claims_dict[example_idx].related_articles, indent=2))

{
  "Phase2Validation100Dataset/2479.html": "https://www.cdc.gov/nchs/nvss/vsrr/covid_weekly/",
  "Phase2Validation100Dataset/2482.html": "https://www.census.gov/quickfacts/monroecountynewyork"
}


In [111]:
rerank_just_api_responses[example_idx][:5]

[('http://www.sentencingproject.org/publications/color-of-justice-racial-and-ethnic-disparity-in-state-prisons/',
  72.98848),
 ('https://www.sentencingproject.org/publications/color-of-justice-racial-and-ethnic-disparity-in-state-prisons/',
  72.964645),
 ('https://www.cbc.ca/news/world/covid-19-us-canada-death-rates-1.5553168?fbclid=iwar1jx3u-vaussuwhpci4s0phagtxukcnzyytfbmt6g54ubm4jck6qqrjqts',
  70.62437),
 ('https://www.sentencingproject.org/publications/un-report-on-racial-disparities/',
  64.58491),
 ('https://www.hrw.org/news/2009/06/19/race-drugs-and-law-enforcement-united-states#_ftn17',
  63.59014)]

In [112]:
rerank_just_trans_responses[example_idx][:5] 

[('https://www.cbc.ca/news/world/covid-19-us-canada-death-rates-1.5553168?fbclid=iwar1jx3u-vaussuwhpci4s0phagtxukcnzyytfbmt6g54ubm4jck6qqrjqts',
  2.959611415863037),
 ('https://www.vox.com/2020/5/4/21242750/coronavirus-covid-19-united-states-canada-trump-trudeau',
  1.7351104021072388),
 ('https://www.nytimes.com/2020/05/01/world/canada/america-canada-coronavirus-comparison.html',
  -1.714412808418274),
 ('https://www.statnews.com/2020/04/17/influential-covid-19-model-uses-flawed-methods-shouldnt-guide-policies-critics-say/',
  -1.978223443031311),
 ('https://www.npr.org/sections/goatsandsoda/2020/03/20/815408287/how-the-novel-coronavirus-and-the-flu-are-alike-and-different',
  -2.0584213733673096)]

In [113]:
rerank_both_responses[example_idx][:5]

[('https://www.cbc.ca/news/world/covid-19-us-canada-death-rates-1.5553168?fbclid=iwar1jx3u-vaussuwhpci4s0phagtxukcnzyytfbmt6g54ubm4jck6qqrjqts',
  73.58398141586304),
 ('http://www.sentencingproject.org/publications/color-of-justice-racial-and-ethnic-disparity-in-state-prisons/',
  69.76469507644653),
 ('https://www.sentencingproject.org/publications/color-of-justice-racial-and-ethnic-disparity-in-state-prisons/',
  69.74086007644654),
 ('https://www.sentencingproject.org/publications/un-report-on-racial-disparities/',
  60.10208233657836),
 ('https://www.hrw.org/news/2009/06/19/race-drugs-and-law-enforcement-united-states#_ftn17',
  58.874018326416014)]

# Results restricted to 16 sentences and 16 articles

In [35]:
# results when restricting text_b to the 16 most relevant sentences and restricting the hits to the first 16 retrieved articles

# api
# {
#   "perfect_rerank_score": 0.9617898441427853,
#   "perfect_rerank_error": "'None'",
#   "api_score": 0.5780794369029664,
#   "api_error": "'None'"
# }

# trans
# {
#   "perfect_rerank_score": 0.8001005530417297,
#   "perfect_rerank_error": "'None'",
#   "api_score": 0.5578179989944696,
#   "api_error": "'None'"
# }

# both
# {
#   "perfect_rerank_score": 0.8001005530417297,
#   "perfect_rerank_error": "'None'",
#   "api_score": 0.6082453494218201,
#   "api_error": "'None'"
# }