In [4]:
from pipeline import *
from pipeline import _logger, _sequence_classification

In [5]:
import types

# Run configuration kept in a plain namespace: data/model paths plus the
# prediction batch size and the `nproc` value handed to the pipeline steps.
parser_args = types.SimpleNamespace(
    metadata_file="data/phase2-validation-100/raw/metadata.json",
    predictions_file="logs/predictions.json",
    claimant_model_file="models/claimant_model.json",
    fnc_model_dir="models/phase2/single-claim-claimant-date/roberta-large-combined-128-2",
    rerank_model_dir="models/rerank/castroni/monobert-large-msmarco",
    predict_batch_size=128,
    nproc=8,
)

In [None]:
#####################
### read metadata ###
#####################
# Read the claims from the configured metadata file.
log_title(_logger, "reading claims from {}".format(parser_args.metadata_file))
claims = get_claims(metadata_file=parser_args.metadata_file)

# Gather the log representation of the first five claims only, then emit
# them as a single log record.
claim_preview = []
for i, claim in enumerate(claims):
    if i >= 5:
        break
    claim_preview.append("\n{}\n".format(claim.logstr()))
log_msg = "".join(claim_preview)
_logger.info("first 5 claims:\n%s", log_msg)

######################
### process claims ###
######################
log_title(_logger, "process claims")

# Attach the generated doc to every claim; remember the doc text of the first
# five claims for the log preview.
claim_docs_dict = generate_claim_docs_dict(claims=claims)
doc_preview = []
for i, claim in enumerate(claims):
    claim.doc = claim_docs_dict[claim.id]
    if i < 5:
        doc_preview.append(
            "\nclaim_id = {}\n{}\n".format(claim.id, claim.doc.text)
        )
log_msg = "".join(doc_preview)
_logger.info("first 5 claim doc texts:\n%s", log_msg)

# Same pattern for text_a: attach it to every claim and preview the first five.
claim_text_a_dict = generate_text_a_dict(claims=claims)
text_a_preview = []
for i, claim in enumerate(claims):
    claim.text_a = claim_text_a_dict[claim.id]
    if i < 5:
        text_a_preview.append(
            "\nclaim_id = {}\n{}\n".format(claim.id, claim.text_a)
        )
log_msg = "".join(text_a_preview)
_logger.info("first 5 claim text_a:\n%s", log_msg)

#######################
### fetch responses ###
#######################
log_title(_logger, "fetching query responses")
queries, responses = get_responses(claims=claims, nproc=parser_args.nproc)

# Store each claim's query and response on the claim itself, count the claims
# whose response is falsy, and keep the first five queries for the log preview.
query_preview = []
null_responses = 0
for i, claim in enumerate(claims):
    claim.query = queries[claim.id]
    claim.res = responses[claim.id]

    if not claim.res:
        null_responses += 1

    if i < 5:
        query_preview.append(
            "\nclaim_id = {}\n{}\n".format(claim.id, claim.query)
        )
log_msg = "".join(query_preview)

_logger.info("  num_claims:         %d", len(claims))
_logger.info("  null_responses:     %d", null_responses)
_logger.info("first 5 generated queries:\n%s", log_msg)

########################
### process articles ###
########################
log_title(_logger, "process related articles")
articles_dict = get_articles_dict(claims=claims)
_logger.info("  num articles in articles_dict:  %d", len(articles_dict))

# Attach the hits to every claim while tallying the total number of hits and
# the number of claims without any hit. For the first five claims, keep the
# score/url of their first five hits for the log preview.
hits_dict = get_hits_dict(claims=claims, articles_dict=articles_dict)
hits_preview = []
num_total_hits = 0
no_hits_claims = 0
for i, claim in enumerate(claims):
    claim.hits = hits_dict[claim.id]
    num_total_hits += len(claim.hits)
    if not claim.hits:
        no_hits_claims += 1
    if i < 5:
        top_hits = [
            {"score": hit["score"], "url": hit["url"]} for hit in claim.hits[:5]
        ]
        hits_preview.append(
            "\nclaim_id = {}\n{}\n".format(claim.id, json.dumps(top_hits, indent=2))
        )
log_msg = "".join(hits_preview)
_logger.info("  num_total_hits:     %d", num_total_hits)
_logger.info("  no_hits_claims:     %d", no_hits_claims)
# NOTE(review): "resticted" below is a typo for "restricted"; the string is
# emitted at runtime, so it is left untouched here.
_logger.info("first 5 claim hits (resticted to top 5 hits per claim):\n%s", log_msg)

# Attach the generated doc to every article.
article_docs_dict = generate_article_docs_dict(articles=articles_dict.values())
for article in articles_dict.values():
    article.doc = article_docs_dict[article.id]

########################
### generate support ###
########################
log_title(_logger, "generating support")
support_dict = generate_support_dict(claims=claims, keep_top_n_sentences=32)

# Attach the support to every claim. The preview covers the first five claims
# and, per claim, the first three entries of the first three support items.
support_preview = []
for i, claim in enumerate(claims):
    claim.support = support_dict[claim.id]
    if i < 5:
        leading_items = list(claim.support.items())[:3]
        trimmed_support = {k: v[:3] for k, v in leading_items}
        support_preview.append(
            "\nclaim_id = {}\n{}\n".format(
                claim.id, json.dumps(trimmed_support, indent=2)
            )
        )
log_msg = "".join(support_preview)
_logger.info(
    "first 5 claim support (top 3 articles, top 3 sentences):\n%s", log_msg
)

##############
### rerank ###
##############
log_title(_logger, "rerank and select top 2 articles")
rerank_hits_dict = rerank_hits(
    claims=claims,
    rerank_model_dir=parser_args.rerank_model_dir,
    predict_batch_size=parser_args.predict_batch_size,
    keep_top_n=2,  # make sure we only keep the top two results
    nproc=parser_args.nproc,
)

# Attach the reranked articles to every claim; for the first five claims also
# record the log representation of each selected article.
rerank_preview = []
for i, claim in enumerate(claims):
    claim.related_articles = rerank_hits_dict[claim.id]
    if i >= 5:
        continue
    for j, article_id in claim.related_articles.items():
        rerank_preview.append(
            "\nclaim_id = {}\narticle #={}\n{}\n".format(
                claim.id, str(j), articles_dict[article_id].logstr()
            )
        )
log_msg = "".join(rerank_preview)
_logger.info("first 5 claims chosen reranked articles:\n%s", log_msg)

###############################
### sequence classification ###
###############################
log_title(_logger, "generating sequence classification predictions")
seq_clf_predictions, seq_clf_explanations = sequence_classification(
    claims=claims,
    fnc_model_dir=parser_args.fnc_model_dir,
    predict_batch_size=parser_args.predict_batch_size,
    nproc=parser_args.nproc,
)

# Preview the prediction and explanation of exactly the first five claims
# (indices 0-4). The previous `i > 5` check let a sixth claim through, which
# contradicted the "first 5" log message below.
log_msg = ""
for i, claim in enumerate(claims):
    if i >= 5:
        break
    log_msg += "\nclaim_id = {}\npred: {}\nexplanation: {}\n".format(
        claim.id,
        str(seq_clf_predictions[claim.id]),
        seq_clf_explanations[claim.id],
    )
_logger.info("first 5 sequence classification output results:\n%s", log_msg)

###############################
### claimant classification ###
###############################
log_title(_logger, "generating claimant predictions")
claimant_predictions, claimant_explanations = claimant_classification(
    claims=claims, claimant_model_file=parser_args.claimant_model_file
)

# Preview the prediction and explanation of exactly the first five claims
# (indices 0-4). The previous `i > 5` check let a sixth claim through, which
# contradicted the "first 5" log message below.
log_msg = ""
for i, claim in enumerate(claims):
    if i >= 5:
        break
    log_msg += "\nclaim_id = {}\npred: {}\nexplanation: {}\n".format(
        claim.id,
        str(claimant_predictions[claim.id]),
        claimant_explanations[claim.id],
    )
_logger.info("first 5 claimant classification output results:\n%s", log_msg)

[2020-07-18 02:06:30,604] INFO:root: 
[2020-07-18 02:06:30,605] INFO:root: ................................................................................
[2020-07-18 02:06:30,606] INFO:root: ....... reading claims from data/phase2-validation-100/raw/metadata.json .......
[2020-07-18 02:06:30,607] INFO:root: ................................................................................
[2020-07-18 02:06:30,608] INFO:root: 


HBox(children=(FloatProgress(value=0.0, description='Phase2Dataset to claims', style=ProgressStyle(description…

[2020-07-18 02:06:30,682] INFO:valerie.datasets: missed row to claim conversions: 0
[2020-07-18 02:06:30,684] INFO:valerie.datasets: len of claims: 100
[2020-07-18 02:06:30,685] INFO:root: first 5 claims:

{
  "id": 0,
  "claim": "\u201cPelosi\u2019s new coronavirus bill allows illegals to receive billions in relief funds in past, current, and future payments.\u201d",
  "claimant": "Facebook post",
  "date": "2020-05-12 00:00:00"
}

{
  "id": 1,
  "claim": "There have been 317 criminal indictments under three recent Republican presidents and only three indictments under three recent Democratic presidents.",
  "claimant": "Facebook posts",
  "date": "2019-12-29 00:00:00"
}

{
  "id": 2,
  "claim": "\"After Nov. 3, coronavirus will magically all of a sudden go away and disappear.\u201d",
  "claimant": "Eric Trump",
  "date": "2020-05-16 00:00:00"
}

{
  "id": 3,
  "claim": "North Carolina is \"in the small minority of states that requires an absentee ballot to be signed by two witnesses 




HBox(children=(FloatProgress(value=0.0, description='running spacy on claim.claim', style=ProgressStyle(descri…

[2020-07-18 02:06:30,776] INFO:root: first 5 claim doc texts:

claim_id = 0
“Pelosi’s new coronavirus bill allows illegals to receive billions in relief funds in past, current, and future payments.”

claim_id = 1
There have been 317 criminal indictments under three recent Republican presidents and only three indictments under three recent Democratic presidents.

claim_id = 2
"After Nov. 3, coronavirus will magically all of a sudden go away and disappear.”

claim_id = 3
North Carolina is "in the small minority of states that requires an absentee ballot to be signed by two witnesses or a notary public."

claim_id = 4
“Clearly, the Obama administration did not leave any kind of game plan for something like this.”






HBox(children=(FloatProgress(value=0.0, description='running spacy on claim text_a', style=ProgressStyle(descr…

[2020-07-18 02:06:30,834] INFO:root: first 5 claim text_a:

claim_id = 0
“Pelosi’s new coronavirus bill allows illegals to receive billions in relief funds in past, current, and future payments.” Facebook post 2020-05-12

claim_id = 1
There have been 317 criminal indictments under three recent Republican presidents and only three indictments under three recent Democratic presidents. Facebook posts 2019-12-29

claim_id = 2
"After Nov. 3, coronavirus will magically all of a sudden go away and disappear.” Eric Trump 2020-05-16

claim_id = 3
North Carolina is "in the small minority of states that requires an absentee ballot to be signed by two witnesses or a notary public." Jay Chaudhuri 2020-04-10

claim_id = 4
“Clearly, the Obama administration did not leave any kind of game plan for something like this.” Mitch McConnell 2020-05-11

[2020-07-18 02:06:30,836] INFO:root: 
[2020-07-18 02:06:30,837] INFO:root: ................................................................................
[




HBox(children=(FloatProgress(value=0.0, description='generating queries', style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='fetching query responses', style=ProgressStyle(descriptio…

[2020-07-18 02:07:11,061] INFO:root:   num_claims:         100
[2020-07-18 02:07:11,063] INFO:root:   null_responses:     0
[2020-07-18 02:07:11,064] INFO:root: first 5 generated queries:

claim_id = 0
Pelosi new coronavirus bill allows illegals receive billions relief funds past current future payments 2020-05-12 Facebook post

claim_id = 1
317 criminal indictments recent Republican presidents indictments recent Democratic presidents 2019-12-29 Facebook posts

claim_id = 2
Nov. 3 coronavirus magically sudden away disappear 2020-05-16 Eric Trump

claim_id = 3
North Carolina small minority states requires absentee ballot signed witnesses notary public 2020-04-10 Jay Chaudhuri

claim_id = 4
Clearly Obama administration leave kind game plan like 2020-05-11 Mitch McConnell

[2020-07-18 02:07:11,065] INFO:root: 
[2020-07-18 02:07:11,066] INFO:root: ................................................................................
[2020-07-18 02:07:11,066] INFO:root: ..........................




HBox(children=(FloatProgress(value=0.0, description='running spacy on articles', max=1927.0, style=ProgressSty…

# Using an MS MARCO-style reranker

In [4]:
def rerank_support_examples(claims, rerank_top_n):
    """Build the (claim text, support sentence) examples to be reranked.

    For every claim and each of its related articles, one
    ``SequenceClassificationExample`` is created per support entry, limited to
    the first ``rerank_top_n`` entries of that article's support list.

    Args:
        claims: iterable of claim objects exposing ``id``, ``text_a.text``,
            ``related_articles`` (mapping whose values are article ids) and
            ``support`` (mapping from article id to a sequence of dicts that
            each carry a ``"text"`` key).
        rerank_top_n: number of leading support entries taken per article.

    Returns:
        A flat list of examples, in claim / article / support order.
    """
    examples = []
    for claim in tqdm(claims):
        # Only the article ids are needed, not the keys they are stored under.
        for art_id in claim.related_articles.values():
            leading_support = claim.support[art_id][:rerank_top_n]
            examples.extend(
                SequenceClassificationExample(
                    guid=claim.id,
                    text_a=claim.text_a.text,
                    text_b=sup["text"],
                    art_id=art_id,
                )
                for sup in leading_support
            )
    return examples


def rerank_msmarco(claims, rerank_model_dir, predict_batch_size, rerank_top_n, keep_top_n, nproc):
    """Score support sentences with a sequence classifier and keep the best per claim.

    Args:
        claims: iterable of claim objects (each exposing ``id``); passed on to
            ``rerank_support_examples`` to build the examples.
        rerank_model_dir: model directory handed to ``_sequence_classification``.
        predict_batch_size: batch size handed to ``_sequence_classification``.
        rerank_top_n: number of leading support entries per article to score.
        keep_top_n: number of highest-scoring sentences kept per claim.
        nproc: forwarded to ``_sequence_classification``.

    Returns:
        Dict mapping every claim id to a list of at most ``keep_top_n`` dicts
        ``{"art_id", "text", "score"}``, ordered by descending score. Claims
        without any example map to an empty list.

    Raises:
        ValueError: if the number of predictions differs from the number of
            examples.
    """
    examples = rerank_support_examples(claims, rerank_top_n)
    _logger.info(
        "first 5 rerank examples:\n%s",
        json.dumps([example.__dict__ for example in examples[:5]], indent=2),
    )

    probabilities = _sequence_classification(
        examples, rerank_model_dir, predict_batch_size=predict_batch_size, nproc=nproc,
    )

    if len(probabilities) != len(examples):
        raise ValueError(
            "len predictions ({}) != len examples ({})".format(
                len(probabilities), len(examples)
            )
        )

    # we use this for initialization instead of collections.defaultdict(list)
    # because there is a possibility that a claim had a null response when
    # queried, which means it has no rerank examples, in which case we still
    # want it in the dict below, just with an empty list
    reranked_support_dict = {claim.id: [] for claim in claims}

    # zip() has no length, so tell tqdm the total explicitly; otherwise the
    # progress bar cannot show how far along the loop is.
    for example, proba in tqdm(zip(examples, probabilities), total=len(examples)):
        score = float(proba[1])  # gets relatedness score of example
        reranked_support_dict[example.guid].append(
            {"art_id": example.art_id, "text": example.text_b, "score": score}
        )

    # keep only the keep_top_n highest-scoring sentences of each claim
    return {
        claim_id: heapq.nlargest(keep_top_n, sents, key=lambda x: x["score"])
        for claim_id, sents in reranked_support_dict.items()
    }

In [5]:
reranked_support_dict_msmarco = rerank_msmarco(
    claims=claims,
    rerank_model_dir=parser_args.rerank_model_dir,
    predict_batch_size=parser_args.predict_batch_size,
    rerank_top_n=10,
    keep_top_n=2,
    nproc=parser_args.nproc,
)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[2020-07-18 01:56:17,151] INFO:root: first 5 rerank examples:
[
  {
    "guid": 0,
    "text_a": "\u201cPelosi\u2019s new coronavirus bill allows illegals to receive billions in relief funds in past, current, and future payments.\u201d Facebook post 2020-05-12",
    "text_b": "the exclusion of nearly 5.5 million u.s. citizens and green-card holders from the cares act stimulus is a point some in congress are seeking to revisit in future pandemic-relief legislation.",
    "label": null,
    "art_id": "https://www.migrationpolicy.org/article/covid19-immigrants-shut-out-federal-relief"
  },
  {
    "guid": 0,
    "text_a": "\u201cPelosi\u2019s new coronavirus bill allows illegals to receive billions in relief funds in past, current, and future payments.\u201d Facebook post 2020-05-12",
    "text_b": "and for a look at the u.s. citizens and legal permanent residents in mixed-status families who could become eligible for cares act payments under legislation introduced by sen. marco rubio (r-




[2020-07-18 01:56:25,080] INFO:transformers.modeling_utils: All model checkpoint weights were used when initializing BertForSequenceClassification.

[2020-07-18 01:56:25,082] INFO:transformers.modeling_utils: All the weights of BertForSequenceClassification were initialized from the model checkpoint at models/rerank/castroni/monobert-large-msmarco.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.
[2020-07-18 01:56:25,084] INFO:valerie.modeling: ... converting examples to features ...


HBox(children=(FloatProgress(value=0.0, description='converting examples to features', max=1987.0, style=Progr…




[2020-07-18 01:56:27,240] INFO:transformers.training_args: PyTorch: setting up devices
[2020-07-18 01:56:27,574] INFO:transformers.trainer: Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Wandb version 0.9.3 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[2020-07-18 01:56:28,027] INFO:wandb.run_manager: system metrics and metadata threads started
[2020-07-18 01:56:28,038] INFO:transformers.trainer: ***** Running Prediction *****
[2020-07-18 01:56:28,039] INFO:transformers.trainer:   Num examples = 1987
[2020-07-18 01:56:28,040] INFO:transformers.trainer:   Batch size = 1024


HBox(children=(FloatProgress(value=0.0, description='Prediction', max=2.0, style=ProgressStyle(description_wid…

[2020-07-18 01:56:28,587] INFO:wandb.run_manager: file/dir modified: /home/jay/.local/.ml/wandb/dryrun-20200718_055526-3425rg3g/wandb-metadata.json





HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

[2020-07-18 01:56:40,279] INFO:wandb.run_manager: shutting down system stats and metadata service





# using sentence transformers

In [6]:
from sentence_transformers import SentenceTransformer
import scipy.spatial

In [7]:
def rerank_st(claims, keep_top_n):
    """Rank support sentences by cosine similarity to the claim embedding.

    The claim text (claim plus claimant, when present) and every support
    sentence of the claim's related articles are embedded with the
    ``bert-large-nli-mean-tokens`` SentenceTransformer model; each sentence is
    scored with ``1 - cosine distance`` to the claim embedding.

    Args:
        claims: iterable of claim objects exposing ``id``, ``claim``,
            ``claimant``, ``related_articles`` (mapping whose values are
            article ids) and ``support`` (mapping from article id to a sequence
            of dicts that each carry a ``"text"`` key).
        keep_top_n: number of highest-scoring sentences kept per claim.

    Returns:
        Dict mapping every claim id to a list of at most ``keep_top_n`` dicts
        ``{"art_id", "text", "score"}``, ordered by descending score. Claims
        with nothing to score map to an empty list.
    """
    def generate_claim_st_text(claim):
        # claim text, followed by the claimant when there is one
        text = claim.claim
        if claim.claimant:
            text += " "
            text += claim.claimant
        return clean_text(text)

    reranked_support_dict = {claim.id: [] for claim in claims}
    embedder = SentenceTransformer("bert-large-nli-mean-tokens")
    for claim in tqdm(claims):
        # encode() operates on a list of sentences, so wrap the single claim
        # text in a list. With a bare string the meaning of `[0]` depends on
        # the sentence-transformers version (first character's embedding in
        # releases that iterate the input, a scalar in releases that return a
        # single vector for string input) and is wrong either way.
        claim_embedding = embedder.encode(
            [generate_claim_st_text(claim)], show_progress_bar=False
        )[0]
        for art_id in claim.related_articles.values():
            support_texts = [sup["text"] for sup in claim.support[art_id]]
            if not support_texts:
                # nothing to score; cdist would reject an empty embedding array
                continue
            support_embeddings = embedder.encode(support_texts, show_progress_bar=False)
            distances = scipy.spatial.distance.cdist(
                [claim_embedding], support_embeddings, "cosine"
            )[0]
            for text, distance in zip(support_texts, distances):
                # turn cosine distance into a similarity so larger is better
                reranked_support_dict[claim.id].append(
                    {"art_id": art_id, "text": text, "score": 1 - distance}
                )

    for k, v in reranked_support_dict.items():
        reranked_support_dict[k] = heapq.nlargest(keep_top_n, v, key=lambda x: x["score"])
    return reranked_support_dict

In [8]:
reranked_support_dict_st = rerank_st(claims, 2)

[2020-07-18 01:56:41,080] INFO:root: Load pretrained SentenceTransformer: bert-large-nli-mean-tokens
[2020-07-18 01:56:41,081] INFO:root: Did not find a '/' or '\' in the name. Assume to download model from server.
[2020-07-18 01:56:41,083] INFO:root: Load SentenceTransformer from folder: /home/jay/.cache/torch/sentence_transformers/public.ukp.informatik.tu-darmstadt.de_reimers_sentence-transformers_v0.2_bert-large-nli-mean-tokens.zip
[2020-07-18 01:56:41,092] INFO:transformers.configuration_utils: loading configuration file /home/jay/.cache/torch/sentence_transformers/public.ukp.informatik.tu-darmstadt.de_reimers_sentence-transformers_v0.2_bert-large-nli-mean-tokens.zip/0_BERT/config.json
[2020-07-18 01:56:41,093] INFO:transformers.configuration_utils: Model config BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
 

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




# analyze results

In [9]:
from valerie.utils import stats

In [10]:
def print_shallow_dict(d):
    """Print each key/value pair of ``d`` on its own line.

    Keys are left-justified to a width of 10. The ``"claim"`` and ``"text"``
    entries, when present, are printed after all other keys.
    """
    for k, v in d.items():
        if k not in ["claim", "text"]:
            print((k + ":").ljust(10),  v)
    if "claim" in d:
        print(("claim:").ljust(10),  d["claim"])
    if "text" in d:
        print(("text:").ljust(10),  d["text"])


# The helper is defined once above instead of being re-created on every loop
# iteration; the printed report is unchanged.
for claim in claims:
    # claim header: only the basic claim fields
    print("claim")
    print("-----")
    print_shallow_dict({k: v for k, v in claim.__dict__.items() if k in ["id", "claim", "claimant", "date", "label"]})
    print()

    # first two support entries of each related article
    for i, rel_art in enumerate(claim.related_articles.values()):
        print()
        print("{}\nart{} spacy".format(rel_art,i+1))
        print("----------")
        for sup in claim.support[rel_art][:2]:
            print_shallow_dict(sup)
            print()
        print()

    # sentences kept by the msmarco reranker
    print("msmarco")
    print("--------------------")
    for sup in reranked_support_dict_msmarco[claim.id]:
        print_shallow_dict(sup)
        print()
    print()
    print()

    # sentences kept by the sentence-transformer reranker
    print("sentence transformer")
    print("--------------------")
    for sup in reranked_support_dict_st[claim.id]:
        print_shallow_dict(sup)
        print()

    # visual separator between claims
    print("\n\n\n\n")
    print("#"*80)
    print("\n\n\n\n")

claim
-----
id:        0
claimant:  Facebook post
label:     None
date:      2020-05-12 00:00:00
claim:     “Pelosi’s new coronavirus bill allows illegals to receive billions in relief funds in past, current, and future payments.”


https://www.migrationpolicy.org/article/covid19-immigrants-shut-out-federal-relief
art1 spacy
----------
score:     0.912794063169276
text:      the exclusion of nearly 5.5 million u.s. citizens and green-card holders from the cares act stimulus is a point some in congress are seeking to revisit in future pandemic-relief legislation.

score:     0.901594346871941
text:      and for a look at the u.s. citizens and legal permanent residents in mixed-status families who could become eligible for cares act payments under legislation introduced by sen. marco rubio (r-fl) and rep. mario diaz-balart (r-fl), access mpi's u.s. and state estimates mpi estimates that due to the restriction in the cares act, 15.4 million people will be excluded from the stimulus paymen