In [1]:
from pipeline import *
from pipeline import _logger, _sequence_classification

[2020-07-18 02:08:24,512] INFO:root: 
[2020-07-18 02:08:24,514] INFO:root: ................................................................................
[2020-07-18 02:08:24,514] INFO:root: ................................ loading spacy .................................
[2020-07-18 02:08:24,515] INFO:root: ................................................................................
[2020-07-18 02:08:24,516] INFO:root: 


In [2]:
import types

# Stand-in for the command-line arguments: every path and tuning knob the
# cells below read lives on this one namespace object.
parser_args = types.SimpleNamespace(
    metadata_file="data/phase2-validation-100/raw/metadata.json",
    predictions_file="logs/predictions.json",
    claimant_model_file="models/claimant_model.json",
    fnc_model_dir="models/phase2/single-claim-claimant-date/roberta-large-combined-128-2",
    rerank_model_dir="models/rerank/castroni/monobert-large-msmarco",
    predict_batch_size=128,
    nproc=8,
)

In [5]:
# Unbind the existing `rerank_hits` name before this cell redefines it below.
# NOTE(review): the name is presumably bound by `from pipeline import *` --
# confirm; if it is not bound on a fresh kernel this statement raises NameError.
del rerank_hits

def generate_rerank_examples(claims):
    """Build one rerank example for every (claim, hit article) pair.

    For each hit of each claim, the texts of the claim's support sentences
    for that article are joined with single spaces, passed through
    ``clean_text``, stored on the article as ``article.text_b`` and used as
    the second text of the example.

    Args:
        claims: iterable of claims exposing ``id``, ``hits``, ``support`` and
            ``text_a.text``. Each hit is indexed with ``"article"``, and
            ``claim.support`` is indexed by ``article.id``.

    Returns:
        list of ``SequenceClassificationExample`` in claim order, then hit
        order, with ``guid`` set to the claim id and ``art_id`` to the
        article id.
    """
    examples = []
    for claim in tqdm(claims, desc="generating rerank examples"):
        for hit in claim.hits:
            article = hit["article"]

            # Second text = this claim's support sentences for the article.
            support_sentences = claim.support[article.id]
            joined_support = " ".join([s["text"] for s in support_sentences])
            # Side effect kept from the original: the text is also stored on
            # the article object itself.
            article.text_b = clean_text(joined_support)

            example = SequenceClassificationExample(
                guid=claim.id,
                text_a=claim.text_a.text,
                text_b=article.text_b,
                art_id=article.id,
            )
            examples.append(example)
    return examples

def rerank_hits(claims, rerank_model_dir, predict_batch_size, keep_top_n, nproc):
    """Score every (claim, hit article) pair and keep the best articles per claim.

    Args:
        claims: claims to rerank; forwarded to ``generate_rerank_examples``
            and each must expose ``id``.
        rerank_model_dir: model directory forwarded to
            ``_sequence_classification``.
        predict_batch_size: batch size forwarded to
            ``_sequence_classification``.
        keep_top_n: maximum number of articles kept per claim.
        nproc: forwarded to ``_sequence_classification``.

    Returns:
        dict mapping every claim id to a dict ``{art_id: score}`` with at most
        ``keep_top_n`` entries, inserted from highest to lowest score. A claim
        with no rerank examples maps to an empty dict.

    Raises:
        ValueError: if the number of predictions differs from the number of
            examples.
    """
    examples = generate_rerank_examples(claims)
    _logger.info(
        "first 5 rerank examples:\n%s",
        json.dumps([example.__dict__ for example in examples[:5]], indent=2),
    )

    probabilities = _sequence_classification(
        examples, rerank_model_dir, predict_batch_size=predict_batch_size, nproc=nproc,
    )

    if len(probabilities) != len(examples):
        raise ValueError(
            "len predictions ({}) != len examples ({})".format(
                len(probabilities), len(examples)
            )
        )

    # Seed with every claim id instead of using collections.defaultdict(list):
    # a claim may have had a null response when queried, which means it has no
    # rerank examples, and it must still appear in the result (as empty).
    hits_by_claim = {claim.id: [] for claim in claims}
    # zip() has no length, so pass the total explicitly; without it tqdm can
    # only show a counter, not a progress bar.
    for example, proba in tqdm(
        zip(examples, probabilities), total=len(examples), desc="collecting rerank scores"
    ):
        # Index 1 is used as the relatedness score of the example.
        hits_by_claim[example.guid].append(
            {"art_id": example.art_id, "score": float(proba[1])}
        )

    rerank_hits_dict = {}
    for claim_id, hits in hits_by_claim.items():
        # nlargest returns the hits sorted from best to worst score.
        top_n_hits = heapq.nlargest(keep_top_n, hits, key=lambda x: x["score"])
        rerank_hits_dict[claim_id] = {x["art_id"]: x["score"] for x in top_n_hits}

    return rerank_hits_dict

In [10]:
#####################
### read metadata ###
#####################
# Load the claims from the configured metadata file and log a short preview.
log_title(_logger, "reading claims from {}".format(parser_args.metadata_file))
claims = get_claims(metadata_file=parser_args.metadata_file)

claim_preview = []
for i, claim in enumerate(claims):
    if i >= 5:
        break
    claim_preview.append("\n{}\n".format(claim.logstr()))
log_msg = "".join(claim_preview)
_logger.info("first 5 claims:\n%s", log_msg)

######################
### process claims ###
######################
# Attach a doc and a text_a to every claim, logging the first five of each.
log_title(_logger, "process claims")

claim_docs_dict = generate_claim_docs_dict(claims=claims)
doc_preview = []
for i, claim in enumerate(claims):
    claim.doc = claim_docs_dict[claim.id]
    if i < 5:
        doc_preview.append(
            "\nclaim_id = {}\n{}\n".format(claim.id, claim.doc.text)
        )
log_msg = "".join(doc_preview)
_logger.info("first 5 claim doc texts:\n%s", log_msg)

claim_text_a_dict = generate_text_a_dict(claims=claims)
text_a_preview = []
for i, claim in enumerate(claims):
    claim.text_a = claim_text_a_dict[claim.id]
    if i < 5:
        text_a_preview.append(
            "\nclaim_id = {}\n{}\n".format(claim.id, claim.text_a)
        )
log_msg = "".join(text_a_preview)
_logger.info("first 5 claim text_a:\n%s", log_msg)

#######################
### fetch responses ###
#######################
# Fetch a query and a response per claim, attach both to the claim, and
# count how many claims came back with a falsy (null) response.
log_title(_logger, "fetching query responses")
queries, responses = get_responses(claims=claims, nproc=parser_args.nproc)

query_preview = []
null_responses = 0
for i, claim in enumerate(claims):
    claim.query = queries[claim.id]
    claim.res = responses[claim.id]

    if i < 5:
        query_preview.append(
            "\nclaim_id = {}\n{}\n".format(claim.id, claim.query)
        )

    if not claim.res:
        null_responses += 1
log_msg = "".join(query_preview)

_logger.info("  num_claims:         %d", len(claims))
_logger.info("  null_responses:     %d", null_responses)
_logger.info("first 5 generated queries:\n%s", log_msg)

########################
### process articles ###
########################
# Collect the articles referenced by the claims, attach each claim's hits,
# then attach a doc to every article.
log_title(_logger, "process related articles")
articles_dict = get_articles_dict(claims=claims)
_logger.info("  num articles in articles_dict:  %d", len(articles_dict))

hits_dict = get_hits_dict(claims=claims, articles_dict=articles_dict)
hits_preview = []
num_total_hits = 0
no_hits_claims = 0
for i, claim in enumerate(claims):
    claim.hits = hits_dict[claim.id]
    num_total_hits += len(claim.hits)
    if i < 5:
        # Only score and url of the first five hits go into the log.
        top_hits = [
            {"score": hit["score"], "url": hit["url"]} for hit in claim.hits[:5]
        ]
        hits_preview.append(
            "\nclaim_id = {}\n{}\n".format(claim.id, json.dumps(top_hits, indent=2))
        )
    if not claim.hits:
        no_hits_claims += 1
log_msg = "".join(hits_preview)

_logger.info("  num_total_hits:     %d", num_total_hits)
_logger.info("  no_hits_claims:     %d", no_hits_claims)
_logger.info("first 5 claim hits (resticted to top 5 hits per claim):\n%s", log_msg)

article_docs_dict = generate_article_docs_dict(articles=articles_dict.values())
for article in articles_dict.values():
    article.doc = article_docs_dict[article.id]

########################
### generate support ###
########################
# Attach the support mapping to every claim and log a trimmed preview
# (first 3 entries, first 3 items of each) for the first five claims.
log_title(_logger, "generating support")
support_dict = generate_support_dict(claims=claims, keep_top_n_sentences=32)

support_preview = []
for i, claim in enumerate(claims):
    claim.support = support_dict[claim.id]
    if i < 5:
        first_articles = list(claim.support.items())[:3]
        trimmed_support = {k: v[:3] for k, v in first_articles}
        support_preview.append(
            "\nclaim_id = {}\n{}\n".format(
                claim.id, json.dumps(trimmed_support, indent=2)
            )
        )
log_msg = "".join(support_preview)
_logger.info(
    "first 5 claim support (top 3 articles, top 3 sentences):\n%s", log_msg
)

##############
### rerank ###
##############
# Rerank each claim's hits and keep only the two best articles per claim.
log_title(_logger, "rerank and select top 2 articles")
rerank_hits_dict = rerank_hits(
    claims=claims,
    rerank_model_dir=parser_args.rerank_model_dir,
    predict_batch_size=parser_args.predict_batch_size,
    keep_top_n=2,  # make sure we only keep the top two results
    nproc=parser_args.nproc,
)
log_msg = ""
for i, claim in enumerate(claims):
    # rerank_hits (defined in this notebook) returns {art_id: score} per claim.
    # NOTE(review): sequence_classification comes from `pipeline` and may
    # expect a different schema for claim.related_articles -- confirm.
    claim.related_articles = rerank_hits_dict[claim.id]
    if i < 5:
        # Iterate the keys (article ids) and let enumerate supply the rank.
        # Unpacking .items() as (j, article_id) would bind article_id to the
        # score and index articles_dict with a float.
        for j, article_id in enumerate(claim.related_articles):
            log_msg += "\nclaim_id = {}\narticle #={}\n{}\n".format(
                claim.id, str(j), articles_dict[article_id].logstr(),
            )
_logger.info("first 5 claims chosen reranked articles:\n%s", log_msg)

###############################
### sequence classification ###
###############################
# Run the sequence classification model over all claims and log the
# prediction and explanation of the first five.
log_title(_logger, "generating sequence classification predictions")
seq_clf_predictions, seq_clf_explanations = sequence_classification(
    claims=claims,
    fnc_model_dir=parser_args.fnc_model_dir,
    predict_batch_size=parser_args.predict_batch_size,
    nproc=parser_args.nproc,
)
log_msg = ""
for i, claim in enumerate(claims):
    # `>= 5` stops after indices 0..4; `> 5` let a sixth claim through,
    # contradicting the "first 5" log message.
    if i >= 5:
        break
    log_msg += "\nclaim_id = {}\npred: {}\nexplanation: {}\n".format(
        claim.id,
        str(seq_clf_predictions[claim.id]),
        seq_clf_explanations[claim.id],
    )
_logger.info("first 5 sequence classification output results:\n%s", log_msg)

###############################
### claimant classification ###
###############################
# Run the claimant model over all claims and log the prediction and
# explanation of the first five.
log_title(_logger, "generating claimant predictions")
claimant_predictions, claimant_explanations = claimant_classification(
    claims=claims, claimant_model_file=parser_args.claimant_model_file
)
log_msg = ""
for i, claim in enumerate(claims):
    # `>= 5` stops after indices 0..4; `> 5` let a sixth claim through,
    # contradicting the "first 5" log message.
    if i >= 5:
        break
    log_msg += "\nclaim_id = {}\npred: {}\nexplanation: {}\n".format(
        claim.id,
        str(claimant_predictions[claim.id]),
        claimant_explanations[claim.id],
    )
_logger.info("first 5 claimant classification output results:\n%s", log_msg)

In [8]:
# Read the gold labels and copy them onto the claim objects. The JSON object
# keys are strings, so they are converted to int before being looked up by
# claim.id.
with open("data/phase2-validation-100/raw/labels.json") as fi:
    raw_labels = json.load(fi)
labels_json = {int(claim_id): entry for claim_id, entry in raw_labels.items()}

for claim in claims:
    claim_labels = labels_json[claim.id]
    claim.label = claim_labels["label"]
    claim.related_articles_labels = claim_labels["related_articles"]

In [11]:
# Order claims by their best rerank score, highest first. A claim with no
# related articles (empty dict) would make a bare max() raise ValueError, so
# it gets -inf and sorts to the end instead.
claims = sorted(
    claims,
    key=lambda x: max(x.related_articles.values(), default=float("-inf")),
    reverse=True,
)

In [41]:
def print_shallow_dict(d):
    """Print a claim's flat fields, then its related articles with support.

    Every key except ``related_articles``, ``related_articles_labels`` and
    ``support`` is printed as ``key:`` (left-justified to 10 characters)
    followed by its value. After a blank line, each entry of
    ``d["related_articles"]`` is printed as ``score: art_id`` above a
    100-dash rule, followed by the first two items of
    ``d["support"][art_id]`` as ``score - text``.

    Args:
        d: dict with string keys that contains at least ``related_articles``
            (art_id -> score) and ``support`` (art_id -> list of dicts with
            ``score`` and ``text``).
    """
    nested_keys = ["related_articles", "related_articles_labels", "support"]

    for field, value in d.items():
        if field in nested_keys:
            continue
        print((field + ":").ljust(10), value)
    print()

    for art_id, art_score in d["related_articles"].items():
        print("{:.2f}: {}".format(art_score, art_id))
        # Rule line plus one blank line.
        print("-" * 100, end="\n\n")
        for sentence in d["support"][art_id][:2]:
            # Each support line is followed by one blank line.
            print("{:.2f} - {}".format(sentence["score"], sentence["text"]), end="\n\n")
        # Two blank lines separate consecutive articles.
        print(end="\n\n")

In [42]:
# Print every claim, restricted to the fields worth showing, with three
# blank lines between consecutive claims.
shown_fields = [
    "claim",
    "claimant",
    "date",
    "id",
    "related_articles_labels",
    "label",
    "support",
    "related_articles",
]
for claim in claims:
    shallow_claim = {
        k: v for k, v in claim.__dict__.items() if k in shown_fields
    }
    print_shallow_dict(shallow_claim)
    print("\n\n")

id:        3
claim:     North Carolina is "in the small minority of states that requires an absentee ballot to be signed by two witnesses or a notary public."
claimant:  Jay Chaudhuri
label:     2
date:      2020-04-10 00:00:00

4.03: https://medium.com/@leslieedwardsrudd/berger-sends-the-wrong-message-on-election-protection-e5bbeb9c9009
----------------------------------------------------------------------------------------------------

0.96 - north carolina is in the small minority of states that requires an absentee ballot to be signed by two witnesses or a notary public.

0.88 - republicans send wrong message on protecting elections by jay chaudhuri (d — wake) and natasha marcus (d — mecklenburg) in north carolina, we’re carrying out a comprehensive and aggressive strategy against covid-19 that’s saving lives and keeping our citizens safe.



3.39: https://www.elitedaily.com/p/is-there-voter-fraud-in-north-carolina-these-reports-are-complicated-13244080
----------------------------