# Copied From Query.py

In [None]:
import json
import pickle
import random
import argparse
import multiprocessing

import spacy
from tqdm.auto import tqdm

from valerie import search
from valerie.data import Article
from valerie.datasets import name_to_dataset
from valerie.utils import get_logger
from valerie.scoring import validate_predictions_phase2, compute_score_phase2
from valerie.preprocessing import clean_text

In [2]:
_logger = get_logger()
nlp = spacy.load("en_core_web_lg")

In [3]:
def compute_responses_score(responses):
    """Score retrieval responses with the phase-2 scoring functions.

    Two prediction sets are built for every response:

    * "api": the URLs of the first two hits, in the order they were returned.
    * "perfect rerank": the first two hits whose URL appears among the
      claim's ``related_articles`` values.

    Both sets reuse ``claim.label`` as the predicted label and an empty
    explanation. A response whose ``"res"`` is falsy gets no related articles
    in either set.

    Args:
        responses: iterable of dicts with a ``"claim"`` entry and a ``"res"``
            entry (search result holding ``res["hits"]["hits"]``, or a falsy
            value).

    Returns:
        dict with the ``"score"`` and ``"error"`` fields reported by
        ``compute_score_phase2`` for both prediction sets.
    """

    def _ranked(urls):
        # Related articles are keyed by their 1-based rank.
        return {rank: url for rank, url in enumerate(urls, start=1)}

    labels = {}
    predictions = {}
    perfect_predictions = {}

    for response in responses:
        claim = response["claim"]
        res = response["res"]
        labels[claim.id] = claim.to_dict()

        if res:
            hits = res["hits"]["hits"]
            api_related = _ranked([hit["url"] for hit in hits[:2]])
        else:
            api_related = {}
        predictions[claim.id] = {
            "label": claim.label,
            "related_articles": api_related,
            "explanation": "",
        }

        if res:
            # Keep only the hits that match a gold article, then take the top two.
            matching_urls = [
                hit["url"]
                for hit in res["hits"]["hits"]
                if hit["url"] in claim.related_articles.values()
            ]
            perfect_related = _ranked(matching_urls[:2])
        else:
            perfect_related = {}
        perfect_predictions[claim.id] = {
            "label": claim.label,
            "related_articles": perfect_related,
            "explanation": "",
        }

    validate_predictions_phase2(predictions)
    api_result = compute_score_phase2(labels, predictions)
    validate_predictions_phase2(perfect_predictions)
    perfect_result = compute_score_phase2(labels, perfect_predictions)

    return {
        "perfect_rerank_score": perfect_result["score"],
        "perfect_rerank_error": perfect_result["error"],
        "api_score": api_result["score"],
        "api_error": api_result["error"],
    }

def convert_html_hits_to_article(res):
    """Convert raw search hits into parsed articles, at most one per URL.

    Each hit in ``res["hits"]["hits"]`` is parsed with ``Article.from_html``.
    A hit is dropped when its URL was already accepted earlier in the list,
    or when the parsed article has no content or content shorter than 32.

    Args:
        res: search result holding the hit list under ``res["hits"]["hits"]``;
            every hit provides ``"url"``, ``"content"`` and ``"score"``.

    Returns:
        list of ``{"score", "article", "url"}`` dicts in the original hit order.
    """
    accepted_urls = set()
    converted = []

    for hit in res["hits"]["hits"]:
        url = hit["url"]
        if url not in accepted_urls:
            article = Article.from_html(url, hit["content"], url=url)
            has_enough_text = article.content and len(article.content) >= 32
            if has_enough_text:
                converted.append(
                    {"score": hit["score"], "article": article, "url": url}
                )
                # Only URLs that produced a usable article count as seen.
                accepted_urls.add(url)

    return converted


def pipeline(claim):
    """Run one claim through query expansion, search and hit conversion.

    The query is produced by whichever ``query_expansion`` is currently
    defined at module level. When the search returns a truthy result, its
    hit list is replaced in place by the converted article entries.

    Args:
        claim: the claim to search for.

    Returns:
        tuple ``(claim, query, res)`` where ``res`` is the (possibly falsy)
        search result.
    """
    expanded_query = query_expansion(claim)
    search_result = search.query(expanded_query)

    if not search_result:
        # Nothing to convert; hand the falsy result back unchanged.
        return claim, expanded_query, search_result

    search_result["hits"]["hits"] = convert_html_hits_to_article(search_result)
    return claim, expanded_query, search_result

In [8]:
def main(dataset_name="Phase2ValidationDataset", truncate=None, nproc=8):
    """Fetch search responses for every claim of a dataset and log the scores.

    Args:
        dataset_name: key into ``name_to_dataset`` selecting the dataset class.
        truncate: when truthy, score only a random sample of this many claims.
        nproc: number of worker processes used to run ``pipeline``.

    Returns:
        None. The number of missed queries and the scores are logged.
    """
    dataset_class = name_to_dataset[dataset_name]
    claims = dataset_class.from_raw().claims
    if truncate:
        claims = random.sample(claims, k=truncate)

    responses = []
    # Use the pool as a context manager so the worker processes are torn down
    # when the block exits, even on error. Previously the pool was never
    # closed, so every call left `nproc` workers behind. All results are
    # consumed inside the block, so terminating on exit loses nothing.
    with multiprocessing.Pool(nproc) as pool:
        for claim, query, res in tqdm(
            pool.imap_unordered(pipeline, claims),
            total=len(claims),
            desc="fetching responses",
        ):
            responses.append({"claim": claim, "res": res, "query": query})

    # NOTE: responses are not persisted; the former pickle dump to an output
    # file is disabled in this notebook.

    _logger.warning("Missed Queries: %d", sum(1 for v in responses if v["res"] is None))
    _logger.info("Scores: %s", json.dumps(compute_responses_score(responses), indent=2))

# No Query Expansion (QE) — baseline

In [9]:
def query_expansion(claim):
    """Baseline: use the claim text as the search query, unmodified.

    Args:
        claim: object exposing the claim text as ``claim.claim``.

    Returns:
        The value of ``claim.claim``.
    """
    raw_text = claim.claim
    return raw_text

In [10]:
main()

HBox(children=(FloatProgress(value=0.0, description='Phase2ValidationDataset to claims', max=500.0, style=Prog…


[2020-07-11 19:19:04,859] INFO:valerie.datasets: missed row to claim conversions: 0
[2020-07-11 19:19:04,859] INFO:valerie.datasets: Phase2ValidationDataset claims set change 500 --> 500


HBox(children=(FloatProgress(value=0.0, description='fetching responses', max=500.0, style=ProgressStyle(descr…


[2020-07-11 19:21:07,370] INFO:root: Scores: {
  "perfect_rerank_score": 1.0441553640250065,
  "perfect_rerank_error": "'None'",
  "api_score": 0.5477855697093302,
  "api_error": "'None'"
}


# Stopword Removal

In [11]:
def query_expansion(claim):
    """Build the search query from the claim text with stopwords removed.

    Args:
        claim: object exposing the claim text as ``claim.claim``.

    Returns:
        The non-stopword token texts joined by single spaces.
    """
    # The listed pipeline components are switched off for this call.
    doc = nlp(claim.claim, disable=["textcat", "tagger", "parser", "ner"])

    kept_words = []
    for token in doc:
        if token.is_stop:
            continue
        word = token.text
        if word:
            kept_words.append(word)

    return " ".join(kept_words)

In [12]:
main()

HBox(children=(FloatProgress(value=0.0, description='Phase2ValidationDataset to claims', max=500.0, style=Prog…


[2020-07-11 19:22:50,656] INFO:valerie.datasets: missed row to claim conversions: 0
[2020-07-11 19:22:50,657] INFO:valerie.datasets: Phase2ValidationDataset claims set change 500 --> 500


HBox(children=(FloatProgress(value=0.0, description='fetching responses', max=500.0, style=ProgressStyle(descr…


[2020-07-11 19:24:43,175] INFO:root: Scores: {
  "perfect_rerank_score": 1.0108953197043262,
  "perfect_rerank_error": "'None'",
  "api_score": 0.5283960558501345,
  "api_error": "'None'"
}


# Stopword Removal + Clean Text

In [13]:
def query_expansion(claim):
    """Build the search query: stopword removal followed by ``clean_text``.

    Args:
        claim: object exposing the claim text as ``claim.claim``.

    Returns:
        ``clean_text`` applied to the space-joined surviving tokens.
    """
    # The listed pipeline components are switched off for this call.
    doc = nlp(claim.claim, disable=["textcat", "tagger", "parser", "ner"])
    non_stop_words = [token.text for token in doc if not token.is_stop]

    kept_words = []
    for word in non_stop_words:
        # Drop empty tokens and tokens whose
        # `clean_text(..., remove_punctuation=True)` form is empty.
        if word and len(clean_text(word, remove_punctuation=True)) != 0:
            kept_words.append(word)

    return clean_text(" ".join(kept_words))

In [14]:
main()

HBox(children=(FloatProgress(value=0.0, description='Phase2ValidationDataset to claims', max=500.0, style=Prog…


[2020-07-11 19:24:43,485] INFO:valerie.datasets: missed row to claim conversions: 0
[2020-07-11 19:24:43,486] INFO:valerie.datasets: Phase2ValidationDataset claims set change 500 --> 500


HBox(children=(FloatProgress(value=0.0, description='fetching responses', max=500.0, style=ProgressStyle(descr…


[2020-07-11 19:26:53,644] INFO:root: Scores: {
  "perfect_rerank_score": 1.1531107315227402,
  "perfect_rerank_error": "'None'",
  "api_score": 0.6130484828884987,
  "api_error": "'None'"
}


# Claimant + Stopword Removal + Clean Text

In [15]:
def query_expansion(claim):
    """Build the search query: cleaned non-stopword claim text plus claimant.

    Args:
        claim: object exposing ``claim.claim`` (text) and ``claim.claimant``.

    Returns:
        The cleaned query, followed by a space and the claimant when the
        claimant is truthy.
    """
    # The listed pipeline components are switched off for this call.
    doc = nlp(claim.claim, disable=["textcat", "tagger", "parser", "ner"])
    non_stop_words = [token.text for token in doc if not token.is_stop]

    def _survives_cleaning(word):
        # Reject empty tokens and tokens whose
        # `clean_text(..., remove_punctuation=True)` form is empty.
        return word and len(clean_text(word, remove_punctuation=True)) != 0

    kept_words = [word for word in non_stop_words if _survives_cleaning(word)]
    query = clean_text(" ".join(kept_words))

    if not claim.claimant:
        return query

    # The claimant is appended as-is, without going through clean_text.
    return query + " " + claim.claimant

In [16]:
main()

HBox(children=(FloatProgress(value=0.0, description='Phase2ValidationDataset to claims', max=500.0, style=Prog…


[2020-07-11 19:26:53,963] INFO:valerie.datasets: missed row to claim conversions: 0
[2020-07-11 19:26:53,964] INFO:valerie.datasets: Phase2ValidationDataset claims set change 500 --> 500


HBox(children=(FloatProgress(value=0.0, description='fetching responses', max=500.0, style=ProgressStyle(descr…


[2020-07-11 19:29:06,245] INFO:root: Scores: {
  "perfect_rerank_score": 1.168343222165067,
  "perfect_rerank_error": "'None'",
  "api_score": 0.6395182603952109,
  "api_error": "'None'"
}


# Date + Stopword Removal + Clean Text

In [17]:
def query_expansion(claim):
    """Build the search query: cleaned non-stopword claim text plus date.

    Args:
        claim: object exposing ``claim.claim`` (text) and ``claim.date``.

    Returns:
        The cleaned query, followed by a space and the leading part of
        ``claim.date`` when the date is truthy.
    """
    # The listed pipeline components are switched off for this call.
    doc = nlp(claim.claim, disable=["textcat", "tagger", "parser", "ner"])
    non_stop_words = [token.text for token in doc if not token.is_stop]

    kept_words = []
    for word in non_stop_words:
        if not word:
            continue
        # Skip tokens whose `clean_text(..., remove_punctuation=True)` form is empty.
        if len(clean_text(word, remove_punctuation=True)) == 0:
            continue
        kept_words.append(word)

    query = clean_text(" ".join(kept_words))

    if claim.date:
        # Keep only what precedes the first space, then the first "T"
        # (presumably the calendar-date part of a timestamp; confirm format).
        before_space = claim.date.split(" ")[0]
        date_part = before_space.split("T")[0]
        query = query + " " + date_part

    return query

In [18]:
main()

HBox(children=(FloatProgress(value=0.0, description='Phase2ValidationDataset to claims', max=500.0, style=Prog…


[2020-07-11 19:29:06,438] INFO:valerie.datasets: missed row to claim conversions: 0
[2020-07-11 19:29:06,439] INFO:valerie.datasets: Phase2ValidationDataset claims set change 500 --> 500


HBox(children=(FloatProgress(value=0.0, description='fetching responses', max=500.0, style=ProgressStyle(descr…


[2020-07-11 19:31:13,885] INFO:root: Scores: {
  "perfect_rerank_score": 1.1414155993456803,
  "perfect_rerank_error": "'None'",
  "api_score": 0.6440130304440003,
  "api_error": "'None'"
}


# Claimant + Date + Stopword Removal + Clean Text

In [19]:
def query_expansion(claim):
    """Build the search query: cleaned claim text, then date, then claimant.

    Args:
        claim: object exposing ``claim.claim`` (text), ``claim.date`` and
            ``claim.claimant``.

    Returns:
        The cleaned non-stopword query, with the leading part of
        ``claim.date`` and then ``claim.claimant`` appended (space-separated)
        whenever each is truthy.
    """
    # The listed pipeline components are switched off for this call.
    doc = nlp(claim.claim, disable=["textcat", "tagger", "parser", "ner"])
    non_stop_words = [token.text for token in doc if not token.is_stop]

    # Drop empty tokens and tokens whose
    # `clean_text(..., remove_punctuation=True)` form is empty.
    kept_words = [
        word
        for word in non_stop_words
        if word and len(clean_text(word, remove_punctuation=True)) != 0
    ]
    query = clean_text(" ".join(kept_words))

    date_value = claim.date
    if date_value:
        # Keep only what precedes the first space, then the first "T"
        # (presumably the calendar-date part of a timestamp; confirm format).
        date_part = date_value.split(" ")[0].split("T")[0]
        query = query + " " + date_part

    claimant = claim.claimant
    if claimant:
        # The claimant is appended as-is, without going through clean_text.
        query = query + " " + claimant

    return query

In [20]:
main()

HBox(children=(FloatProgress(value=0.0, description='Phase2ValidationDataset to claims', max=500.0, style=Prog…


[2020-07-11 19:31:14,105] INFO:valerie.datasets: missed row to claim conversions: 0
[2020-07-11 19:31:14,106] INFO:valerie.datasets: Phase2ValidationDataset claims set change 500 --> 500


HBox(children=(FloatProgress(value=0.0, description='fetching responses', max=500.0, style=ProgressStyle(descr…


[2020-07-11 19:33:25,746] INFO:root: Scores: {
  "perfect_rerank_score": 1.1557801493610302,
  "perfect_rerank_error": "'None'",
  "api_score": 0.6617083840074162,
  "api_error": "'None'"
}
