# Model 02: Fast evidence shortlisting by information extraction

Same as model 02a, with IDF scoring added.

**This is the final implementation for the Shortlisting Stage**

References:
- [Universal dependencies scheme](https://universaldependencies.org/u/pos/)

In [None]:
# Change the working directory to project root: the nearest directory,
# starting at the current one and walking upwards, that contains "src".
from pathlib import Path
import os
ROOT_DIR = Path.cwd()
while not ROOT_DIR.joinpath("src").exists():
    # The filesystem root is its own parent, so without this check the loop
    # would spin forever when no ancestor contains a "src" entry.
    if ROOT_DIR.parent == ROOT_DIR:
        raise FileNotFoundError(
            f"no 'src' directory found in {Path.cwd()} or any of its parents"
        )
    ROOT_DIR = ROOT_DIR.parent
os.chdir(ROOT_DIR)

In [None]:
# Dependencies
import json
from sklearn.model_selection import ParameterGrid
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
from typing import List, Dict, DefaultDict
from dataclasses import dataclass
from tqdm import tqdm
from multiprocessing.pool import ThreadPool as Pool
from math import floor

from src.data import load_as_dataframe, slice_by_claim, SetEncoder
from src.normalize import normalize_pipeline
from src.ner import \
    train_noun_relations, \
    get_evidence_by_noun, \
    retrieve_claim_evidence_by_noun, \
    view_claim_noun_phrases

In [None]:
# Load the spaCy pipeline that the rest of the notebook uses to obtain
# lemmas, POS tags and dependency labels for claims and evidences.
import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")
# Bare expression: the notebook shows the pipeline's component names as the
# cell output.
nlp.pipe_names

In [None]:
# Parse a one-word sample document so its POS tag can be inspected.
test = nlp("there")

In [None]:
# Print the POS tag (`pos_`) of every token in the sample document.
for t in test:
    print(t.pos_)

# Load datasets

In [None]:
# Load the train claims, dev claims and the evidence collection, in the
# order given by data_names.
data_names = ["train-claims", "dev-claims", "evidence"]
(train_claims, dev_claims, all_evidences) = load_as_dataframe(
    data_names,
    full_evidence=True,
)

## Visualise dependencies

In [None]:
# Select the rows of one dev claim and reset the index so a single row can
# be picked by position through pair_index.
# NOTE(review): presumably one row per (claim, evidence) pair - confirm
# against load_as_dataframe.
pairs = dev_claims.loc["claim-1896"].reset_index()
pair_index = 0
# Bare expression: the notebook shows the selected rows as the cell output.
pairs

In [None]:
# Normalise and parse the claim text and the evidence text of the selected
# row, then render the dependency parse of each document.
claim_doc = nlp(normalize_pipeline(pairs.iloc[pair_index]["claim_text"]))
evidence_doc = nlp(normalize_pipeline(pairs.iloc[pair_index]["evidence_text"]))
displacy.render(claim_doc, style="dep")
displacy.render(evidence_doc, style="dep")

## Info Tag extraction

In [None]:
@dataclass
class InfoTag:
    """
    A keyword (info tag) extracted from a document.

    `tag` holds the keyword text. `verb_pos` is an integer position that
    defaults to 0; get_info_tags fills it with its running verb counter.
    """

    tag: str
    verb_pos: int = 0

In [None]:
def get_info_tags(doc, go_nouns:List[str] = None) -> List[InfoTag]:
    """
    Gets info tags (keywords) from a spacy doc, optionally taking
    a set of go nouns that must be included regardless of its POS tag.

    Tokens are visited in document order. A token whose lemma has already
    been absorbed into an earlier tag is skipped entirely (it is not even
    counted as a verb). Every other token may yield several tags:

    - its lemma, when the lemma is in `go_nouns` or the token is a noun or
      proper noun;
    - for a proper noun, its lemma joined with the lemmas of the heads reached
      by following "compound" dependencies upwards;
    - for a noun that is a compound modifier of another noun,
      "<lemma> <head lemma>";
    - for an adjective that is an "amod" of a noun which is not itself a
      compound modifier, "<lemma> <head lemma>";
    - for an adjective whose head is a verb or a noun, its lemma.

    Args:
        doc: iterable of spaCy-style tokens; only the `lemma_`, `pos_`,
            `dep_` and `head` attributes of each token are read.
        go_nouns: lemmas that are always tagged. None (the default) means
            no whitelist.

    Returns:
        One InfoTag per distinct tag text, in order of first appearance.
        `verb_pos` is the number of VERB tokens counted up to and including
        the token that produced the tag.
    """
    # A None default avoids sharing one mutable list between calls.
    if go_nouns is None:
        go_nouns = []

    info_tags = []
    # Sets give O(1) membership tests; info_tags alone carries the ordering.
    seen_lemmas = set()
    seen_tags = set()
    verb_pos = 0

    def add_tag(tag_txt:str):
        # Record a tag once, stamped with the verb count at the time of the call.
        if tag_txt not in seen_tags:
            seen_tags.add(tag_txt)
            info_tags.append(InfoTag(tag=tag_txt, verb_pos=verb_pos))

    for token in doc:
        lemma = token.lemma_

        # Skip lemmas already consumed by an earlier (merged) tag
        if lemma in seen_lemmas:
            continue

        pos = token.pos_

        # Increment the relative verb position
        if pos == "VERB":
            verb_pos += 1

        # Include it if it is in the whitelist of go_nouns
        if lemma in go_nouns:
            add_tag(lemma)

        # Include it if it is a (proper)noun
        if pos in ("PROPN", "NOUN"):
            add_tag(lemma)

        # Merge proper noun compounds by climbing the chain of compound heads
        if pos == "PROPN":
            parts = [lemma]
            seen_lemmas.add(lemma)
            current = token
            while current.dep_ == "compound":
                current = current.head
                parts.append(current.lemma_)
                seen_lemmas.add(current.lemma_)
            add_tag(" ".join(parts))

        # Merge noun compounds
        if (
            pos == "NOUN"
            and token.dep_ == "compound"
            and token.head.pos_ == "NOUN"
        ):
            head_lemma = token.head.lemma_
            seen_lemmas.update((lemma, head_lemma))
            add_tag(" ".join((lemma, head_lemma)))

        # Merge Nouns with adjective modifiers
        if (
            pos == "ADJ"
            and token.dep_ == "amod"
            and token.head.pos_ == "NOUN"
            and token.head.dep_ != "compound"
        ):
            head_lemma = token.head.lemma_
            seen_lemmas.update((lemma, head_lemma))
            add_tag(" ".join((lemma, head_lemma)))

        # Adjectives linked to verbs and nouns
        if pos == "ADJ" and token.head.pos_ in ("VERB", "NOUN"):
            seen_lemmas.add(lemma)
            add_tag(lemma)

    return info_tags


In [None]:
# Extract the info tags of the parsed claim; the bare expression shows them
# as the cell output.
claim_tags = get_info_tags(claim_doc)
claim_tags

In [None]:
# Extract the info tags of the parsed evidence; the bare expression shows
# them as the cell output.
evidence_tags = get_info_tags(evidence_doc)
evidence_tags

## Training

In [None]:
# Directory templates: the trailing "*" is a placeholder file name that
# later cells swap for a real one through Path.with_name(...).
DATA_PATH = ROOT_DIR / "data" / "*"
SAVE_PATH = ROOT_DIR / "result" / "*"

### Info tag pipeline

In [None]:
def info_tag_pipeline(
    text:str,
    return_doc:bool=False
) -> List[InfoTag]:
    """
    Normalises `text`, parses it with the spaCy pipeline and extracts its
    info tags (keywords).

    Returns the list of tags on its own by default; when `return_doc` is
    true, returns a `(tags, doc)` pair so the caller can reuse the parse.
    """
    doc = nlp(normalize_pipeline(text))
    tags = get_info_tags(doc)
    return (tags, doc) if return_doc else tags

### Tag evidences

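Creates the keyword-evidence index: keywords are extracted from each evidence using POS tags and dependency labels, and every keyword is mapped to the set of evidences it was extracted from.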

In [None]:
def tag_evidences(
    evidence_path:Path,
    save_path:Path = None,
    processes:int = 8,
    verbose:bool = True
) -> DefaultDict[str, set]:
    """
    Builds the keyword-evidence index: for every evidence in the JSON file at
    `evidence_path`, extracts its info tags and records the evidence id under
    each tag text.

    Args:
        evidence_path: JSON file holding an object of evidence id -> text.
        save_path: when given, the index is also written there as JSON.
        processes: currently unused; kept so existing callers keep working.
        verbose: show a progress bar when true.

    Returns:
        Mapping of tag text -> set of ids of the evidences carrying that tag.
    """
    # Explicit encoding so the result does not depend on the platform default
    with open(evidence_path, mode="r", encoding="utf-8") as f:
        evidences = json.load(f)

    # Cumulator
    evidence_tags = defaultdict(set)

    evidences_iter = tqdm(evidences.items(), desc="evidences", disable=not verbose)
    for evidence_id, evidence_text in evidences_iter:

        tags = info_tag_pipeline(text=evidence_text)

        for tag in tags:
            evidence_tags[tag.tag].add(evidence_id)

        evidences_iter.postfix = f"n_tags: {len(tags)}"

    if save_path:
        # SetEncoder is the project's JSON encoder for the set values
        with open(save_path, mode="w", encoding="utf-8") as f:
            json.dump(obj=evidence_tags, fp=f, cls=SetEncoder)
        # Report only once the file has been closed
        print(f"saved to: {save_path}")

    return evidence_tags


In [None]:
tagged_evidences_path = SAVE_PATH.with_name("tagged_evidences.json")

# Build and save the keyword-evidence index only when no saved copy exists.
# An existing copy is just reported, not loaded into this notebook session.
if not tagged_evidences_path.exists():
    tagged_evidences = tag_evidences(
        evidence_path=DATA_PATH.with_name("evidence.json"),
        save_path=tagged_evidences_path,
    )
else:
    print(f"existing found: {tagged_evidences_path}")

### Tag evidences with char n-gram

Create forward (leading) and reverse (trailing) character n-grams from each token's lemma, to be used as keywords in the keyword-evidence index.

In [None]:
def get_bidirectional_n_grams(doc, n:int=4):
    """
    Returns `(fwd_ngrams, rev_ngrams)`: the first `n` and the last `n`
    characters of every token lemma in `doc` that has at least `n`
    characters, both lists in token order.
    """
    lemmas = [token.lemma_ for token in doc]
    long_enough = [lemma for lemma in lemmas if len(lemma) >= n]
    leading = [lemma[:n] for lemma in long_enough]
    trailing = [lemma[-n:] for lemma in long_enough]
    return leading, trailing

In [None]:
def get_ngram_evidences(
    evidence_path:Path,
    n_list:list = None,
    save_path_fwd:Path = None,
    save_path_rev:Path = None,
    verbose:bool = True
):
    """
    Builds two character n-gram indexes over the evidences stored in the JSON
    file at `evidence_path`: one keyed by the leading n-grams of the token
    lemmas, one keyed by the trailing n-grams.

    Args:
        evidence_path: JSON file holding an object of evidence id -> text.
        n_list: n-gram lengths to index; defaults to [4, 5, 6].
        save_path_fwd: when given, the forward index is written there as JSON.
        save_path_rev: when given, the reverse index is written there as JSON.
        verbose: show a progress bar when true.

    Returns:
        `(fwd_evidence_ngrams, rev_evidence_ngrams)`, each mapping an n-gram
        to the set of ids of the evidences containing it.
    """
    # A None default avoids sharing one mutable list between calls.
    if n_list is None:
        n_list = [4, 5, 6]

    # Explicit encoding so the result does not depend on the platform default
    with open(evidence_path, mode="r", encoding="utf-8") as f:
        evidences = json.load(f)

    # Cumulator
    fwd_evidence_ngrams = defaultdict(set)
    rev_evidence_ngrams = defaultdict(set)

    evidences_iter = tqdm(evidences.items(), desc="evidences", disable=not verbose)
    for evidence_id, evidence_text in evidences_iter:

        # Parse once per evidence and reuse the doc for every n
        doc = nlp(normalize_pipeline(evidence_text))

        for n in n_list:
            fwd_ngrams, rev_ngrams = get_bidirectional_n_grams(doc, n=n)

            for ngram in fwd_ngrams:
                fwd_evidence_ngrams[ngram].add(evidence_id)

            for ngram in rev_ngrams:
                rev_evidence_ngrams[ngram].add(evidence_id)

    # Each index is saved whenever its own path is given, so passing a single
    # path no longer silently saves nothing.
    outputs = (
        (save_path_fwd, fwd_evidence_ngrams),
        (save_path_rev, rev_evidence_ngrams),
    )
    for save_path, ngram_index in outputs:
        if not save_path:
            continue
        with open(save_path, mode="w", encoding="utf-8") as f:
            json.dump(obj=ngram_index, fp=f, cls=SetEncoder)
        print(f"saved to: {save_path}")

    return fwd_evidence_ngrams, rev_evidence_ngrams

In [None]:
fwd_ngram_evidences_path = SAVE_PATH.with_name("train_fwd_ngram_evidences.json")
rev_ngram_evidences_path = SAVE_PATH.with_name("train_rev_ngram_evidences.json")

# get_evidence_shortlist opens BOTH index files, so rebuild unless both are
# present; checking the forward file alone would let a missing reverse file
# go unnoticed until that later cell fails.
# Existing files are only reported here, not loaded into the session.
if fwd_ngram_evidences_path.exists() and rev_ngram_evidences_path.exists():
    print(f"existing found: {fwd_ngram_evidences_path}")
    print(f"existing found: {rev_ngram_evidences_path}")

else:

    fwd_ngram_evidences, rev_ngram_evidences = get_ngram_evidences(
        evidence_path=DATA_PATH.with_name("evidence.json"),
        n_list=[3, 4, 5, 6, 7, 8],
        save_path_fwd=fwd_ngram_evidences_path,
        save_path_rev=rev_ngram_evidences_path
    )

### Get evidence shortlist by claim

In [None]:
def _read_json(path):
    """Loads and returns the JSON document stored at `path`."""
    # Explicit encoding so the result does not depend on the platform default
    with open(path, mode="r", encoding="utf-8") as f:
        return json.load(f)


def _add_idf_scores(scores, evidence_ids, n_docs):
    """Adds one keyword's IDF score to every evidence that keyword occurs in."""
    if not evidence_ids:
        return
    # The score depends only on the keyword's document frequency, so it is
    # computed once rather than once per evidence.
    score = np.log10(n_docs / len(evidence_ids))
    for e_id in evidence_ids:
        scores[e_id] += score


def _select_within_budget(scores, max_retrieved):
    """Returns the ids scoring at least the lowest cutoff that fits the budget."""
    # The cutoff starts at 1 and rises in steps of 0.5 while too many
    # evidences clear it.
    # NOTE(review): if one step drops the selection from more than
    # max_retrieved straight to zero, the result is empty (unchanged
    # behaviour); consider falling back to the top-scoring ids.
    cutoff = 1
    selected = [e_id for e_id, score in scores.items() if score >= cutoff]
    while selected and len(selected) > max_retrieved:
        cutoff += 0.5
        selected = [e_id for e_id, score in scores.items() if score >= cutoff]
    return selected


def get_evidence_shortlist(
    claims_paths:List[Path],
    tagged_evidence_path:Path,
    fwd_ngram_evidences_path:Path,
    rev_ngram_evidences_path:Path,
    ngram_list:list = None,
    save_path:Path = None,
    n_total_evidences:int=1208827,
    max_retrieved:int=1000,
    verbose:bool = True
):
    """
    Creates evidence shortlists by matching keywords from a pre-computed
    keyword-evidence index (tagged_evidence_path) and from the forward and
    reverse character n-gram indexes. Every match adds the IDF score of the
    matched keyword to the evidence; the threshold on the summed score is
    then raised step by step until the number of evidences lies within the
    max_retrieved parameter (reported as the hyperparameter "m").

    Args:
        claims_paths: JSON files of claim id -> claim object; later files
            override earlier ones on duplicate ids. Each claim object must
            have "claim_text" and may have "evidences" (ground truth ids).
        tagged_evidence_path: JSON file of tag -> evidence ids.
        fwd_ngram_evidences_path: JSON file of leading n-gram -> evidence ids.
        rev_ngram_evidences_path: JSON file of trailing n-gram -> evidence ids.
        ngram_list: n-gram lengths to match; defaults to [3, 4, 5, 6, 7, 8].
        save_path: when given, the shortlists are also written there as JSON.
        n_total_evidences: number of documents used as the IDF numerator.
        max_retrieved: maximum number of evidences to keep per claim.
        verbose: show a progress bar when true.

    Returns:
        A tuple of
        - claim id -> set of shortlisted evidence ids,
        - claim id -> set of ground-truth evidence ids that were missed
          (labelled claims only),
        - the number of evidences retrieved for each claim,
        - the recall for each claim (1 when the claim has no ground truth),
        - the set of all tags extracted from the claims.
    """
    # A None default avoids sharing one mutable list between calls.
    if ngram_list is None:
        ngram_list = [3, 4, 5, 6, 7, 8]

    # Load the claims files
    claims = dict()
    for claims_path in claims_paths:
        claims.update(_read_json(claims_path))

    # Load the tagged evidences file
    tagged_evidences = _read_json(tagged_evidence_path)

    # Load the evidence ngrams files
    fwd_ngram_evidences = _read_json(fwd_ngram_evidences_path)
    print(f"loaded: {fwd_ngram_evidences_path}")

    rev_ngram_evidences = _read_json(rev_ngram_evidences_path)
    print(f"loaded: {rev_ngram_evidences_path}")

    # Longest n-grams first
    ngram_sizes = sorted(ngram_list, reverse=True)

    # Cumulator
    claim_evidences = defaultdict(set)
    missed_retrievals = defaultdict(set)
    retrieval_counts = []
    retrieval_recalls = []
    all_unique_tags = set()

    claim_obj = tqdm(claims.items(), desc="claims", disable=not verbose)
    for claim_id, claim in claim_obj:

        # Get claim direct tags
        claim_tags, claim_doc = info_tag_pipeline(
            text=claim["claim_text"], return_doc=True
        )
        tags = {tag.tag for tag in claim_tags}
        all_unique_tags.update(tags)

        # Sum of IDF scores of every keyword matched by each evidence
        retrieved_evidence_scores = Counter()

        # Match tags -----------------------------------------------------

        for tag in tags:
            _add_idf_scores(
                retrieved_evidence_scores,
                set(tagged_evidences.get(tag, [])),
                n_total_evidences,
            )

        # Match ngrams ---------------------------------------------------

        # Every n-gram length that fits the lemma contributes its own score.
        # NOTE(review): the original comment spoke of matching each token
        # once, longest first, but that was never enforced; scoring across
        # all fitting lengths is kept.
        for token in claim_doc:
            token_lemma = token.lemma_

            for n in ngram_sizes:
                # The indexes only hold n-grams of lemmas with at least n
                # characters; without this guard a short lemma would be
                # looked up (and scored) once per larger n.
                if len(token_lemma) < n:
                    continue

                # Match forward and add IDF score
                _add_idf_scores(
                    retrieved_evidence_scores,
                    fwd_ngram_evidences.get(token_lemma[:n], []),
                    n_total_evidences,
                )

                # Match reverse and add IDF score. The slice must be [-n:];
                # [-n:0] is always the empty string and matched nothing.
                _add_idf_scores(
                    retrieved_evidence_scores,
                    rev_ngram_evidences.get(token_lemma[-n:], []),
                    n_total_evidences,
                )

        # Wrapping up ----------------------------------------------------

        # Dynamically find the cutoff to return at most the specified number
        # of evidences. A claim with a single candidate is kept too (the
        # former "> 1" test dropped it). The update also makes sure every
        # claim has an entry, even an empty one.
        claim_evidences[claim_id].update(
            _select_within_budget(retrieved_evidence_scores, max_retrieved)
        )

        # Count how many evidences have been retrieved for this claim
        n_retrieved = len(claim_evidences[claim_id])

        # Calculate some statistics
        recall = 1
        if "evidences" in claim:
            truth_evidences = set(claim["evidences"])
            missed = truth_evidences.difference(claim_evidences[claim_id])

            # An empty ground truth keeps recall at 1 instead of dividing by zero
            if truth_evidences:
                recall = (len(truth_evidences) - len(missed)) / len(truth_evidences)

            if recall < 0.5:
                print(claim_id, recall, tags)

            missed_retrievals[claim_id].update(missed)

        retrieval_counts.append(n_retrieved)
        retrieval_recalls.append(recall)
        claim_obj.postfix = f"n_retrieved: {n_retrieved}, recall: {recall}"

    if save_path:
        with open(save_path, mode="w", encoding="utf-8") as f:
            json.dump(obj=claim_evidences, fp=f, cls=SetEncoder)
        print(f"saved to: {save_path}")

    return claim_evidences, missed_retrievals, retrieval_counts, retrieval_recalls, all_unique_tags

In [None]:
max_retrieval = 1000
retrieved_evidences_path = SAVE_PATH.with_name(f"test_shortlist_evidences_max_{max_retrieval}.json")

# Compute (and save) the shortlist for the unlabelled test claims unless a
# saved shortlist for this max_retrieval already exists; in that case only
# the shortlist itself is loaded.
if not retrieved_evidences_path.exists():
    (
        retrieved_evidences,
        missed_retrievals,
        retrievals_counts,
        retrieval_recalls,
        all_unique_tags,
    ) = get_evidence_shortlist(
        claims_paths=[DATA_PATH.with_name("test-claims-unlabelled.json")],
        tagged_evidence_path=tagged_evidences_path,
        fwd_ngram_evidences_path=fwd_ngram_evidences_path,
        rev_ngram_evidences_path=rev_ngram_evidences_path,
        ngram_list=[4, 5, 6, 7, 8],
        save_path=retrieved_evidences_path,
        max_retrieved=max_retrieval,
    )
else:
    with open(retrieved_evidences_path, mode="r") as f:
        retrieved_evidences = json.load(f)
        print(f"loaded: {retrieved_evidences_path}")

In [None]:
# Summary statistics of the shortlisting run.
# NOTE(review): retrieval_recalls and retrievals_counts are assigned by the
# previous cell only when the shortlist is recomputed; when the saved
# shortlist was loaded instead, that cell does not define them.
# Recall is recorded as 1 for every claim without an "evidences" entry, so
# the recall figures are only informative for labelled claims.
print(f"avg recall: {np.mean(retrieval_recalls)}")
print(f"min recall: {np.min(retrieval_recalls)}")
print(f"avg retrieved: {np.mean(retrievals_counts)}")
print(f"max retrieved: {np.max(retrievals_counts)}")