# Model 02: Fast evidence shortlisting by noun occurrence

In [1]:
# Change the working directory to project root
import pathlib
import os

# The project root is taken to be the nearest directory, starting from the
# current one and moving upwards, that contains a "src" entry.
# The search is bounded: the parent of the filesystem root is the root itself,
# so an unbounded "go to parent" loop would never terminate when the notebook
# is started outside the project tree.
_start_dir = pathlib.Path.cwd()
for ROOT_DIR in (_start_dir, *_start_dir.parents):
    if ROOT_DIR.joinpath("src").exists():
        break
else:
    raise FileNotFoundError(
        f"Could not find a 'src' directory in {_start_dir} or any of its parents"
    )
os.chdir(ROOT_DIR)

In [2]:
# Dependencies
import json
from sklearn.model_selection import ParameterGrid
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from src.data import load_as_dataframe, slice_by_claim
from src.ner import \
    train_noun_relations, \
    get_evidence_by_noun, \
    retrieve_claim_evidence_by_noun, \
    view_claim_noun_phrases

In [3]:
import spacy
from spacy import displacy
# Shared spaCy pipeline; it is passed as `nlp` to the src.ner helpers and
# called directly on text in the cells below.
nlp = spacy.load("en_core_web_sm")
# Display the names of the components in the loaded pipeline.
nlp.pipe_names

  from .autonotebook import tqdm as notebook_tqdm


['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

## Load datasets

In [4]:
# Datasets to load; the results are unpacked in the same order as listed here.
data_names = ["train-claims", "dev-claims", "evidence"]
(train_claims, dev_claims, all_evidences) = load_as_dataframe(
    data_names,
    full_evidence=True,
)

Loaded train-claims
Loaded dev-claims
Loaded evidence


## Visualise dependencies

In [5]:
# Take a small slice of the training frame and move its index into columns.
# NOTE(review): the arguments 0 and 1 look like a claim position range
# (start, stop) - confirm against slice_by_claim in src.data.
pairs = slice_by_claim(train_claims, 0, 1).reset_index()
# Bare expression so the notebook renders the frame.
pairs

Unnamed: 0,claim,claim_text,claim_label,evidences,evidence_text
0,claim-1937,Not only is there no scientific evidence that ...,DISPUTED,evidence-442946,At very high concentrations (100 times atmosph...
1,claim-1937,Not only is there no scientific evidence that ...,DISPUTED,evidence-1194317,Plants can grow as much as 50 percent faster i...
2,claim-1937,Not only is there no scientific evidence that ...,DISPUTED,evidence-12171,Higher carbon dioxide concentrations will favo...


In [6]:
# Rebind `pairs` to the rows of a single claim ("claim-102"), with the index
# moved into columns so rows can be picked by position with .iloc.
pairs = train_claims.loc["claim-102"].reset_index()

In [7]:
# Parse the claim text and the evidence text of one row of `pairs`
# (position 3) and render the dependency tree of each.
sample_pair = pairs.iloc[3]
claim_doc = nlp(sample_pair["claim_text"])
evidence_doc = nlp(sample_pair["evidence_text"])
for parsed_doc in (claim_doc, evidence_doc):
    displacy.render(parsed_doc, style="dep")

## Get all nouns (concept)

In [8]:
# Concept demo: collect the lemmas of all non-stop-word nouns and proper
# nouns from the claim text and the evidence text of the first row of one
# claim. The set is shared across both texts, so the second printout also
# contains the nouns found in the first.
all_nouns = set()
first_row_only = train_claims.loc["claim-1013"].reset_index().head(1)
for _, row in first_row_only.iterrows():
    for text_type in ("claim_text", "evidence_text"):
        doc = nlp(text=row[text_type])
        all_nouns.update(
            token.lemma_
            for token in doc
            if token.pos_ in ("NOUN", "PROPN") and not token.is_stop
        )
        print(doc)
        print(all_nouns)

Global sea level rise surged between November 2014 and February 2016, with the El Niño event helping the oceans rise by 15mm.
{'November', 'level', 'sea', 'Niño', 'ocean', 'event', 'rise', 'El', 'February'}
The 2014–16 El Niño was a warming of the eastern equatorial Pacific Ocean that resulted in unusually warm waters developing between the coast of South America and the International Date Line.
{'Ocean', 'coast', 'warming', '2014–16', 'sea', 'rise', 'El', 'Date', 'water', 'February', 'November', 'Niño', 'International', 'level', 'ocean', 'equatorial', 'event', 'Pacific', 'South', 'Line', 'America'}


## Get all nouns (implementation)

In [9]:
# Run the project's noun extraction helper on the rows of one claim
# ("claim-997") using the shared spaCy pipeline; see src.ner for what it prints.
view_claim_noun_phrases(train_claims.loc["claim-997"], nlp)

View nouns in pipeline


                                                   

ever since 2012 scientists have been debating a complex and frankly explosive idea about how a warming planet will alter our weather one that if it s correct would have profound implications across the northern hemisphere and especially in its middle latitudes
{'planet', 'hemisphere', 'middle', 'weather', 'implication', 'warming', 'scientist', 'idea', 'latitude'}
whether this be due directly to the health disposition or constitution of our globe itself or to the weather from without as the new glacial cosmogony would teach us must remain a question for experts to debate if not settle
{'expert', 'cosmogony', 'weather', 'health', 'question', 'globe', 'disposition', 'constitution'}


ever since 2012 scientists have been debating a complex and frankly explosive idea about how a warming planet will alter our weather one that if it s correct would have profound implications across the northern hemisphere and especially in its middle latitudes
{'planet', 'hemisphere', 'middle', 'weather', 'im



## Training

In [10]:
# Template path inside result/ner/. The final "*" component is only a
# placeholder: cells below call OUTPUT_PATH.with_name(<file name>) to replace
# it with the actual output file name.
OUTPUT_PATH = ROOT_DIR.joinpath("./result/ner/*")

In [11]:
# One job per data split. The splits differ only in the claims file that is
# read ("c_path") and the file the retrieved evidences are written to
# ("cr_path"); the training frame, the evidence file, the noun relations file
# and the evidence-by-noun file are the same for every job.
_split_files = [
    # (claims file, retrieved evidences file)
    # Train set
    ("./data/train-claims.json", "train_claim_evidence_retrieved.json"),
    # Dev set
    ("./data/dev-claims.json", "dev_claim_evidence_retrieved.json"),
    # Test set
    ("./data/test-claims-unlabelled.json", "test_claim_evidence_retrieved.json"),
]

noun_extraction_jobs = [
    {
        "df": train_claims,
        "e_path": pathlib.Path("./data/evidence.json"),
        "c_path": pathlib.Path(claims_file),
        "nr_path": OUTPUT_PATH.with_name("train_noun_relations.json"),
        "er_path": OUTPUT_PATH.with_name("evidence_by_noun.json"),
        "cr_path": OUTPUT_PATH.with_name(retrieved_file),
    }
    for claims_file, retrieved_file in _split_files
]

In [12]:
# Build the two cached artefacts each job needs, skipping any that already
# exist on disk so the notebook can be re-run cheaply.
for job in noun_extraction_jobs:
    relations_file = job["nr_path"]
    evidence_map_file = job["er_path"]

    # Training for noun relations
    if not relations_file.exists():
        train_noun_relations(
            dataset_df=job["df"], nlp=nlp, save_path=relations_file
        )

    # List of evidences for each noun
    if not evidence_map_file.exists():
        get_evidence_by_noun(
            evidence_path=job["e_path"], nlp=nlp, save_path=evidence_map_file
        )

View some of the relations

In [13]:
# Nouns whose stored relations are inspected below (also reused by the
# evidence-mapping cell).
QUERIES = ["climate", "nino", "ecosystem", "warming", "age", "pacific", "oscillation", "february", "mars", "year", "member"]
# A related noun is listed under "include" when its relation count is at least
# this fraction of the query noun's largest relation count.
THRESHOLD = 0.12

# Several jobs point at the same relations file, so each distinct file is
# visited once (in first-seen order) instead of printing identical output
# once per job.
for nr_path in dict.fromkeys(job["nr_path"] for job in noun_extraction_jobs):
    with open(nr_path, mode="r") as f:
        noun_relations = json.load(f)

    # Maximum relation count for all nouns (nouns without relations are skipped)
    max_all_rel = max(
        max(rel.values()) for rel in noun_relations.values() if rel
    )

    # Look at some query results
    for query in QUERIES:

        # A query noun may be missing from the relations file; report it
        # instead of failing with a KeyError.
        query_relations = noun_relations.get(query)
        if not query_relations:
            print(f"{query}: no relations found")
            print("\n")
            continue

        # Maximum relation count for query noun
        max_rel = max(query_relations.values())

        # (related noun, count, count relative to the query's maximum),
        # highest count first
        sorted_relations = [
            (k, v, round(v / max_rel, 2))
            for k, v in sorted(
                query_relations.items(),
                key=lambda item: item[1], reverse=True
            )
        ]
        # ".2f" = two decimal places (the previous "2f" only set a minimum width)
        print(f"{query} (score={max_rel / max_all_rel:.2f}):")
        print("include: ", [t for t in sorted_relations if t[2] >= THRESHOLD])
        print("exclude: ", [t for t in sorted_relations if t[2] < THRESHOLD])
        print("\n")

climate (score=0.570388):
include:  [('change', 470, 1.0), ('warming', 407, 0.87), ('temperature', 270, 0.57), ('carbon', 171, 0.36), ('gas', 158, 0.34), ('year', 155, 0.33), ('greenhouse', 151, 0.32), ('scientist', 145, 0.31), ('dioxide', 138, 0.29), ('emission', 135, 0.29), ('model', 127, 0.27), ('level', 121, 0.26), ('earth', 116, 0.25), ('sea', 104, 0.22), ('evidence', 95, 0.2), ('effect', 91, 0.19), ('surface', 83, 0.18), ('report', 77, 0.16), ('activity', 77, 0.16), ('ice', 76, 0.16), ('century', 76, 0.16), ('decade', 76, 0.16), ('rise', 73, 0.16), ('atmosphere', 71, 0.15), ('study', 70, 0.15), ('weather', 69, 0.15), ('system', 69, 0.15), ('increase', 67, 0.14), ('ipcc', 67, 0.14), ('ocean', 67, 0.14), ('degree', 66, 0.14), ('event', 65, 0.14), ('world', 64, 0.14), ('impact', 58, 0.12), ('period', 58, 0.12), ('consensus', 57, 0.12), ('heat', 56, 0.12)]


nino (score=0.081311):
include:  [('el', 67, 1.0), ('temperature', 36, 0.54), ('year', 28, 0.42), ('ocean', 24, 0.36), ('record

View some of the evidence mappings

In [14]:
# Nouns mapped to more evidences than this are listed under "exclude".
MAX_EVIDENCES = 1000

# Only the first job is inspected: every job points at the same
# evidence-by-noun file.
for job in noun_extraction_jobs[:1]:
    with open(job["er_path"], mode="r") as f:
        noun_evidences = json.load(f)

    # Maximum evidence count for all nouns (0 when the mapping is empty)
    max_n_evidences = max(
        (len(e_list) for e_list in noun_evidences.values()), default=0
    )
    print("max_n_evidences: ", max_n_evidences)

    # Sort evidence quantity by noun in descending order
    noun_evidence_counts = [(k, len(v)) for k, v in noun_evidences.items()]
    ordered_noun_evidences = sorted(
        noun_evidence_counts,
        key=lambda item: item[1], reverse=True
    )

    for query in QUERIES:
        # A noun absent from the mapping counts as 0 evidences; without the
        # default, .get() returns None and len(None) raises TypeError.
        print(f"{query}: ", len(noun_evidences.get(query, [])))

    # Only the first 200 entries of each list are shown to keep the output short
    print("include: ", [t for t in ordered_noun_evidences if t[1] <= MAX_EVIDENCES][:200])
    print("exclude: ", [t for t in ordered_noun_evidences if t[1] > MAX_EVIDENCES][:200])
    print("\n")

max_n_evidences:  38618
climate:  5888
nino:  215
ecosystem:  1124
warming:  1891
age:  6504
pacific:  2431
oscillation:  169
february:  6405
mars:  166
year:  38618
member:  24759
include:  [('pope', 999), ('subsidiary', 999), ('thailand', 998), ('responsibility', 998), ('piano', 997), ('salt', 997), ('average', 997), ('ceo', 997), ('1st', 996), ('note', 996), ('good', 995), ('com', 995), ('duo', 989), ('travel', 989), ('tokyo', 989), ('reduction', 989), ('houston', 987), ('cinema', 987), ('maine', 986), ('choice', 986), ('hampshire', 985), ('da', 984), ('topic', 984), ('egypt', 983), ('drive', 982), ('budget', 980), ('glass', 980), ('peer', 978), ('intelligence', 978), ('hero', 977), ('lanka', 977), ('americans', 975), ('angel', 975), ('bureau', 973), ('enterprise', 972), ('squad', 971), ('extension', 971), ('physics', 970), ('nova', 970), ('legend', 970), ('grey', 968), ('share', 968), ('jesus', 967), ('lion', 967), ('operator', 966), ('lincoln', 965), ('successor', 965), ('efficien

## Use grid search to find the best thresholds

The objective is to maximise coverage while minimising the number of evidences shortlisted. Please run `grid_search_thresholds()` in [ner.py](./src/ner.py).

In [15]:
# JSON file with the saved threshold grid search results that the next cell
# loads. NOTE(review): presumably written by grid_search_thresholds() in
# src/ner.py, as the text above suggests - confirm.
GRID_SEARCH_RESULT_PATH = ROOT_DIR.joinpath("./result/ner/retrieval_search_results_06_dev.json")

In [16]:
# Load the saved grid search results and preview them as a table.
retrieval_search_results = json.loads(GRID_SEARCH_RESULT_PATH.read_text())

retrieval_results_df = pd.DataFrame(retrieval_search_results)
retrieval_results_df.head()

Unnamed: 0,noun_score_threshold,noun_query_ratio_threshold,noun_query_count_threshold,evidence_count_per_noun_threshold,coverage,n_shortlist,avg_shortlist
0,0.001,0.1,1.0,100.0,0.22,25023,616.048951
1,0.053053,0.1,1.0,100.0,0.1,3115,103.846154
2,0.105105,0.1,1.0,100.0,0.09,2694,86.738462
3,0.157158,0.1,1.0,100.0,0.09,2647,76.393443
4,0.209211,0.1,1.0,100.0,0.09,2647,73.966102


In [19]:
# Keep the threshold combinations with at least 80% coverage and list the
# ones with the smallest shortlists first.
retrieval_results_df[retrieval_results_df["coverage"] >= 0.80].sort_values(
    by=["n_shortlist", "avg_shortlist"]
)

Unnamed: 0,noun_score_threshold,noun_query_ratio_threshold,noun_query_count_threshold,evidence_count_per_noun_threshold,coverage,n_shortlist,avg_shortlist
1420,0.001000,0.188889,3.2,1200.0,0.81,139918,5602.276316
1400,0.001000,0.100000,3.2,1200.0,0.83,158929,10728.292208
1280,0.001000,0.455556,1.0,1200.0,0.80,160513,3425.936170
2451,0.573579,0.277778,1.0,2300.0,0.91,169485,3029.735099
2452,0.625632,0.277778,1.0,2300.0,0.91,169485,3029.735099
...,...,...,...,...,...,...,...
9620,0.001000,0.188889,1.0,8900.0,1.00,921570,141278.538961
10820,0.001000,0.188889,1.0,10000.0,1.00,941006,148001.740260
8400,0.001000,0.100000,1.0,7800.0,1.00,943839,222213.876623
9600,0.001000,0.100000,1.0,8900.0,1.00,981766,244526.844156


## Retrieve and shortlist evidences for each claim

In [20]:
# Thresholds used for every split.
# NOTE(review): these look like they were picked from the grid search table
# above (rows with evidence_count_per_noun_threshold=2300) - confirm.
retrieval_thresholds = dict(
    noun_score_threshold=0.6,
    noun_query_ratio_threshold=0.27,
    noun_query_count_threshold=1,
    evidence_count_per_noun_threshold=2300,
)

for job in noun_extraction_jobs:

    # Skip splits whose retrieved evidences file already exists
    if job["cr_path"].exists():
        continue

    retrieve_claim_evidence_by_noun(
        claim_path=job["c_path"],
        noun_evidences_path=job["er_path"],
        noun_rel_path=job["nr_path"],
        nlp=nlp,
        save_path=job["cr_path"],
        verbose=True,
        **retrieval_thresholds,
    )

Retrieve evidence by noun occurrence and relation
Loaded claims n=1228
Loaded noun evidences n=443856
Loaded noun relations n=4234


claims: 100%|██████████| 1228/1228 [00:16<00:00, 73.91it/s]


Creating shortlist of evidences
Saving to /Users/johnsonzhou/git/comp90042-project/result/ner/train_claim_evidence_retrieved.json
Average number of evidences per claim: 2913.480375
Number of shortlisted evidences: 444045
Retrieve evidence by noun occurrence and relation
Loaded claims n=154
Loaded noun evidences n=443856
Loaded noun relations n=4234


claims: 100%|██████████| 154/154 [00:02<00:00, 73.65it/s]


Creating shortlist of evidences
Saving to /Users/johnsonzhou/git/comp90042-project/result/ner/dev_claim_evidence_retrieved.json
Average number of evidences per claim: 3029.735099
Number of shortlisted evidences: 169485
Retrieve evidence by noun occurrence and relation
Loaded claims n=153
Loaded noun evidences n=443856
Loaded noun relations n=4234


claims: 100%|██████████| 153/153 [00:02<00:00, 73.48it/s]


Creating shortlist of evidences
Saving to /Users/johnsonzhou/git/comp90042-project/result/ner/test_claim_evidence_retrieved.json
Average number of evidences per claim: 2641.554795
Number of shortlisted evidences: 149406


## Check matched evidences to see if they include the positives

In [24]:
# Load the evidences retrieved for the training claims.
train_retrieved_path = OUTPUT_PATH.with_name("train_claim_evidence_retrieved.json")
with train_retrieved_path.open(mode="r") as f:
    retrieved_evidences = json.load(f)

In [25]:
# Build a dict mapping each claim id to the list of evidence ids that
# train_claims holds for that claim.
train_positives = (
    train_claims
    # One group per claim
    .groupby("claim")
    # The aggregation ignores the cell values and returns the labels of the
    # "evidences" index level for the rows of the group
    .agg(lambda g: list(g.index.get_level_values("evidences")))
    # to_dict() is keyed by column name first; only the entry stored under
    # "evidence_text" is kept
    .to_dict()["evidence_text"]
)

The cell below lists the missing evidences by claim in the training set. Complete per-claim coverage is not essential as long as the overall evidence coverage is good, because the retrieved shortlist will serve for negative sampling.

In [23]:
# Report, per training claim, the expected evidences that are absent from the
# retrieved shortlist. Claims without a shortlist entry count as having
# retrieved nothing.
for claim, evidences in train_positives.items():
    shortlisted = retrieved_evidences.get(claim, [])
    missing = set(evidences) - set(shortlisted)
    if missing:
        print(f"{claim}\tmissing: {missing}")
    

claim-1007	missing: {'evidence-338219'}
claim-101	missing: {'evidence-1120350', 'evidence-94670', 'evidence-742898'}
claim-1014	missing: {'evidence-134894', 'evidence-267919'}
claim-1017	missing: {'evidence-1162374'}
claim-102	missing: {'evidence-269919', 'evidence-377576', 'evidence-960067'}
claim-1023	missing: {'evidence-432173'}
claim-103	missing: {'evidence-946255', 'evidence-400014'}
claim-1038	missing: {'evidence-724362', 'evidence-935810', 'evidence-997014', 'evidence-56420', 'evidence-1052470'}
claim-105	missing: {'evidence-423643'}
claim-1050	missing: {'evidence-443550', 'evidence-185758', 'evidence-305543', 'evidence-206841', 'evidence-757137'}
claim-1052	missing: {'evidence-1104838', 'evidence-1136751', 'evidence-1042227'}
claim-1055	missing: {'evidence-742898'}
claim-1066	missing: {'evidence-876634'}
claim-1067	missing: {'evidence-52681'}
claim-1069	missing: {'evidence-67504'}
claim-1075	missing: {'evidence-552239', 'evidence-172951', 'evidence-482300', 'evidence-591257'}
c