# Evaluating Elasticsearch BM25 top-k accuracy on original documents & baseline-redacted documents

First, we need to connect to Elasticsearch and add all the profiles (as strings) to indices:

In [1]:
import getpass
import os
from urllib.parse import quote

import urllib3
from elasticsearch import Elasticsearch

# Credentials are read from the environment (falling back to an interactive
# prompt for the password) so that no secret lives in the notebook source.
# NOTE(review): the password that used to be hardcoded here has been exposed
# and should be rotated on the server.
username = os.environ.get("ES_USERNAME", "elastic")
password = os.environ.get("ES_PASSWORD") or getpass.getpass(
    f"Elasticsearch password for {username!r}: "
)
host = os.environ.get("ES_HOST", "rush-compute-01.tech.cornell.edu:9200")

# Percent-encode the userinfo so characters such as '@', ':' or '/' in the
# credentials cannot corrupt the URL.
url = f"https://{quote(username, safe='')}:{quote(password, safe='')}@{host}"

es = Elasticsearch(
    url,
    # Certificate verification is switched off, so the TLS connection is not
    # authenticated.
    verify_certs=False
)

# With verify_certs=False, urllib3 emits an InsecureRequestWarning; silence it.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)



In [2]:
# NOTE: the client repr shown below includes `http_auth` (user:password) in
# plain text; clear this cell's output before sharing the notebook.
es

<Elasticsearch([{'host': 'rush-compute-01.tech.cornell.edu', 'port': 9200, 'use_ssl': True, 'http_auth': 'elastic:<redacted>'}])>

In [3]:
# Delete an existing index
# es.indices.delete(index='val_5_profile_str', ignore=[400, 404])
# for idx in [idx for idx in es.indices.get_alias().keys() if not idx.startswith('.')]:
#     print('deleting', idx)
#     es.indices.delete(index=idx, ignore=[400, 404])

In [4]:
import datasets

from elasticsearch import helpers
from elasticsearch_dsl import Index


def _profile_table_to_str(prof) -> str:
    """Flatten one wiki_bio example's table into newline-terminated 'key : value' lines."""
    table = prof['input_text']['table']
    prof_dict = dict(zip(table['column_header'], table['content']))
    # Trim whitespace and stray '.', '|', '<', '>' characters from both ends.
    prof_dict = {k.strip().strip('.|<>'): v.strip().strip('.|<>') for k, v in prof_dict.items()}
    if 'no.of.children' in prof_dict:
        # fix for one weird error
        prof_dict['no of children'] = prof_dict.pop('no.of.children')
    # Entries whose key or value ended up empty after trimming are dropped.
    return ''.join(f'{k} : {v}\n' for k, v in prof_dict.items() if (len(k) and len(v)))


def create_index_from_profiles(index_name: str, dataset_split: str, b: float = 0.9, k1: float = 4.5):
    """Create an index with BM25 similarity and bulk-insert wiki_bio profiles into it.

    Each profile is flattened to a single string and stored under
    ``body.profile`` with its position in the split as both ``_id`` and
    ``body.id``.

    Args:
        index_name: Name of the index to create; ``index.create()`` is called
            unconditionally.
        dataset_split: Split expression passed to ``datasets.load_dataset``
            (e.g. ``'val[:100%]'``).
        b: Value for the ``b`` setting of the default BM25 similarity.
        k1: Value for the ``k1`` setting of the default BM25 similarity.

    Returns:
        Whatever ``helpers.bulk`` returns for the insert.
    """
    index = Index(index_name, es)
    index.settings(
        number_of_shards=1, # need one shard since scores are calculated with a single shard!
        number_of_replicas=2,
        index={
            'mapping': {
                'ignore_malformed': True,
                'total_fields.limit': 20_000
            },
            "similarity" : {
              "default" : {
                "type" : "BM25",
                "b": b,
                "k1": k1
              }
            }
        }
    )
    index.create()

    dataset = datasets.load_dataset('wiki_bio', split=dataset_split, version='1.2.0')

    print('inserting', len(dataset), 'profiles')

    # Stream actions lazily so that the profile strings and action dicts for
    # the whole split are never held in memory at the same time.
    actions = (
        {'_id': idx, 'body': {'profile': _profile_table_to_str(prof), 'id': idx}}
        for idx, prof in enumerate(dataset)
    )
    return helpers.bulk(es, actions, index=index_name)

In [5]:
# create_index_from_profiles('val_100_profile_str', 'val[:100%]')
# create_index_from_profiles('test_100_profile_str', 'test[:100%]')
# create_index_from_profiles('train_100_profile_str', 'train[:100%]')

Now that the indices are created, we can iterate over documents and compute the top-K accuracy.

In [6]:
import os
import sys

# Directory added to the import path so that `dataloader` can be imported
# (presumably a checkout of the unsupervised-deidentification project).
# Override with the DEID_REPO_DIR environment variable on other machines.
repo_dir = os.environ.get(
    'DEID_REPO_DIR',
    '/home/jxm3/research/deidentification/unsupervised-deidentification',
)
if repo_dir not in sys.path:
    # Guard so that re-running this cell does not append duplicates.
    sys.path.append(repo_dir)

from dataloader import WikipediaDataModule

# os.sched_getaffinity is not available on every platform; fall back to the
# total CPU count where it is missing.
if hasattr(os, 'sched_getaffinity'):
    num_cpus = len(os.sched_getaffinity(0))
else:
    num_cpus = os.cpu_count() or 1

dm = WikipediaDataModule(
    document_model_name_or_path = 'roberta-base',
    profile_model_name_or_path = 'google/tapas-base',
    dataset_name='wiki_bio',
    dataset_train_split='train[:100%]',
    dataset_val_split='val[:100%]',
    dataset_test_split='test[:100%]',
    dataset_version='1.2.0',
    num_workers=num_cpus,
    train_batch_size=256,
    eval_batch_size=256,
    max_seq_length=128,
    sample_spans=False,
)
dm.setup("fit")

Initializing WikipediaDataModule with num_workers = 8 and mask token `<mask>`
loading wiki_bio[1.2.0] split train[:100%]


Using custom data configuration default
Reusing dataset wiki_bio (/home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da)


loading wiki_bio[1.2.0] split val[:100%]


Using custom data configuration default
Reusing dataset wiki_bio (/home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da)


loading wiki_bio[1.2.0] split test[:100%]


Using custom data configuration default
Reusing dataset wiki_bio (/home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da)
Loading cached processed dataset at /home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da/cache-wiki_bio_train____100_____1.2.0__roberta-base_wiki.arrow
Loading cached processed dataset at /home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da/cache-wiki_bio_val____100_____1.2.0__roberta-base_wiki.arrow
Loading cached processed dataset at /home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da/cache-wiki_bio_test____100_____1.2.0__roberta-base_wiki.arrow
Loading cached processed dataset at /home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80

  0%|          | 0/73 [00:00<?, ?ba/s]



  0%|          | 0/72831 [00:00<?, ?ex/s]

  0%|          | 0/72831 [00:00<?, ?ex/s]

  0%|          | 0/72831 [00:00<?, ?ex/s]

  0%|          | 0/72831 [00:00<?, ?ex/s]

 

Loading cached processed dataset at /home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da/cache-wiki_bio_train____100_____1.2.0__roberta-base_128_google_tapas-base_tokenized_00000_of_00008.arrow


 

Loading cached processed dataset at /home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da/cache-wiki_bio_train____100_____1.2.0__roberta-base_128_google_tapas-base_tokenized_00001_of_00008.arrow


 

Loading cached processed dataset at /home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da/cache-wiki_bio_train____100_____1.2.0__roberta-base_128_google_tapas-base_tokenized_00002_of_00008.arrow


 

Loading cached processed dataset at /home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da/cache-wiki_bio_train____100_____1.2.0__roberta-base_128_google_tapas-base_tokenized_00003_of_00008.arrow


 

Loading cached processed dataset at /home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da/cache-wiki_bio_train____100_____1.2.0__roberta-base_128_google_tapas-base_tokenized_00004_of_00008.arrow


 

Loading cached processed dataset at /home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da/cache-wiki_bio_train____100_____1.2.0__roberta-base_128_google_tapas-base_tokenized_00005_of_00008.arrow


 

Loading cached processed dataset at /home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da/cache-wiki_bio_train____100_____1.2.0__roberta-base_128_google_tapas-base_tokenized_00006_of_00008.arrow


 

Loading cached processed dataset at /home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da/cache-wiki_bio_train____100_____1.2.0__roberta-base_128_google_tapas-base_tokenized_00007_of_00008.arrow


          

#1:   0%|          | 0/9104 [00:00<?, ?ex/s]

#0:   0%|          | 0/9104 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/9104 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/9104 [00:00<?, ?ex/s]

 

#4:   0%|          | 0/9104 [00:00<?, ?ex/s]

 

#5:   0%|          | 0/9104 [00:00<?, ?ex/s]

 

#6:   0%|          | 0/9104 [00:00<?, ?ex/s]

 

#7:   0%|          | 0/9103 [00:00<?, ?ex/s]

         

#0:   0%|          | 0/9104 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/9104 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/9104 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/9104 [00:00<?, ?ex/s]

 

#4:   0%|          | 0/9104 [00:00<?, ?ex/s]

 

#5:   0%|          | 0/9104 [00:00<?, ?ex/s]

 

#6:   0%|          | 0/9104 [00:00<?, ?ex/s]

 

#7:   0%|          | 0/9103 [00:00<?, ?ex/s]

In [14]:
import re

def preprocess_doc(doc: str, max_words: int = 500) -> str:
    """Turn a raw document into a plain query string.

    Args:
        doc: Document text.
        max_words: Maximum number of single-space-separated tokens to keep.

    Returns:
        The truncated text with every character that is neither a word
        character nor whitespace replaced by a single space.
    """
    # limit to max_words words
    doc = ' '.join(doc.split(' ')[:max_words])
    # fix braces and remove weird characters
    doc = doc.replace('-lrb-', '(').replace('-rrb-', ')')
    # Inside a character class '|' is a literal, not alternation, so the
    # former pattern [^\w|\s] let pipe characters through to the query.
    return re.sub(r'[^\w\s]', ' ', doc)

def search_results_for_query_by_index(query: str, index: str, max_hits: int = 10):
    """Search `index` with `query` and return ``(total_hit_count, hits)``.

    `query` is passed to ``es.search`` as the ``q`` parameter and `max_hits`
    as ``size``. The first element of the result is
    ``hits.total.value`` from the response; the second is the ``hits.hits``
    list.
    """
    response = es.search(index=index, q=query, size=max_hits)
    hits_section = response["hits"]
    return hits_section["total"]["value"], hits_section["hits"]

def index_of_doc_id_in_results_list(doc: str, doc_id: int, max_hits: int = 100):
    """Search all three profile indices for `doc` and locate its own profile.

    Args:
        doc: Document text; it is run through ``preprocess_doc`` before
            being used as the query.
        doc_id: Expected ``_id`` of the matching entry in
            ``test_100_profile_str``.
        max_hits: Number of results requested from the search.

    Returns:
        The zero-based position of the matching test-set hit within the
        combined result list (hits from the val and train indices count
        towards the position), or ``float('inf')`` if it is not among the
        returned hits.
    """
    _, results = search_results_for_query_by_index(
        query=preprocess_doc(doc),
        index="val_100_profile_str,test_100_profile_str,train_100_profile_str",
        max_hits=max_hits
    )
    return next(
        (
            result_idx
            for result_idx, result in enumerate(results)
            if (result['_index'] == 'test_100_profile_str') and (int(result['_id']) == doc_id)
        ),
        float('inf'),
    )

# Quick check: print the result position of the first four test documents.
for sample_idx in range(4):
    sample_doc = dm.test_dataset[sample_idx]['document']
    print(index_of_doc_id_in_results_list(sample_doc, sample_idx))

81
inf
0
inf


In [17]:
import collections
import tqdm

# Cutoffs at which top-k accuracy is reported.
k_values = [1, 10, 100]
total_correct_by_k_doc = collections.defaultdict(int)
# Number of test documents evaluated (indices 0 .. total-1).
total = 1000
for doc_idx in tqdm.trange(total):
    rank = index_of_doc_id_in_results_list(dm.test_dataset[doc_idx]['document'], doc_idx)
    # rank is inf when the profile was not retrieved, so it never counts.
    for k in k_values:
        if rank < k:
            total_correct_by_k_doc[k] += 1


for k in k_values:
    acc = total_correct_by_k_doc[k] / total
    print(f'Top-{k} accuracy = {acc*100.0:.2f}')

100%|██████████| 1000/1000 [02:56<00:00,  5.67it/s]

Top-1 accuracy = 69.00
Top-10 accuracy = 83.40
Top-100 accuracy = 92.60





In [26]:
# Same evaluation as for the raw documents, but querying with the
# `document_redact_lexical` field of each test example.
total_correct_by_k_lex = collections.defaultdict(int)
total = 1000
for doc_idx in tqdm.trange(total):
    redacted_doc = dm.test_dataset[doc_idx]['document_redact_lexical']
    rank = index_of_doc_id_in_results_list(redacted_doc, doc_idx)
    for k in k_values:
        if rank < k:
            total_correct_by_k_lex[k] += 1


for k in k_values:
    acc = total_correct_by_k_lex[k] / total
    print(f'Top-{k} accuracy = {acc*100.0:.2f}')

100%|██████████| 1000/1000 [01:58<00:00,  8.44it/s]

Top-1 accuracy = 0.00
Top-10 accuracy = 0.10
Top-100 accuracy = 0.10





In [27]:
# Same evaluation as for the raw documents, but querying with the
# `document_redact_ner_bert` field of each test example.
total_correct_by_k_ner = collections.defaultdict(int)
total = 1000
for doc_idx in tqdm.trange(total):
    redacted_doc = dm.test_dataset[doc_idx]['document_redact_ner_bert']
    rank = index_of_doc_id_in_results_list(redacted_doc, doc_idx)
    for k in k_values:
        if rank < k:
            total_correct_by_k_ner[k] += 1


for k in k_values:
    acc = total_correct_by_k_ner[k] / total
    print(f'Top-{k} accuracy = {acc*100.0:.2f}')

100%|██████████| 1000/1000 [02:29<00:00,  6.67it/s]

Top-1 accuracy = 0.10
Top-10 accuracy = 0.60
Top-100 accuracy = 11.90



