Working out how the query to Elasticsearch should be structured, and how the results need to be processed to consistently extract spans for concepts.

In [None]:
import json
from pathlib import Path

from elasticsearch import Elasticsearch

from src.concept import Concept
from src.document import Document, Span

In [None]:
# Locations of the raw inputs, relative to the notebook's working directory.
data_dir = Path("../data")
raw_text_dir = data_dir / "raw" / "text"

# glob() yields paths in an arbitrary, filesystem-dependent order. Sort them so
# that positional look-ups such as documents[0] pick the same file on every run.
document_files = sorted(raw_text_dir.glob("*.json"))
documents = [Document.load_raw(file, parse=False) for file in document_files]

# Read the JSON explicitly as UTF-8 instead of relying on the platform's
# default text encoding, which differs between machines.
with open(data_dir / "raw" / "concepts.json", encoding="utf-8") as f:
    concepts_data = json.load(f)

# One Concept object per entry in the concepts file.
concepts = [Concept.from_dict(concept) for concept in concepts_data]

In [None]:
# Take the first loaded document and the first concept as the pair to
# experiment with in the cells below.
document, concept = documents[0], concepts[0]

# Client created with the library's default connection settings.
es_client = Elasticsearch()

# NOTE(review): assumes the documents were indexed under this name — confirm.
index_name = "documents"

In [None]:
# First attempt: limit the search to the chosen document and run a plain
# `match` query against its text, asking for highlights on the text field.
search_terms = "Employment Tribunal"
document_filter = {"ids": {"values": [document.id]}}
text_match = {"match": {"text": search_terms}}

# Kept as the final expression of the cell so the notebook displays the raw
# response for inspection.
es_client.search(
    index=index_name,
    query={"bool": {"must": [document_filter, text_match]}},
    highlight={"fields": {"text": {}}},
    size=1,
)

In [None]:
import re

# One highlighted region wrapped in the <em>/</em> tags that the loop below
# expects in the response. DOTALL lets a region contain line breaks.
HIGHLIGHT_PATTERN = re.compile(r"<em>(.*?)</em>", flags=re.DOTALL)


def highlight_offsets(highlighted):
    """Return the (start, end) offsets of every highlighted region.

    Offsets are expressed in the coordinates of the text with all highlight
    tags removed, so they can be used to slice the un-highlighted text.
    The string is scanned once; a running count of the tag characters seen so
    far converts positions in the tagged string into positions in the
    tag-free string, instead of re-searching and copying the whole string for
    every tag.

    Args:
        highlighted: Text containing zero or more ``<em>...</em>`` regions.

    Returns:
        A list of ``(start, end)`` tuples, in order of appearance. Empty when
        the text contains no complete ``<em>...</em>`` pair.
    """
    offsets = []
    removed = 0  # tag characters that precede the current position
    for match in HIGHLIGHT_PATTERN.finditer(highlighted):
        start = match.start() - removed
        removed += len("<em>")
        # match.end(1) is where the closing tag begins in the tagged string.
        end = match.end(1) - removed
        removed += len("</em>")
        offsets.append((start, end))
    return offsets


spans = []
search_terms = "Employment Tribunal"
results = es_client.search(
    index=index_name,
    query={
        "bool": {
            "must": [
                {"ids": {"values": [document.id]}},
                {"match_phrase": {"text": search_terms}},
            ]
        }
    },
    # number_of_fragments=0 should make Elasticsearch return the whole field
    # rather than snippets (see the highlighting docs), so offsets in the
    # highlighted text can line up with the document text.
    # NOTE(review): assumes the indexed text is identical to document.text — confirm.
    highlight={
        "fields": {"text": {}},
        "number_of_fragments": 0,
    },
    size=1,
)

# NOTE(review): each <em>...</em> region becomes its own span. If Elasticsearch
# wraps every term of the phrase separately, a two-word phrase yields two
# spans — confirm whether adjacent spans should be merged.
for hit in results["hits"]["hits"]:  # zero hits simply means no iterations
    text = hit.get("highlight", {}).get("text", [""])[0]
    spans.extend(
        Span(start_index=start, end_index=end, identifier=concept.id)
        for start, end in highlight_offsets(text)
    )

In [None]:
# Look-up table from a concept's id to its preferred label.
concept_id_to_label = dict(
    (concept.id, concept.preferred_label) for concept in concepts
)

# Show each extracted slice of the document text next to the label of the
# concept it was tagged with.
for span in spans:
    matched_text = document.text[span.start_index : span.end_index]
    label = concept_id_to_label[span.identifier]
    print(matched_text, label)