# 1. Define evaluators

In [2]:
import re
from collections import Counter
from typing import List, Dict, Tuple

def normalize(s: str) -> str:
    """Return a canonical form of ``s`` for term comparison.

    The text is lower-cased, every run of characters other than digits,
    ``a``-``z`` and the code points U+00C0..U+017F is turned into a single
    space, whitespace is collapsed and the ends are trimmed.  ``None`` is
    treated as the empty string.
    """
    if s is None:
        return ""
    lowered = s.lower()
    # Anything that is not a kept character acts as a word separator.
    separated = re.sub(r'[^0-9a-z\u00C0-\u017F]+', ' ', lowered)
    collapsed = re.sub(r'\s+', ' ', separated)
    return collapsed.strip()

def tokenize(s: str) -> List[str]:
    """Split ``s`` into normalized word tokens; falsy input gives ``[]``."""
    if not s:
        return []
    return normalize(s).split()

class Evaluator:
    """Score predicted term lists against gold term lists.

    Every metric method returns ``(precision, recall, f1, details)``.  The
    three scores are micro-averaged: true/false positives and false negatives
    are summed over all documents before the ratios are taken.  ``details``
    maps each doc_id to that document's own counts.
    """

    def __init__(self, gold: Dict[str, List[str]], pred: Dict[str, List[str]]):
        """
        gold, pred: dict mapping doc_id -> list of term strings

        The term lists are copied.  A document that appears on only one side
        gets an empty list on the other, so both dicts share the same keys.
        """
        self.gold = {d: list(terms) for d, terms in gold.items()}
        self.pred = {d: list(terms) for d, terms in pred.items()}
        for d in set(self.gold) | set(self.pred):
            self.gold.setdefault(d, [])
            self.pred.setdefault(d, [])

    @staticmethod
    def _prf(tp: int, fp: int, fn: int) -> Tuple[float, float, float]:
        """Turn summed counts into (precision, recall, f1); 0.0 when undefined."""
        p = tp / (tp + fp) if (tp + fp) else 0.0
        r = tp / (tp + fn) if (tp + fn) else 0.0
        f1 = (2 * p * r / (p + r)) if (p + r) else 0.0
        return p, r, f1

    # ---- Exact-match metrics ----
    def exact(self) -> Tuple[float, float, float, Dict]:
        """Match terms whose normalized strings are identical.

        Duplicates are respected: a term predicted twice only scores twice
        if it also appears twice in the gold list.
        """
        tp = fp = fn = 0
        details = {}
        for d in sorted(self.gold.keys()):
            gset = Counter(normalize(t) for t in self.gold[d])
            pset = Counter(normalize(t) for t in self.pred[d])
            # Counter intersection keeps min(count_gold, count_pred) per term.
            doc_tp = sum((gset & pset).values())
            n_gold = sum(gset.values())
            n_pred = sum(pset.values())
            doc_fp = n_pred - doc_tp
            doc_fn = n_gold - doc_tp
            tp += doc_tp; fp += doc_fp; fn += doc_fn
            details[d] = {'tp': doc_tp, 'fp': doc_fp, 'fn': doc_fn,
                          'gold': n_gold, 'pred': n_pred}
        p, r, f1 = self._prf(tp, fp, fn)
        return p, r, f1, details

    # ---- Substring match ----
    def substring(self) -> Tuple[float, float, float, Dict]:
        """Match a prediction to a gold term when one contains the other.

        Matching is greedy in list order: each prediction takes the first
        still-unused gold term that is equal to it, contains it, or is
        contained in it (character-level containment on normalized strings).
        Each gold term can be matched at most once.
        """
        tp = fp = fn = 0
        details = {}
        for d in sorted(self.gold.keys()):
            gold_list = [normalize(t) for t in self.gold[d]]
            pred_list = [normalize(t) for t in self.pred[d]]
            used_gold = [False] * len(gold_list)
            doc_tp = 0
            for p in pred_list:
                for i, g in enumerate(gold_list):
                    if used_gold[i]:
                        continue
                    # An empty string is "in" every string, so a term that
                    # normalizes to "" must not match by containment; it may
                    # still match an equally empty counterpart.
                    if p == g or (p and g and (p in g or g in p)):
                        used_gold[i] = True
                        doc_tp += 1
                        break
            doc_fp = len(pred_list) - doc_tp
            doc_fn = len(gold_list) - doc_tp
            tp += doc_tp; fp += doc_fp; fn += doc_fn
            details[d] = {'tp': doc_tp, 'fp': doc_fp, 'fn': doc_fn,
                          'gold': len(gold_list), 'pred': len(pred_list)}
        p, r, f1 = self._prf(tp, fp, fn)
        return p, r, f1, details

    # ---- IoU token-overlap ----
    def iou_match(self, iou_threshold=0.5) -> Tuple[float, float, float, Dict]:
        """Match terms by intersection-over-union of their token sets.

        For each prediction, in list order, the unused gold term with the
        highest IoU is picked; it counts as a true positive (and becomes
        used) only if that IoU is at least ``iou_threshold``.  Pairs with
        no shared token (IoU 0) are never matched.
        """
        tp = fp = fn = 0
        details = {}
        for d in sorted(self.gold.keys()):
            gold_tokens = [set(tokenize(g)) for g in self.gold[d]]
            pred_tokens = [set(tokenize(p)) for p in self.pred[d]]
            used_gold = [False] * len(gold_tokens)
            doc_tp = 0
            for p_tok in pred_tokens:
                best_i, best_iou = None, 0
                for i, g_tok in enumerate(gold_tokens):
                    if used_gold[i]:
                        continue
                    union = len(p_tok | g_tok)
                    iou = len(p_tok & g_tok) / union if union else 0
                    # Strict '>' keeps the earliest gold term on ties.
                    if iou > best_iou:
                        best_iou, best_i = iou, i
                if best_i is not None and best_iou >= iou_threshold:
                    doc_tp += 1
                    used_gold[best_i] = True
            doc_fp = len(pred_tokens) - doc_tp
            doc_fn = len(gold_tokens) - doc_tp
            tp += doc_tp; fp += doc_fp; fn += doc_fn
            details[d] = {'tp': doc_tp, 'fp': doc_fp, 'fn': doc_fn,
                          'gold': len(gold_tokens), 'pred': len(pred_tokens),
                          'iou_threshold': iou_threshold}
        p, r, f1 = self._prf(tp, fp, fn)
        return p, r, f1, details

    # ---- Token-level metrics ----
    def token_level(self) -> Tuple[float, float, float, Dict]:
        """Compare the bags of tokens of all gold vs. all predicted terms.

        Per document, the tokens of every term are pooled into one multiset
        per side; the overlap of the two multisets gives the true positives.
        ``details`` reports totals as ``gold_tokens`` / ``pred_tokens``.
        """
        tp = fp = fn = 0
        details = {}
        for d in sorted(self.gold.keys()):
            gcount = Counter(tok for g in self.gold[d] for tok in tokenize(g))
            pcount = Counter(tok for p in self.pred[d] for tok in tokenize(p))
            doc_tp = sum((gcount & pcount).values())
            n_gold = sum(gcount.values())
            n_pred = sum(pcount.values())
            doc_fp = n_pred - doc_tp
            doc_fn = n_gold - doc_tp
            tp += doc_tp; fp += doc_fp; fn += doc_fn
            details[d] = {'tp': doc_tp, 'fp': doc_fp, 'fn': doc_fn,
                          'gold_tokens': n_gold,
                          'pred_tokens': n_pred}
        p, r, f1 = self._prf(tp, fp, fn)
        return p, r, f1, details

In [3]:
# Load gold terms, predicted terms and the raw abstract for every paper
# folder under `path`.  All three dicts are keyed by the folder name.
path = '/home/oarga/saccrow-data/papers/benchmark_papers_all'

docs_pred = {}
docs_gold = {}
texts = {}

# list all folders in path
import os
import csv

# The per-paper relative locations never change, so define them once
# instead of on every loop iteration.
pred_file = 'paper/plain/processed/abstract.csv'
gold_file = 'paper/plain/processed/gt_abstract.txt'
text_file = 'paper/plain/processed/abstract.txt'

# os.listdir() returns entries in arbitrary order; sorting keeps the
# processing and printing order reproducible between runs.
for f in sorted(os.listdir(path)):
    if not os.path.isdir(os.path.join(path, f)):
        continue
    pred_path = os.path.join(path, f, pred_file)
    gold_path = os.path.join(path, f, gold_file)
    text_path = os.path.join(path, f, text_file)
    # Gold terms: one per non-blank line.
    # NOTE(review): the files are assumed to be UTF-8; an explicit encoding
    # replaces the platform default used before — confirm against the data.
    with open(gold_path, 'r', encoding='utf-8') as inf:
        docs_gold[f] = [line.strip() for line in inf if line.strip()]
    # Predicted terms: first column of every non-empty CSV row.
    # newline='' lets the csv module handle line endings inside quoted fields.
    # NOTE(review): the first row is read as a term too; if the CSV carries a
    # header row it should be skipped — confirm against the data.
    with open(pred_path, 'r', encoding='utf-8', newline='') as inf:
        reader = csv.reader(inf, delimiter=',')
        docs_pred[f] = [row[0].strip() for row in reader if row and row[0].strip()]
    with open(text_path, 'r', encoding='utf-8') as inf:
        texts[f] = inf.read()

In [None]:
# Fresh accumulators for this run, one per matching strategy.  Each maps a
# metric name to the list that add_results() appends to.
exact_results, substring_results, iou_results, token_results = {}, {}, {}, {}

for dd in (exact_results, substring_results, iou_results, token_results):
    # The generator builds a new empty list for every key of every dict.
    dd.update(
        (name, [])
        for name in ('p_list', 'r_list', 'f1_list', 'tp_list',
                     'fp_list', 'fn_list', 'gold_list', 'pred_list')
    )

def add_results(ddict, results_item):
    """Append one evaluator result to the accumulator ``ddict``.

    ``results_item`` is a ``(precision, recall, f1, details)`` tuple.  The
    three scores are appended once; the counts of every document found in
    ``details`` are appended once per document.  When a document has no
    ``gold`` / ``pred`` entry, its ``gold_tokens`` / ``pred_tokens`` entry
    is used instead (0 if neither exists).
    """
    precision, recall, f_score, per_doc = results_item
    for key, score in (('p_list', precision), ('r_list', recall), ('f1_list', f_score)):
        ddict[key].append(score)

    for counts in per_doc.values():
        for key, field in (('tp_list', 'tp'), ('fp_list', 'fp'), ('fn_list', 'fn')):
            ddict[key].append(counts[field])
        gold_total = counts['gold'] if 'gold' in counts else counts.get('gold_tokens', 0)
        pred_total = counts['pred'] if 'pred' in counts else counts.get('pred_tokens', 0)
        ddict['gold_list'].append(gold_total)
        ddict['pred_list'].append(pred_total)

# Score every document on its own (one Evaluator per document) so that the
# accumulators hold one precision/recall/F1 value per document.
for k in docs_gold.keys():
    print(k)
    gold = {
        k: docs_gold[k]
    }
    pred = {
        k: docs_pred[k]
    }

    ev = Evaluator(gold, pred)
    ee = ev.exact()
    print("    === Exact-match ===", ee)
    add_results(exact_results, ee)

    ee = ev.substring()
    # Print the result tuple like the other three metrics do.
    print("    === Substring-match ===", ee)
    add_results(substring_results, ee)

    ee = ev.iou_match(0.5)
    print("    === IoU token-overlap (0.5) ===", ee)
    add_results(iou_results, ee)

    ee = ev.token_level()
    print("    === Token-level ===", ee)
    add_results(token_results, ee)

    print()

# Report mean ± standard deviation of every accumulated list, one group of
# lines per matching strategy, each group followed by a blank line.
import numpy as np

for label, results in (
    ("Exact", exact_results),
    ("Substring", substring_results),
    ("IoU", iou_results),
    ("Token", token_results),
):
    for k, v in results.items():
        values = np.array(v)
        mean = values.mean()
        std = values.std()
        print(f"{label}-{k}: {mean:.4f} ± {std:.4f}")
    print()

In [8]:
# Load gold terms, the "nosplit" predictions and the raw abstract for every
# paper folder under `path`.  All three dicts are keyed by the folder name.
path = '/home/oarga/saccrow-data/papers/benchmark_papers_all'

docs_pred = {}
docs_gold = {}
texts = {}

# list all folders in path
import os
import csv

# The per-paper relative locations never change, so define them once
# instead of on every loop iteration.
pred_file = 'paper/plain/processed/abstract_nosplit.csv'
gold_file = 'paper/plain/processed/gt_abstract.txt'
text_file = 'paper/plain/processed/abstract.txt'

# os.listdir() returns entries in arbitrary order; sorting keeps the
# processing and printing order reproducible between runs.
for f in sorted(os.listdir(path)):
    if not os.path.isdir(os.path.join(path, f)):
        continue
    pred_path = os.path.join(path, f, pred_file)
    gold_path = os.path.join(path, f, gold_file)
    text_path = os.path.join(path, f, text_file)
    # Gold terms: one per non-blank line.
    # NOTE(review): the files are assumed to be UTF-8; an explicit encoding
    # replaces the platform default used before — confirm against the data.
    with open(gold_path, 'r', encoding='utf-8') as inf:
        docs_gold[f] = [line.strip() for line in inf if line.strip()]
    # Predicted terms: first column of every non-empty CSV row.
    # newline='' lets the csv module handle line endings inside quoted fields.
    # NOTE(review): the first row is read as a term too; if the CSV carries a
    # header row it should be skipped — confirm against the data.
    with open(pred_path, 'r', encoding='utf-8', newline='') as inf:
        reader = csv.reader(inf, delimiter=',')
        docs_pred[f] = [row[0].strip() for row in reader if row and row[0].strip()]
    with open(text_path, 'r', encoding='utf-8') as inf:
        texts[f] = inf.read()

In [None]:
# Fresh accumulators for this run, one per matching strategy.  Each maps a
# metric name to the list that add_results() appends to.
exact_results, substring_results, iou_results, token_results = {}, {}, {}, {}

for dd in (exact_results, substring_results, iou_results, token_results):
    # The generator builds a new empty list for every key of every dict.
    dd.update(
        (name, [])
        for name in ('p_list', 'r_list', 'f1_list', 'tp_list',
                     'fp_list', 'fn_list', 'gold_list', 'pred_list')
    )

def add_results(ddict, results_item):
    """Append one evaluator result to the accumulator ``ddict``.

    ``results_item`` is a ``(precision, recall, f1, details)`` tuple.  The
    three scores are appended once; the counts of every document found in
    ``details`` are appended once per document.  When a document has no
    ``gold`` / ``pred`` entry, its ``gold_tokens`` / ``pred_tokens`` entry
    is used instead (0 if neither exists).
    """
    precision, recall, f_score, per_doc = results_item
    for key, score in (('p_list', precision), ('r_list', recall), ('f1_list', f_score)):
        ddict[key].append(score)

    for counts in per_doc.values():
        for key, field in (('tp_list', 'tp'), ('fp_list', 'fp'), ('fn_list', 'fn')):
            ddict[key].append(counts[field])
        gold_total = counts['gold'] if 'gold' in counts else counts.get('gold_tokens', 0)
        pred_total = counts['pred'] if 'pred' in counts else counts.get('pred_tokens', 0)
        ddict['gold_list'].append(gold_total)
        ddict['pred_list'].append(pred_total)

# Score every document on its own (one Evaluator per document) so that the
# accumulators hold one precision/recall/F1 value per document.
for k in docs_gold.keys():
    print(k)
    gold = {
        k: docs_gold[k]
    }
    pred = {
        k: docs_pred[k]
    }

    ev = Evaluator(gold, pred)
    ee = ev.exact()
    print("    === Exact-match ===", ee)
    add_results(exact_results, ee)

    ee = ev.substring()
    # Print the result tuple like the other three metrics do.
    print("    === Substring-match ===", ee)
    add_results(substring_results, ee)

    ee = ev.iou_match(0.5)
    print("    === IoU token-overlap (0.5) ===", ee)
    add_results(iou_results, ee)

    ee = ev.token_level()
    print("    === Token-level ===", ee)
    add_results(token_results, ee)

    print()

# Report mean ± standard deviation of every accumulated list, one group of
# lines per matching strategy, each group followed by a blank line.
import numpy as np

for label, results in (
    ("Exact", exact_results),
    ("Substring", substring_results),
    ("IoU", iou_results),
    ("Token", token_results),
):
    for k, v in results.items():
        values = np.array(v)
        mean = values.mean()
        std = values.std()
        print(f"{label}-{k}: {mean:.4f} ± {std:.4f}")
    print()

In [10]:
# Load gold terms, the "nofilter" predictions and the raw abstract for every
# paper folder under `path`.  All three dicts are keyed by the folder name.
path = '/home/oarga/saccrow-data/papers/benchmark_papers_all'

docs_pred = {}
docs_gold = {}
texts = {}

# list all folders in path
import os
import csv

# The per-paper relative locations never change, so define them once
# instead of on every loop iteration.
pred_file = 'paper/plain/processed/abstract_nofilter.csv'
gold_file = 'paper/plain/processed/gt_abstract.txt'
text_file = 'paper/plain/processed/abstract.txt'

# os.listdir() returns entries in arbitrary order; sorting keeps the
# processing and printing order reproducible between runs.
for f in sorted(os.listdir(path)):
    if not os.path.isdir(os.path.join(path, f)):
        continue
    pred_path = os.path.join(path, f, pred_file)
    gold_path = os.path.join(path, f, gold_file)
    text_path = os.path.join(path, f, text_file)
    # Gold terms: one per non-blank line.
    # NOTE(review): the files are assumed to be UTF-8; an explicit encoding
    # replaces the platform default used before — confirm against the data.
    with open(gold_path, 'r', encoding='utf-8') as inf:
        docs_gold[f] = [line.strip() for line in inf if line.strip()]
    # Predicted terms: first column of every non-empty CSV row.
    # newline='' lets the csv module handle line endings inside quoted fields.
    # NOTE(review): the first row is read as a term too; if the CSV carries a
    # header row it should be skipped — confirm against the data.
    with open(pred_path, 'r', encoding='utf-8', newline='') as inf:
        reader = csv.reader(inf, delimiter=',')
        docs_pred[f] = [row[0].strip() for row in reader if row and row[0].strip()]
    with open(text_path, 'r', encoding='utf-8') as inf:
        texts[f] = inf.read()

In [None]:
# Fresh accumulators for this run, one per matching strategy.  Each maps a
# metric name to the list that add_results() appends to.
exact_results, substring_results, iou_results, token_results = {}, {}, {}, {}

for dd in (exact_results, substring_results, iou_results, token_results):
    # The generator builds a new empty list for every key of every dict.
    dd.update(
        (name, [])
        for name in ('p_list', 'r_list', 'f1_list', 'tp_list',
                     'fp_list', 'fn_list', 'gold_list', 'pred_list')
    )

def add_results(ddict, results_item):
    """Append one evaluator result to the accumulator ``ddict``.

    ``results_item`` is a ``(precision, recall, f1, details)`` tuple.  The
    three scores are appended once; the counts of every document found in
    ``details`` are appended once per document.  When a document has no
    ``gold`` / ``pred`` entry, its ``gold_tokens`` / ``pred_tokens`` entry
    is used instead (0 if neither exists).
    """
    precision, recall, f_score, per_doc = results_item
    for key, score in (('p_list', precision), ('r_list', recall), ('f1_list', f_score)):
        ddict[key].append(score)

    for counts in per_doc.values():
        for key, field in (('tp_list', 'tp'), ('fp_list', 'fp'), ('fn_list', 'fn')):
            ddict[key].append(counts[field])
        gold_total = counts['gold'] if 'gold' in counts else counts.get('gold_tokens', 0)
        pred_total = counts['pred'] if 'pred' in counts else counts.get('pred_tokens', 0)
        ddict['gold_list'].append(gold_total)
        ddict['pred_list'].append(pred_total)

# Score every document on its own (one Evaluator per document) so that the
# accumulators hold one precision/recall/F1 value per document.
for k in docs_gold.keys():
    print(k)
    gold = {
        k: docs_gold[k]
    }
    pred = {
        k: docs_pred[k]
    }

    ev = Evaluator(gold, pred)
    ee = ev.exact()
    print("    === Exact-match ===", ee)
    add_results(exact_results, ee)

    ee = ev.substring()
    # Print the result tuple like the other three metrics do.
    print("    === Substring-match ===", ee)
    add_results(substring_results, ee)

    ee = ev.iou_match(0.5)
    print("    === IoU token-overlap (0.5) ===", ee)
    add_results(iou_results, ee)

    ee = ev.token_level()
    print("    === Token-level ===", ee)
    add_results(token_results, ee)

    print()

# Report mean ± standard deviation of every accumulated list, one group of
# lines per matching strategy, each group followed by a blank line.
import numpy as np

for label, results in (
    ("Exact", exact_results),
    ("Substring", substring_results),
    ("IoU", iou_results),
    ("Token", token_results),
):
    for k, v in results.items():
        values = np.array(v)
        mean = values.mean()
        std = values.std()
        print(f"{label}-{k}: {mean:.4f} ± {std:.4f}")
    print()

# 2. Evaluate C/NC

In [None]:
import spacy
from collections import defaultdict, Counter
import math

# Load the spaCy "en_core_web_sm" pipeline.  The Docs it produces supply the
# noun chunks that extract_candidates() turns into candidate terms.
nlp = spacy.load("en_core_web_sm")

# ----------------------------
# Step 1: Candidate term extraction (noun phrases)
# ----------------------------
def extract_candidates(doc):
    """Return the noun chunks of ``doc`` as candidate terms.

    Each chunk's text is lower-cased and trimmed; chunks left without any
    word are dropped.  Single-word and multi-word chunks are both kept, and
    repeated chunks appear once per occurrence.
    """
    cleaned = (chunk.text.lower().strip() for chunk in doc.noun_chunks)
    return [term for term in cleaned if term.split()]

# ----------------------------
# Step 2: Compute C-value
# ----------------------------
def compute_c_values(corpus_candidates):
    """Compute a C-value score for every distinct candidate term.

    Parameters:
        corpus_candidates: iterable of per-document candidate lists (strings).

    Returns:
        dict mapping term -> score, where with f = corpus frequency and
        |a| = number of words in term a:
            log2(|a| + 1) * f(a)                     if a is nested nowhere
            log2(|a| + 1) * (f(a) - mean f(b))       over longer terms b
                                                     that contain a
        The +1 keeps single-word terms from scoring log2(1) = 0.
    """
    freq = Counter()
    for cands in corpus_candidates:
        freq.update(cands)

    # Nesting must respect word boundaries: "ion" is nested in "ion channel"
    # but not in "solution".  Padding the whitespace-normalized terms with
    # spaces turns that into a plain substring test.
    padded = {term: " " + " ".join(term.split()) + " " for term in freq}

    nested_map = defaultdict(list)
    for a, padded_a in padded.items():
        if not padded_a.strip():
            continue  # a term without words cannot be nested in anything
        for b, padded_b in padded.items():
            if padded_a != padded_b and padded_a in padded_b:
                nested_map[a].append(b)

    c_values = {}
    for term, f_a in freq.items():
        length_weight = math.log2(len(term.split()) + 1)
        longer_terms = nested_map.get(term)
        if longer_terms:
            nested_freq = sum(freq[b] for b in longer_terms) / len(longer_terms)
            c_values[term] = length_weight * (f_a - nested_freq)
        else:
            c_values[term] = length_weight * f_a
    return c_values

# ----------------------------
# Step 3: Compute NC-value
# ----------------------------
def compute_nc_values(corpus, c_values, alpha=0.8):
    """Blend each term's C-value with a context-word score.

    Parameters:
        corpus: iterable of objects with a ``.text`` string attribute.
        c_values: dict mapping term -> C-value.
        alpha: weight of the C-value; the context score gets ``1 - alpha``.

    Returns:
        dict mapping term -> ``alpha * c_value + (1 - alpha) * ctx_score``,
        in the key order of ``c_values``.

    Context words are collected as follows: for every document whose
    lower-cased text contains a term (substring test), each whitespace token
    equal to one of the term's words adds one count to its left and right
    neighbour tokens.  Counts are normalized to sum to 1.

    NOTE(review): ``ctx_score`` sums the weights of the term's *own* words,
    not of the words found around the term — confirm this is the intended
    variant of the NC-value.
    """
    context_words = defaultdict(int)

    # Split every term once instead of once per token of every document.
    term_words = {term: set(term.split()) for term in c_values}

    # Collect context words from all docs
    for doc in corpus:
        # Lower-casing and tokenizing depend only on the document.
        text = doc.text.lower()
        tokens = text.split()
        last = len(tokens) - 1
        for term, words in term_words.items():
            if term not in text:
                continue
            for i, tok in enumerate(tokens):
                if tok in words:
                    if i > 0:
                        context_words[tokens[i - 1]] += 1
                    if i < last:
                        context_words[tokens[i + 1]] += 1

    # Normalize context weights (every stored count is >= 1, so a non-empty
    # dict always has a positive total).
    total = sum(context_words.values())
    weights = {w: c / total for w, c in context_words.items()}

    # Compute NC-values
    nc_values = {}
    for term, c_val in c_values.items():
        ctx_score = sum(weights.get(w, 0) for w in term.split())
        nc_values[term] = alpha * c_val + (1 - alpha) * ctx_score

    return nc_values

# ----------------------------
# Step 4: Ranking and Filtering
# ----------------------------
def rank_terms(nc_values, top_n=None, min_score=None):
    """Return ``(term, score)`` pairs ordered from highest to lowest score.

    ``min_score`` (if given) drops pairs scoring below it; ``top_n`` (if
    given) then keeps only the first ``top_n`` pairs.  Terms with equal
    scores keep the order they have in ``nc_values``.
    """
    by_score_desc = sorted(nc_values.items(), key=lambda pair: pair[1], reverse=True)

    if min_score is not None:
        by_score_desc = [pair for pair in by_score_desc if pair[1] >= min_score]

    return by_score_desc if top_n is None else by_score_desc[:top_n]

# ----------------------------
# Run the C/NC pipeline over all loaded abstracts
# ----------------------------
corpus_texts = list(texts.values())

# Parse every abstract with spaCy, then pull the candidate terms per document.
docs = list(map(nlp, corpus_texts))
corpus_candidates = list(map(extract_candidates, docs))

# Score candidates over the whole corpus.
c_values = compute_c_values(corpus_candidates)
nc_values = compute_nc_values(docs, c_values)

# To inspect the ten best terms instead of thresholding, iterate over
# rank_terms(nc_values, top_n=10).

# Keep (and list) every term whose NC-value reaches the threshold.
print("\nTerms with NC-value >= 1.0:")
kept = rank_terms(nc_values, min_score=1.0)
for term, score in kept:
    print(f"{term} -> {score:.4f}")
selected_terms = [term for term, _ in kept]


In [13]:
def clean_term(term):
    """Lower-case ``term``, trim it and drop one leading determiner.

    Removes a leading "the", "this", "a" or "an" when it is followed by
    another word, e.g. ``"The cell"`` -> ``"cell"``.  Words that merely
    start with those letters ("theory", "anode") and a determiner standing
    alone are left unchanged.

    The earlier ``endswith('this ')`` / ``endswith('a ')`` / ``endswith('an ')``
    checks could never match because the term had already been stripped of
    trailing whitespace; they are applied to the start of the term instead.
    """
    term = term.strip().lower()
    # The trailing space in each prefix guarantees a whole-word match.
    for determiner in ('the ', 'this ', 'an ', 'a '):
        if term.startswith(determiner):
            term = term[len(determiner):]
            break
    return term.strip()

# Clean the selected terms in place (the list object is kept).
selected_terms[:] = [clean_term(term) for term in selected_terms]

# A document's predictions are the selected terms that occur in its
# lower-cased text (substring test).
docs_pred = {}
for f, text in texts.items():
    lowered = text.lower()  # once per document, not once per term
    # The set removes duplicate terms; sorting gives a stable order, since
    # the greedy substring/IoU matchers depend on prediction order and set
    # iteration order for strings can change between interpreter runs.
    docs_pred[f] = sorted({term for term in selected_terms if term in lowered})

In [None]:
# Fresh accumulators for this run, one per matching strategy.  Each maps a
# metric name to the list that add_results() appends to.
exact_results, substring_results, iou_results, token_results = {}, {}, {}, {}

for dd in (exact_results, substring_results, iou_results, token_results):
    # The generator builds a new empty list for every key of every dict.
    dd.update(
        (name, [])
        for name in ('p_list', 'r_list', 'f1_list', 'tp_list',
                     'fp_list', 'fn_list', 'gold_list', 'pred_list')
    )

def add_results(ddict, results_item):
    """Append one evaluator result to the accumulator ``ddict``.

    ``results_item`` is a ``(precision, recall, f1, details)`` tuple.  The
    three scores are appended once; the counts of every document found in
    ``details`` are appended once per document.  When a document has no
    ``gold`` / ``pred`` entry, its ``gold_tokens`` / ``pred_tokens`` entry
    is used instead (0 if neither exists).
    """
    precision, recall, f_score, per_doc = results_item
    for key, score in (('p_list', precision), ('r_list', recall), ('f1_list', f_score)):
        ddict[key].append(score)

    for counts in per_doc.values():
        for key, field in (('tp_list', 'tp'), ('fp_list', 'fp'), ('fn_list', 'fn')):
            ddict[key].append(counts[field])
        gold_total = counts['gold'] if 'gold' in counts else counts.get('gold_tokens', 0)
        pred_total = counts['pred'] if 'pred' in counts else counts.get('pred_tokens', 0)
        ddict['gold_list'].append(gold_total)
        ddict['pred_list'].append(pred_total)

# Score every document on its own (one Evaluator per document) so that the
# accumulators hold one precision/recall/F1 value per document.
for k in docs_gold.keys():
    print(k)
    gold = {
        k: docs_gold[k]
    }
    pred = {
        k: docs_pred[k]
    }

    ev = Evaluator(gold, pred)
    ee = ev.exact()
    print("    === Exact-match ===", ee)
    add_results(exact_results, ee)

    ee = ev.substring()
    # Print the result tuple like the other three metrics do.
    print("    === Substring-match ===", ee)
    add_results(substring_results, ee)

    ee = ev.iou_match(0.5)
    print("    === IoU token-overlap (0.5) ===", ee)
    add_results(iou_results, ee)

    ee = ev.token_level()
    print("    === Token-level ===", ee)
    add_results(token_results, ee)

    print()

# Report mean ± standard deviation of every accumulated list, one group of
# lines per matching strategy, each group followed by a blank line.
import numpy as np

for label, results in (
    ("Exact", exact_results),
    ("Substring", substring_results),
    ("IoU", iou_results),
    ("Token", token_results),
):
    for k, v in results.items():
        values = np.array(v)
        mean = values.mean()
        std = values.std()
        print(f"{label}-{k}: {mean:.4f} ± {std:.4f}")
    print()

# 3. Evaluate ChemBERT (keyword extraction via KeyBERT)

In [None]:
from keybert import KeyBERT

def extract_keywords_from_corpus(corpus, ngram_range=(1, 3), stop_words='english', top_n=100, min_score=0.3, kw_model=None):
    """
    Extract keywords from a corpus of texts.

    Parameters:
        corpus (list[str]): List of documents (strings).
        ngram_range (tuple): Size of n-grams to extract (default: (1,3)).
        stop_words (str or list): Stop words to remove (default: 'english').
        top_n (int): Number of top candidates to return per document (default: 100).
        min_score (float): Score a keyword must strictly exceed to be kept (default: 0.3).
        kw_model: Object with a KeyBERT-style ``extract_keywords`` method.
            When omitted, a default ``KeyBERT()`` is created on first use and
            reused by later calls, so the model is not rebuilt per call.

    Returns:
        list[str]: The unique keywords across the corpus, sorted
        alphabetically so the order is the same on every run.
    """
    if kw_model is None:
        # Cache the default model on the function object.
        kw_model = getattr(extract_keywords_from_corpus, "_default_model", None)
        if kw_model is None:
            kw_model = KeyBERT()
            extract_keywords_from_corpus._default_model = kw_model

    all_keywords = set()  # use set to avoid duplicates

    for text in corpus:
        keywords = kw_model.extract_keywords(
            text,
            keyphrase_ngram_range=ngram_range,
            stop_words=stop_words,
            top_n=top_n
        )
        # Filter by score
        all_keywords.update(kw for kw, score in keywords if score > min_score)

    return sorted(all_keywords)


# Run the extractor on one abstract at a time so the predictions stay
# separate per document.
docs_pred = {
    doc_id: extract_keywords_from_corpus([abstract])
    for doc_id, abstract in texts.items()
}

# Display the predictions as the cell output.
docs_pred

In [None]:
# Fresh accumulators for this run, one per matching strategy.  Each maps a
# metric name to the list that add_results() appends to.
exact_results, substring_results, iou_results, token_results = {}, {}, {}, {}

for dd in (exact_results, substring_results, iou_results, token_results):
    # The generator builds a new empty list for every key of every dict.
    dd.update(
        (name, [])
        for name in ('p_list', 'r_list', 'f1_list', 'tp_list',
                     'fp_list', 'fn_list', 'gold_list', 'pred_list')
    )

def add_results(ddict, results_item):
    """Append one evaluator result to the accumulator ``ddict``.

    ``results_item`` is a ``(precision, recall, f1, details)`` tuple.  The
    three scores are appended once; the counts of every document found in
    ``details`` are appended once per document.  When a document has no
    ``gold`` / ``pred`` entry, its ``gold_tokens`` / ``pred_tokens`` entry
    is used instead (0 if neither exists).
    """
    precision, recall, f_score, per_doc = results_item
    for key, score in (('p_list', precision), ('r_list', recall), ('f1_list', f_score)):
        ddict[key].append(score)

    for counts in per_doc.values():
        for key, field in (('tp_list', 'tp'), ('fp_list', 'fp'), ('fn_list', 'fn')):
            ddict[key].append(counts[field])
        gold_total = counts['gold'] if 'gold' in counts else counts.get('gold_tokens', 0)
        pred_total = counts['pred'] if 'pred' in counts else counts.get('pred_tokens', 0)
        ddict['gold_list'].append(gold_total)
        ddict['pred_list'].append(pred_total)

# Score every document on its own (one Evaluator per document) so that the
# accumulators hold one precision/recall/F1 value per document.
for k in docs_gold.keys():
    print(k)
    gold = {
        k: docs_gold[k]
    }
    pred = {
        k: docs_pred[k]
    }

    ev = Evaluator(gold, pred)
    ee = ev.exact()
    print("    === Exact-match ===", ee)
    add_results(exact_results, ee)

    ee = ev.substring()
    # Print the result tuple like the other three metrics do.
    print("    === Substring-match ===", ee)
    add_results(substring_results, ee)

    ee = ev.iou_match(0.5)
    print("    === IoU token-overlap (0.5) ===", ee)
    add_results(iou_results, ee)

    ee = ev.token_level()
    print("    === Token-level ===", ee)
    add_results(token_results, ee)

    print()

# Report mean ± standard deviation of every accumulated list, one group of
# lines per matching strategy, each group followed by a blank line.
import numpy as np

for label, results in (
    ("Exact", exact_results),
    ("Substring", substring_results),
    ("IoU", iou_results),
    ("Token", token_results),
):
    for k, v in results.items():
        values = np.array(v)
        mean = values.mean()
        std = values.std()
        print(f"{label}-{k}: {mean:.4f} ± {std:.4f}")
    print()