# Imports

In [5]:
import regex
import pandas as pd

# Baseline Evaluation
To contextualize the performance of the baselines (general and specific), we check how often the key search (with the given fuzziness) yields any result. For the general baseline, no key-value pair is extracted if no match is found. The specific baseline additionally searches for synonyms of some of the keys, which increases the likelihood of finding a key in the document.

In [6]:
# Read the tab-separated input table; later cells use its "filename" and
# "text_best_cleaned" columns.
in_df = pd.read_csv(
    "datasets/kleister_charity/dev-0/in_extended.tsv",
    sep="\t",
)

In [7]:
# prompt key -> gold key
prompt_key_to_gold_key = dict(
    [
        ("Address (post code)", "address__postcode"),
        ("Address (street)", "address__street_line"),
        ("Address (post town)", "address__post_town"),
        ("Charity Name", "charity_name"),
        ("Charity Number", "charity_number"),
        ("Annual Income", "income_annually_in_british_pounds"),
        ("Period End Date", "report_date"),
        ("Annual Spending", "spending_annually_in_british_pounds"),
    ]
)
prompt_keys = [*prompt_key_to_gold_key]
gold_keys = [*prompt_key_to_gold_key.values()]

# the specific baseline additionally searches for these synonyms
# (only the keys listed here have synonyms)
synonyms = {
    "Charity Name": ["Charity Name"],
    "Charity Number": ["Charity Number", "Charity Registration No", "Charity No"],
    "Annual Income": ["Annual Income", "Income", "Total Income"],
    "Period End Date": ["Period End Date", "Period End", "Year Ended"],
    "Annual Spending": ["Annual Spending", "Spending", "Total Spending", "Expenditure"],
}

# per-key tally of how often the key was actually found
key_to_count = dict.fromkeys(prompt_keys, 0)

# running number of key searches performed
total = 0

In [8]:
# NOTE: both settings have to match the respective configuration of the baseline model that is evaluated
type_of_baseline: str = "specific"  # either "general" or "specific"
error_percentage: float = 0.18  # multiplied by the key length to derive the error bound of the fuzzy search

def get_best_match_span(text: str, key: str):
    """
    Fuzzy-search `text` for `key` and return the span of the best match.

    The error bound is derived from the key length:
    n = round(len(key) * error_percentage).

    Pattern pieces:
    (?b) -> BESTMATCH
    (?i) -> IGNORECASE
    {e<n} -> fewer than n errors (subs, inserts, dels) are tolerated
    (1) -> capture group 1, which wraps the key

    Returns the (start, end) span of group 1, or None if nothing matches.

    NOTE(review): `key` is interpolated into the pattern unescaped, so regex
    metacharacters in a key (e.g. the parentheses in "Address (post code)")
    are interpreted as regex syntax rather than literal characters. Kept
    as-is because the search has to mirror the evaluated baseline.
    """
    allowed_errors = round(len(key) * error_percentage)
    pattern = f"(?b)(?i)({key}){{e<{allowed_errors}}}"
    found = regex.search(pattern, text)

    return found.span(1) if found else None

In [9]:
# Start from fresh counters so that re-running this cell (e.g. after changing
# `type_of_baseline` or `error_percentage`) does not add to a previous run.
key_to_count = {key: 0 for key in prompt_keys}
total = 0

# For every key, the list of search terms of which at least one has to match.
if type_of_baseline == "general":
    # the general baseline only searches for the prompt key itself
    key_to_candidates = {key: [key] for key in prompt_keys}
elif type_of_baseline == "specific":
    # the specific baseline searches for any of the synonyms of a key
    key_to_candidates = synonyms
else:
    # fail early instead of silently counting nothing
    raise ValueError(
        f'type_of_baseline must be "general" or "specific", got {type_of_baseline!r}'
    )

# Read each row's text directly instead of looking it up again by filename.
# NOTE(review): this differs from the filename lookup only if a filename occurs
# more than once (the lookup always picked the first such row) — confirm that
# filenames are unique.
for text in in_df["text_best_cleaned"]:
    for key, candidates in key_to_candidates.items():
        total += 1  # one search per (document, key) pair

        # any() stops at the first matching candidate; we only care whether
        # at least one of them can be found in the document
        if any(get_best_match_span(text, candidate) is not None for candidate in candidates):
            key_to_count[key] += 1


In [10]:
if type_of_baseline == "specific":
    num_keys_considered = len(synonyms)

    # Keep only the keys that we use synonyms for. Rebinding to a filtered dict
    # (instead of pop() on the shared dict) keeps this cell re-runnable: a
    # second run no longer raises KeyError for keys that were already removed.
    key_to_count = {key: count for key, count in key_to_count.items() if key in synonyms}
else:
    num_keys_considered = len(prompt_keys)


# `total` counts one search per (document, key) pair, so
# total / num_keys_considered is the number of searches per key.
key_to_count_percentage = {key: count / total * num_keys_considered * 100 for key, count in key_to_count.items()}
print(key_to_count_percentage)

# macro average: mean of the per-key percentages
print(f"(macro)[over keys] average Percentage of how often the keys can be found in the docoument: {sum(key_to_count_percentage.values()) / len(key_to_count_percentage)}")

# micro average: found keys over all searches
print(f"(micro)[over all predictions] average Percentage of how often the keys can be found in the document: {sum(key_to_count.values()) / total * 100}")

# Both averages agree (up to floating-point rounding), which makes sense because
# every class (key) is checked the same number of times.

{'Charity Name': 22.499999999999996, 'Charity Number': 97.72727272727273, 'Annual Income': 97.72727272727273, 'Period End Date': 97.50000000000001, 'Annual Spending': 94.0909090909091}
(macro)[over keys] average Percentage of how often the keys can be found in the docoument: 81.90909090909092
(micro)[over all predictions] average Percentage of how often the keys can be found in the document: 81.9090909090909
