# Imports

In [119]:
import regex
import pandas as pd
import numpy as np

import re
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from Levenshtein import distance
from collections import Counter

# Constants

In [120]:
# NOTE: this has to match the respective configuration of the baseline model that is evaluated
TYPE_OF_BASELINE = "GENERAL" # "SPECIFIC" or "GENERAL"
# Fraction of a key's length that is turned into the error budget of the fuzzy key search.
ERROR_PERCENTAGE = 0.18

# Fail fast on a mistyped baseline name: later cells branch on this value with
# if/elif chains that have no else, so an unknown value would otherwise be skipped silently.
assert TYPE_OF_BASELINE in ("GENERAL", "SPECIFIC"), (
    f"TYPE_OF_BASELINE must be 'GENERAL' or 'SPECIFIC', got {TYPE_OF_BASELINE!r}"
)

# Loading OCR input

In [121]:
# Tab-separated OCR input of the dev-0 split; later cells read its "text_best_cleaned" column.
in_df = pd.read_csv(
    "datasets/kleister_charity/dev-0/in_extended.tsv",
    sep="\t",
)

# Baseline Evaluation
In order to contextualize the performance of the baselines (general and specific), we want to check how often the key search (with the given fuzziness) yields any result. For the general baseline, no key-value pair is extracted if no match is found. The specific baseline additionally searches for synonyms of some of the keys, which increases the likelihood of finding a key in the document.

In [122]:
# Prompt key (human-readable) -> key name used in the gold annotations.
prompt_key_to_gold_key = dict(
    [
        ("Address (post code)", "address__postcode"),
        ("Address (street)", "address__street_line"),
        ("Address (post town)", "address__post_town"),
        ("Charity Name", "charity_name"),
        ("Charity Number", "charity_number"),
        ("Annual Income", "income_annually_in_british_pounds"),
        ("Period End Date", "report_date"),
        ("Annual Spending", "spending_annually_in_british_pounds"),
    ]
)
# Both lists follow the insertion order of the mapping above.
prompt_keys = [prompt_key for prompt_key in prompt_key_to_gold_key]
gold_keys = [prompt_key_to_gold_key[prompt_key] for prompt_key in prompt_keys]

# Search terms of the specific baseline: each listed key is also looked up under these synonyms.
# The address keys have no entry here.
synonyms = {
    "Charity Name": ["Charity Name"],
    "Charity Number": ["Charity Number", "Charity Registration No", "Charity No"],
    "Annual Income": ["Annual Income", "Income", "Total Income"],
    "Period End Date": ["Period End Date", "Period End", "Year Ended"],
    "Annual Spending": ["Annual Spending", "Spending", "Total Spending", "Expenditure"],
}

# Per prompt key: number of documents in which the key was found (filled in by the search loop).
key_to_count = dict.fromkeys(prompt_keys, 0)

In [123]:
def get_best_match_span(text: str, key: str):
    """
    Returns the (start, end) span of the best fuzzy match for the key in the text,
    or None if nothing matches within the allowed number of errors.

    The key is matched literally: it is escaped before being put into the pattern,
    so keys that contain regex metacharacters (e.g. the parentheses in
    "Address (post code)") are not interpreted as pattern syntax.

    (?b) -> BESTMATCH
    (?i) -> IGNORECASE
    {e<n} -> strictly fewer than n errors (subs, inserts, dels). otherwise -> None
    (1) -> the span of the best match

    n is round(len(key) * ERROR_PERCENTAGE).
    """
    max_errors = round(len(key) * ERROR_PERCENTAGE)

    # NOTE(review): this cell is meant to mirror the key search of the evaluated baseline;
    # confirm that the baseline escapes its keys in the same way.
    pattern = f"(?b)(?i)({regex.escape(key)}){{e<{max_errors}}}"
    match = regex.search(pattern, text)

    if match is None:
        return None
    return match.span(1)

In [124]:
# Reset both counters here so that re-running this cell does not add to counts from an earlier run.
total = 0
key_to_count = {key: 0 for key in prompt_keys}

# Map every key that is searched for to the terms that count as "found":
# the key itself for the general baseline, its synonyms for the specific baseline.
if TYPE_OF_BASELINE == "GENERAL":
    key_to_search_terms = {key: [key] for key in prompt_keys}
elif TYPE_OF_BASELINE == "SPECIFIC":
    key_to_search_terms = synonyms
else:
    raise ValueError(f"Unknown TYPE_OF_BASELINE: {TYPE_OF_BASELINE!r} (expected 'GENERAL' or 'SPECIFIC')")

# Each row's own text is searched directly.
# NOTE(review): the previous version looked the text up via the row's filename; the two are
# equivalent as long as filenames are unique in in_df - confirm.
for text in in_df["text_best_cleaned"]:
    for key, search_terms in key_to_search_terms.items():
        total += 1  # one check per (document, key) pair

        # any() stops at the first matching term, as we are only interested in whether any of them matches
        if any(get_best_match_span(text, term) is not None for term in search_terms):
            key_to_count[key] += 1


In [125]:
if TYPE_OF_BASELINE == "SPECIFIC":
    num_keys_considered = len(synonyms)

    # keep only the keys that we use synonyms for; building a new dict (instead of popping
    # from the existing one) keeps this cell re-runnable
    key_to_count = {key: count for key, count in key_to_count.items() if key in synonyms}
else:
    num_keys_considered = len(prompt_keys)

if total == 0:
    raise ValueError("No key searches were recorded (total == 0); run the key-search cell on a non-empty input first.")

# `total` counts one check per (document, key) pair, so total / num_keys_considered is the
# number of documents and each ratio is the share of documents in which the key was found.
key_to_count_ratio = {key: count / total * num_keys_considered for key, count in key_to_count.items()}
print(key_to_count_ratio)

print(f"(macro)[over keys] average Percentage of how often the keys can be found in the docoument: {sum(key_to_count_ratio.values()) / len(key_to_count_ratio)}")
print(f"(micro)[over all predictions] average Percentage of how often the keys can be found in the document: {sum(key_to_count.values()) / total}")

# they are the same which makes sense because all classes (keys) are checked the same amount of times

{'Address (post code)': 0.0022727272727272726, 'Address (street)': 0.004545454545454545, 'Address (post town)': 0.0, 'Charity Name': 0.225, 'Charity Number': 0.75, 'Annual Income': 0.04772727272727273, 'Period End Date': 0.10681818181818181, 'Annual Spending': 0.00909090909090909}
(macro)[over keys] average Percentage of how often the keys can be found in the docoument: 0.14318181818181816
(micro)[over all predictions] average Percentage of how often the keys can be found in the document: 0.1431818181818182


# Evaluating results with own definition of correctness

In [126]:
# Column layout shared by the expected and the prediction frames ("raw" holds the unparsed line).
COLUMN_ORDER = [
    "raw",
    "Address (post town)",
    "Address (post code)",
    "Address (street)",
    "Charity Name",
    "Charity Number",
    "Annual Income",
    "Period End Date",
    "Annual Spending",
]

In [127]:
# Prediction file(s) of the baseline selected via TYPE_OF_BASELINE. The file names spell out
# the pipeline configuration that produced them (error_percentage=0.18 in both cases).
# There is no else branch: for any other value PREDICTION_RUNS_PATHS is not defined here.
if TYPE_OF_BASELINE == "GENERAL":
    PREDICTION_RUNS_PATHS = ['datasets/kleister_charity/dev-0/predictions/baselines/BaselinePipeline(pdf_to_text_model=KleisterCharityWrapper, model=Baseline(error_percentage=0.18, allowed_entity_range=40), parser=KleisterCharityParser, ner_tagger=en_core_web_sm)_2022-11-24T00-37-43.tsv']

elif TYPE_OF_BASELINE == "SPECIFIC":
    PREDICTION_RUNS_PATHS = ['datasets/kleister_charity/dev-0/predictions/baselines/2023-01-29T18-14-19_BaselinePipeline(pdf_to_text_model=KleisterCharityWrapper, model=SpecificBaseline(error_percentage=0.18, allowed_entity_range=40), parser=KleisterCharityParser, ner_tagger=en_core_web_sm).tsv']

In [128]:
expected_raw = pd.read_csv('datasets/kleister_charity/dev-0/expected.tsv', sep='\t', header=None, names=['raw'])

# Every line is a space-separated list of key=value pairs. Parse each line once into a dict;
# splitting on the first "=" only keeps values that contain "=" intact (same rule as for the predictions).
expected = pd.DataFrame(
    [
        dict(key_value.split("=", 1) for key_value in raw_line.split(" "))
        for raw_line in expected_raw["raw"]
    ],
    index=expected_raw.index,
)

# renaming by key name instead of by position, so the result does not depend on the order
# in which the keys happen to appear first in the file
gold_key_to_prompt_key = {gold_key: prompt_key for prompt_key, gold_key in prompt_key_to_gold_key.items()}
expected = expected.rename(columns=gold_key_to_prompt_key)

unknown_columns = set(expected.columns) - set(COLUMN_ORDER)
assert not unknown_columns, f"Unexpected keys in expected.tsv: {sorted(unknown_columns)}"

# sorting for better readability ("raw" is left out; a key that never occurs becomes an all-NaN column)
expected = expected.reindex(columns=[column for column in COLUMN_ORDER if column != "raw"])
expected.head()

Unnamed: 0,Address (post town),Address (post code),Address (street),Charity Name,Charity Number,Annual Income,Period End Date,Annual Spending
0,BROADWAY,WR12_7NL,,Wormington_Village_Society,1155074,,2018-07-31,
1,WESTCLIFF-ON-SEA,SS0_8HX,47_SECOND_AVENUE,Havens_Christian_Hospice,1022119,10348000.0,2016-03-31,9415000.0
2,CHELTENHAM,GL50_3EP,BAYSHILL_ROAD,Cheltenham_Ladies_College,311722,32168000.0,2016-07-31,27972000.0
3,SHREWSBURY,SY3_7PQ,58_TRINITY_STREET,The_Sanata_Charitable_Trust,1132766,255653.0,2015-12-31,258287.0
4,WARE,SG11_2DY,,Cantate_Youth_Choir,1039369,122836.0,2013-12-31,124446.0


In [129]:
# Columns are matched by key name instead of by position: the order in which keys first
# appear in a prediction file differs between runs, which made positional renaming fragile.
# NOTE(review): this assumes the prediction files use the gold key names (or already the
# prompt keys); the assert below fails loudly if they use anything else - confirm.
gold_key_to_prompt_key = {gold_key: prompt_key for prompt_key, gold_key in prompt_key_to_gold_key.items()}
evaluated_columns = [column for column in COLUMN_ORDER if column != "raw"]

prediction_runs_dfs = []
for prediction_run_path in PREDICTION_RUNS_PATHS:
    # skip_blank_lines=False keeps blank lines as NaN rows instead of dropping them
    raw_predictions = pd.read_csv(prediction_run_path, sep='\t', header=None, names=['raw'], skip_blank_lines=False)

    # Parse every line once into a {key: value} dict; a blank line yields an all-NaN row.
    parsed_rows = [
        {} if pd.isna(raw_prediction)
        else dict(key_value.split("=", 1) for key_value in raw_prediction.split(" "))
        for raw_prediction in raw_predictions["raw"]
    ]
    prediction_run_df = pd.DataFrame(parsed_rows, index=raw_predictions.index)
    prediction_run_df = prediction_run_df.rename(columns=gold_key_to_prompt_key)

    unknown_columns = set(prediction_run_df.columns) - set(evaluated_columns)
    assert not unknown_columns, f"Unexpected keys in {prediction_run_path}: {sorted(unknown_columns)}"

    # bring the columns into the shared order and add any key that was never predicted as NaN
    # (a model may predict only some of the keys)
    prediction_run_df = prediction_run_df.reindex(columns=evaluated_columns)
    prediction_runs_dfs.append(prediction_run_df)

assert len(prediction_runs_dfs) == 1

In [130]:
# Preview the first rows of the parsed predictions of the first run.
prediction_runs_dfs[0].head()

Unnamed: 0,Address (post town),Address (post code),Address (street),Charity Name,Charity Number,Annual Income,Period End Date,Annual Spending
0,,,,Wormington_Village_Society_Charity,1155074,,,
1,,,,,1022119,,,
2,,,,,311722,,,
3,,,,,1132766,,,
4,,,,,1039369,,,


In [131]:
def is_correct(key, expected_value, predicted_value):
    """
    Our definition of correctness for each key with the normalisation rules applied.

    Both values are compared case-insensitively (upper-cased string form). Per key:
    - "Address (post town)": a leading "CITY_OF_" / "TOWN_OF_" is dropped; the values may
      differ by a weighted Levenshtein distance of at most 1 (weights=(1, 1, 2)).
    - "Address (street)": "_-_" is collapsed to "-"; exact match.
    - "Charity Name": a trailing "_LTD", "_LTD." or "_LIMITED" is dropped, "&" is written
      out as "AND", "_-_" is collapsed to "-"; weighted Levenshtein distance of at most 1.
    - every other key: exact match.

    A missing value (None / NaN) is only ever equal to another missing value; it is not
    compared through its string form "NAN".

    NOTE(review): a later cell defines `is_correct` again; on a top-to-bottom run that later
    definition is the one the evaluation uses.

    Returns True if the prediction counts as correct, else False.
    """
    expected_missing = pd.isna(expected_value)
    predicted_missing = pd.isna(predicted_value)
    if expected_missing or predicted_missing:
        return bool(expected_missing and predicted_missing)

    upper_cased_expected = str(expected_value).upper()
    upper_cased_predicted = str(predicted_value).upper()

    if key == "Address (post town)":
        # both prefixes are 8 characters long
        if upper_cased_expected.startswith(("CITY_OF_", "TOWN_OF_")):
            upper_cased_expected = upper_cased_expected[8:]
        if upper_cased_predicted.startswith(("CITY_OF_", "TOWN_OF_")):
            upper_cased_predicted = upper_cased_predicted[8:]
        return distance(upper_cased_expected, upper_cased_predicted, weights=(1, 1, 2)) <= 1

    elif key == "Address (street)":
        upper_cased_expected = re.sub(r"_-_", "-", upper_cased_expected)
        upper_cased_predicted = re.sub(r"_-_", "-", upper_cased_predicted)
        return upper_cased_expected == upper_cased_predicted

    elif key == "Charity Name":
        # the dot is escaped so that only a literal "_LTD." is stripped (not "_LTD" + any character)
        upper_cased_expected = re.sub(r"(_LTD|_LTD\.|_LIMITED)$", "", upper_cased_expected)
        upper_cased_predicted = re.sub(r"(_LTD|_LTD\.|_LIMITED)$", "", upper_cased_predicted)

        # the strings are upper-cased at this point, so "&" has to become "AND" to equal a spelled-out "and"
        upper_cased_expected = upper_cased_expected.replace("&", "AND")
        upper_cased_predicted = upper_cased_predicted.replace("&", "AND")

        upper_cased_expected = re.sub(r"_-_", "-", upper_cased_expected)
        upper_cased_predicted = re.sub(r"_-_", "-", upper_cased_predicted)
        return distance(upper_cased_expected, upper_cased_predicted, weights=(1, 1, 2)) <= 1

    else:
        return upper_cased_expected == upper_cased_predicted

## Normalisations

In [132]:
def replace_quotation_mark(df):
    """
    Swap every U+2019 (right single quotation mark) for U+0027 (apostrophe) in the string
    cells of a dataframe. The dataframe is modified in place.

    Returns the number of cells that were changed (a cell with several marks counts once).
    """
    changed_cells = 0
    for column in df.columns:
        for row_label, cell in df[column].items():
            # only string cells that actually contain the mark are touched
            if not isinstance(cell, str) or "’" not in cell:
                continue
            df.loc[row_label, column] = cell.replace("’", "'")
            changed_cells += 1
    return changed_cells

# Normalise the predictions run by run, then the gold values, and report how much was changed.
for run_number, run_df in enumerate(prediction_runs_dfs):
    print(f"Replaced {replace_quotation_mark(run_df)} quotation marks in prediction run {run_number}.")

print(f"Replaced {replace_quotation_mark(expected)} quotation marks in expected.")

Replaced 0 quotation marks in prediction run 0.
Replaced 4 quotation marks in expected.


In [133]:
def is_correct(key, expected_value, predicted_value):
    """
    Our definition of correctness for each key with the normalisation rules applied.

    Both values are compared case-insensitively (upper-cased string form). Per key:
    - "Address (post town)": a leading "CITY_OF_" / "TOWN_OF_" is dropped; the values may
      differ by a weighted Levenshtein distance of at most 1 (weights=(1, 1, 2)).
    - "Address (street)": "_-_" is collapsed to "-"; exact match.
    - "Charity Name": a trailing "_LTD", "_LTD." or "_LIMITED" is dropped, "&" is written
      out as "AND", "_-_" is collapsed to "-"; weighted Levenshtein distance of at most 1.
    - every other key: exact match.

    A missing value (None / NaN) is only ever equal to another missing value; it is not
    compared through its string form "NAN".

    NOTE(review): `is_correct` is also defined in an earlier cell; this later definition is
    the one in effect when the evaluation loop runs. Consider keeping a single definition.

    Returns True if the prediction counts as correct, else False.
    """
    expected_missing = pd.isna(expected_value)
    predicted_missing = pd.isna(predicted_value)
    if expected_missing or predicted_missing:
        return bool(expected_missing and predicted_missing)

    upper_cased_expected = str(expected_value).upper()
    upper_cased_predicted = str(predicted_value).upper()

    if key == "Address (post town)":
        # both prefixes are 8 characters long
        if upper_cased_expected.startswith(("CITY_OF_", "TOWN_OF_")):
            upper_cased_expected = upper_cased_expected[8:]
        if upper_cased_predicted.startswith(("CITY_OF_", "TOWN_OF_")):
            upper_cased_predicted = upper_cased_predicted[8:]
        return distance(upper_cased_expected, upper_cased_predicted, weights=(1, 1, 2)) <= 1

    elif key == "Address (street)":
        upper_cased_expected = re.sub(r"_-_", "-", upper_cased_expected)
        upper_cased_predicted = re.sub(r"_-_", "-", upper_cased_predicted)
        return upper_cased_expected == upper_cased_predicted

    elif key == "Charity Name":
        # the dot is escaped so that only a literal "_LTD." is stripped (not "_LTD" + any character)
        upper_cased_expected = re.sub(r"(_LTD|_LTD\.|_LIMITED)$", "", upper_cased_expected)
        upper_cased_predicted = re.sub(r"(_LTD|_LTD\.|_LIMITED)$", "", upper_cased_predicted)

        # the strings are upper-cased at this point, so "&" has to become "AND" to equal a spelled-out "and"
        upper_cased_expected = upper_cased_expected.replace("&", "AND")
        upper_cased_predicted = upper_cased_predicted.replace("&", "AND")

        upper_cased_expected = re.sub(r"_-_", "-", upper_cased_expected)
        upper_cased_predicted = re.sub(r"_-_", "-", upper_cased_predicted)
        return distance(upper_cased_expected, upper_cased_predicted, weights=(1, 1, 2)) <= 1

    else:
        return upper_cased_expected == upper_cased_predicted

In [134]:
# One result frame per prediction run (sized from prediction_runs_dfs instead of a hard-coded 1).
# official / own: 1 = counted as correct, 0 = counted as wrong, NaN = not evaluated.
# null_evaluations: confusion counts per key, where "positive" means "value is null".
official_evaluations = [pd.DataFrame(np.nan, index=expected.index, columns=expected.columns) for _ in prediction_runs_dfs]
own_evaluations = [pd.DataFrame(np.nan, index=expected.index, columns=expected.columns) for _ in prediction_runs_dfs]
null_evaluations = [pd.DataFrame(np.zeros((4, len(expected.columns))), index=["TP", "FP", "FN", "TN"], columns=expected.columns) for _ in prediction_runs_dfs]

for i, prediction_run_df in enumerate(prediction_runs_dfs):
    # rows are paired by index label, so every expected row needs a prediction row
    rows_without_prediction = expected.index.difference(prediction_run_df.index)
    assert rows_without_prediction.empty, (
        f"Prediction run {i} has no row for expected index labels {list(rows_without_prediction[:5])}"
    )

    for index, row in expected.iterrows():
        for column in expected.columns:
            expected_value = row[column]
            predicted_value = prediction_run_df.loc[index, column]
            # during parsing, a key that is missing from the prediction line ends up as NaN
            predicted_is_null = pd.isnull(predicted_value)

            if pd.notnull(expected_value):
                # FP: we predicted null and it was not null
                # TN: we predicted not null (i.e. we predicted something) and it was not null
                null_evaluations[i].loc["FP" if predicted_is_null else "TN", column] += 1

                own_evaluations[i].loc[index, column] = 1 if is_correct(column, expected_value, predicted_value) else 0

                # official evaluation: case-insensitive exact match of the string forms
                values_equal = str(expected_value).upper() == str(predicted_value).upper()
                official_evaluations[i].loc[index, column] = 1 if values_equal else 0

            else: # we don't care about the prediction in our own evaluation if the expected value is null
                # TP: we predicted null and it was null
                # FN: we predicted not null and it was null
                null_evaluations[i].loc["TP" if predicted_is_null else "FN", column] += 1
                official_evaluations[i].loc[index, column] = 1 if predicted_is_null else 0
                    

In [135]:
def _summarise_evaluations(evaluations, label):
    """
    Print the macro and micro summary of a list of per-run evaluation frames.

    evaluations: one 0/1/NaN frame per run (rows = documents, columns = keys).
    label: text put in parentheses into every printed line, e.g. "own" or "official".

    Returns (by_key, micro):
    - by_key: per key the mean / min / max / range (max - min, column "<lambda>") over the
      runs of the per-run accuracy of that key (NaN cells are skipped).
    - micro: per run the share of correct cells among all evaluated (non-NaN) cells.
    """
    # the range stays a lambda so that the printed column keeps its name "<lambda>"
    by_key = pd.concat(
        [evaluation.mean(axis=0, skipna=True) for evaluation in evaluations], axis=1
    ).agg(["mean", "min", "max", lambda x: x.max() - x.min()], axis=1)
    print(f"(macro)[over runs] ({label}) evaluation by key:\n{by_key}")
    print(f"(macro)[over runs and keys] ({label}) average of correctly predicted values: {round(by_key['mean'].agg('mean'), 3)}")
    print(f"(macro)[over runs and keys] ({label}) range of correctly predicted values: {round(by_key['<lambda>'].agg('mean'), 3)}")

    micro = [evaluation.sum().sum() / evaluation.count().sum() for evaluation in evaluations]
    print(f"(micro)[over all key-value pairs] ({label}) average of correctly predicted values: {round(np.mean(micro), 3)}")
    # np.std uses ddof=0 by default, i.e. this is the population (not the sample) standard deviation
    print(f"(micro)[over all key-value pairs] ({label}) population standard deviation of correctly predicted values: {round(np.std(micro), 3)}")
    return by_key, micro


# own evaluation: only looks at the keys that are actually present in the document

# we combine the runs into one by taking the mean (together with the range around the mean (e.g. if we have [1.0, 0.3, 1.7] we get 1.0 as the mean and the range is from 0.3 to 1.7)) of the own evaluations by key of each run
# (with a single run, mean == min == max and both the range and the standard deviation are 0)
avg_own_evaluation_by_key, micro_averaged_accuracy = _summarise_evaluations(own_evaluations, "own")

# official evaluation (same as above but with the official evaluation)
avg_official_evaluation_by_key, micro_averaged_accuracy = _summarise_evaluations(official_evaluations, "official")

(macro)[over runs] (own) evaluation by key:
                         mean       min       max  <lambda>
Address (post town)  0.000000  0.000000  0.000000       0.0
Address (post code)  0.000000  0.000000  0.000000       0.0
Address (street)     0.000000  0.000000  0.000000       0.0
Charity Name         0.036364  0.036364  0.036364       0.0
Charity Number       0.562929  0.562929  0.562929       0.0
Annual Income        0.000000  0.000000  0.000000       0.0
Period End Date      0.002273  0.002273  0.002273       0.0
Annual Spending      0.000000  0.000000  0.000000       0.0
(macro)[over runs and keys] (own) average of correctly predicted values: 0.075
(macro)[over runs and keys] (own) range of correctly predicted values: 0.0
(micro)[over all key-value pairs] (own) average of correctly predicted values: 0.077
(micro)[over all key-value pairs] (own) sample standard deviation of correctly predicted values: 0.0
(macro)[over runs] (official) evaluation by key:
                         me