# Evaluation

In [125]:
from rouge_score import rouge_scorer
import glob
import os
from collections import defaultdict
import json
import numpy as np

In [62]:
# Root directories of the reference corpora; passed as `dataset_dir` to the
# reference-minute loaders below.
ELITR_DATASET_DIR = "../../datasets/ELITR Minuting Corpus/ELITR-minuting-corpus/elitr-minuting-corpus-en"
EUROPARL_DATASET_DIR = "../../datasets/europarlmin"

# Sub-directories (splits) of each corpus that are evaluated.
ELITR_TESTING_SUBDIRS = ["dev", "test", "test2"]
EUROPARL_TESTING_SUBDIRS = ["dev"]

# Generated minutes are read from MINUTES_DIR/<model>/<ELITR_DIR or EUROPARL_DIR>/<subdir>/...
MINUTES_DIR = "minutes"
ELITR_DIR = "elitr/en"
EUROPARL_DIR = "europarl"

# Directory the JSON score files are written to.
EVALUATION_DIR = "evaluation"

## Load reference minutes

In [28]:
def load_europarl_ref_minutes(dataset_dir, testing_subdirs):
    """Load the Europarl reference minutes for the given dataset splits.

    Files are discovered with the glob pattern
    ``<dataset_dir>/<testing_subdir>/*/min*.txt``. The minute id is the file
    name with a leading ``"min-"`` and a trailing ``".txt"`` removed (each only
    if actually present).

    Args:
        dataset_dir: Root directory of the Europarl minutes dataset.
        testing_subdirs: Iterable of split names (sub-directories of
            ``dataset_dir``) to load.

    Returns:
        ``defaultdict(dict)`` mapping ``testing_subdir -> {minute_id: text}``.
        A split with no matching files gets no entry.
    """
    prefix, suffix = "min-", ".txt"
    ref_minutes = defaultdict(dict)

    for testing_subdir in testing_subdirs:
        pattern = os.path.join(dataset_dir, testing_subdir, "*", "min*.txt")

        for minute_path in sorted(glob.glob(pattern)):
            minute_id = os.path.basename(minute_path)

            # str.lstrip/rstrip treat their argument as a *set of characters*,
            # so they would also eat leading m/i/n/- and trailing ./t/x that
            # belong to the id itself. Remove the exact affixes instead.
            if minute_id.startswith(prefix):
                minute_id = minute_id[len(prefix):]
            if minute_id.endswith(suffix):
                minute_id = minute_id[:-len(suffix)]

            with open(minute_path, "r") as f:
                ref_minutes[testing_subdir][minute_id] = f.read()

    return ref_minutes

In [29]:
# Load the Europarl reference minutes for the configured testing subdirs.
europarl_ref = load_europarl_ref_minutes(EUROPARL_DATASET_DIR, EUROPARL_TESTING_SUBDIRS)
# Bare expression: shows the loaded mapping (subdir -> minute id -> text) as the cell output.
europarl_ref

defaultdict(dict,
            {'dev': {'2006-10-11-ch002-00': "2. Tribute\n\nOn behalf of Parliament, the President paid tribute to the memory of Antoni\nGutirrez Daz - a Member of the European Parliament between 1987 and 1999\nand Vice-President between 1992 and 1994 - who had died on 6 October 2006.\n\nParliament observed a minute's silence.\n\n\n",
              '2006-10-11-ch003-00': "3. Statement by the President\n\nThe President made a statement on the murder of the Russian journalist Anna\nPolitkovskaya in Moscow on 7 October 2006. He pointed out that he had\ncalled on the Russian authorities to do everything within their power to\nfind out who was responsible for the murder. He added that he had passed on\nParliament's condolences to Mrs Politkovskaya's family and friends.\nFinally, the President pointed out that 63 journalists had been killed\nacross the world since the beginning of the year and that 120 journalists\nwere in prison.\n\nParliament observed a minute's silence.\n

In [80]:
def load_elitr_ref_minutes(dataset_dir, testing_subdirs):
    """Collect the ELITR reference minutes, grouped by split and meeting.

    For every split, all files matching
    ``<dataset_dir>/<split>/*/minutes_GENER*.txt`` are read in sorted path
    order. The meeting id is the name of the directory that contains the file,
    and a meeting may contribute several reference minutes.

    Args:
        dataset_dir: Root directory of the ELITR corpus.
        testing_subdirs: Iterable of split names to load.

    Returns:
        Nested ``defaultdict`` mapping
        ``split -> meeting_id -> [minute_text, ...]``.
    """
    collected = defaultdict(lambda: defaultdict(list))

    for split in testing_subdirs:
        pattern = os.path.join(dataset_dir, split, "*", "minutes_GENER*.txt")
        matches = glob.glob(pattern)
        matches.sort()

        for path in matches:
            # The meeting id is the last component of the file's parent directory.
            parent_dir, _ = os.path.split(path)
            _, meeting_id = os.path.split(parent_dir)

            with open(path, "r") as handle:
                text = handle.read()

            collected[split][meeting_id].append(text)

    return collected

In [81]:
# Load the ELITR reference minutes for the configured testing subdirs.
elitr_ref = load_elitr_ref_minutes(ELITR_DATASET_DIR, ELITR_TESTING_SUBDIRS)
# Bare expression: shows the loaded mapping (subdir -> meeting id -> list of texts) as the cell output.
elitr_ref

defaultdict(<function __main__.load_elitr_ref_minutes.<locals>.<lambda>()>,
            {'dev': defaultdict(list,
                         {'meeting_en_dev_001': ['\nDate: 2020/10/13\nAttendees: [PERSON10], [PERSON3], [PERSON6], [PERSON5], [PERSON4]\nPurpose of meeting: Including [ORGANIZATION1] transcripts and minutes.\n\nSummary of meeting:\n\n[PERSON3], [PERSON10], [PERSON6]\n- Discuss including of [ORGANIZATION1] transcripts and minutes.\n- Draws attention to differences from actual data and possibility to have this as a contrast of task.\n- Decision to include one separate track if [ORGANIZATION1] comes for free and if it is only matter of downloading some of the data and polishing it a little, no manual processing of that.\n- There should be done [ORGANIZATION1] minutes corpus preparation, the pairs of transcripts and minutes.\n- Agreed on one shared task assignment with 3 underlying data sets, where the first test set would be own corpus, second test set would be [ORGANIZATION1]

## Compute ROUGE scores

In [105]:
def __get_scores(testing_subdirs, dataset_dir, score_func, ref):
    """Score every generated minute against its reference(s).

    Generated minutes are read from
    ``MINUTES_DIR/<model>/<dataset_dir>/<testing_subdir>/<meeting_id>/<file>``.
    Each file is scored with ``score_func(reference, generated_text)`` where
    ``reference`` is ``ref[testing_subdir][meeting_id]``.

    Args:
        testing_subdirs: Iterable of split names to evaluate.
        dataset_dir: Dataset sub-path below each model directory.
        score_func: Callable taking ``(reference, generated_text)`` and
            returning the score object to store.
        ref: Nested mapping ``testing_subdir -> meeting_id -> reference``.

    Returns:
        Nested ``defaultdict`` mapping
        ``"<model>_<file name without .txt>" -> testing_subdir -> meeting_id -> score``.
    """
    suffix = ".txt"
    scores = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))

    for model in sorted(os.listdir(MINUTES_DIR)):
        for testing_subdir in testing_subdirs:
            subdir_path = os.path.join(MINUTES_DIR, model, dataset_dir, testing_subdir)

            for meeting_id in sorted(os.listdir(subdir_path)):
                meeting_id_path = os.path.join(subdir_path, meeting_id)

                ref_minutes = ref[testing_subdir][meeting_id]

                for minute_file in sorted(os.listdir(meeting_id_path)):
                    # str.rstrip(".txt") strips any trailing '.', 't' or 'x'
                    # characters, which would mangle names ending in those
                    # letters. Remove the exact ".txt" suffix instead.
                    if minute_file.endswith(suffix):
                        length = minute_file[:-len(suffix)]
                    else:
                        length = minute_file

                    with open(os.path.join(meeting_id_path, minute_file), "r") as f:
                        minute = f.read()

                    score = score_func(ref_minutes, minute)
                    scores[f"{model}_{length}"][testing_subdir][meeting_id] = score

    return scores

def get_scores(dataset):
    """Compute ROUGE-1/2/L scores for all generated minutes of one dataset.

    Args:
        dataset: ``"elitr"`` selects the ELITR configuration (scored with
            ``score_multi`` against ``elitr_ref``). Any other value selects
            the Europarl configuration (scored with ``score`` against
            ``europarl_ref``).

    Returns:
        The nested score mapping produced by ``__get_scores``.
    """
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])

    # Everything that is not explicitly "elitr" is treated as Europarl.
    if dataset != "elitr":
        return __get_scores(EUROPARL_TESTING_SUBDIRS, EUROPARL_DIR, rouge.score, europarl_ref)

    return __get_scores(ELITR_TESTING_SUBDIRS, ELITR_DIR, rouge.score_multi, elitr_ref)

In [107]:
# Score every generated ELITR minute found under MINUTES_DIR against elitr_ref.
elitr_scores = get_scores("elitr")
# Bare expression: shows the nested score mapping as the cell output.
elitr_scores

defaultdict(<function __main__.__get_scores.<locals>.<lambda>()>,
            {'MEETING_SUMMARY_length_1024': defaultdict(<function __main__.__get_scores.<locals>.<lambda>.<locals>.<lambda>()>,
                         {'dev': defaultdict(dict,
                                      {'meeting_en_dev_001': {'rouge1': Score(precision=0.2722007722007722, recall=0.6878048780487804, fmeasure=0.39004149377593356),
                                        'rouge2': Score(precision=0.06963249516441006, recall=0.17647058823529413, fmeasure=0.0998613037447989),
                                        'rougeL': Score(precision=0.13127413127413126, recall=0.33170731707317075, fmeasure=0.18810511756569848)},
                                       'meeting_en_dev_002': {'rouge1': Score(precision=0.56, recall=0.4117647058823529, fmeasure=0.4745762711864407),
                                        'rouge2': Score(precision=0.1489971346704871, recall=0.10947368421052632, fmeasure=0.1262135922330097),
  

In [111]:
# Persist the per-meeting ELITR scores. Create the output directory first so
# the cell also works on a fresh checkout where it does not exist yet.
# NOTE(review): Score objects look like namedtuples, so json presumably writes
# them as [precision, recall, fmeasure] arrays - confirm before parsing the file.
os.makedirs(EVALUATION_DIR, exist_ok=True)
with open(os.path.join(EVALUATION_DIR, "elitr.json"), "w") as f:
    json.dump(elitr_scores, f, indent=4)

In [112]:
# Score every generated Europarl minute found under MINUTES_DIR against europarl_ref.
europarl_scores = get_scores("europarl")
# Bare expression: shows the nested score mapping as the cell output.
europarl_scores

defaultdict(<function __main__.__get_scores.<locals>.<lambda>()>,
            {'MEETING_SUMMARY_length_1024': defaultdict(<function __main__.__get_scores.<locals>.<lambda>.<locals>.<lambda>()>,
                         {'dev': defaultdict(dict,
                                      {'2006-10-11-ch002-00': {'rouge1': Score(precision=0.4, recall=0.2553191489361702, fmeasure=0.3116883116883116),
                                        'rouge2': Score(precision=0.2413793103448276, recall=0.15217391304347827, fmeasure=0.18666666666666665),
                                        'rougeL': Score(precision=0.23333333333333334, recall=0.14893617021276595, fmeasure=0.18181818181818182)},
                                       '2006-10-11-ch003-00': {'rouge1': Score(precision=0.5, recall=0.15841584158415842, fmeasure=0.24060150375939848),
                                        'rouge2': Score(precision=0.1935483870967742, recall=0.06, fmeasure=0.0916030534351145),
                              

In [113]:
# Persist the per-meeting Europarl scores. Create the output directory first so
# the cell also works on a fresh checkout where it does not exist yet.
# NOTE(review): Score objects look like namedtuples, so json presumably writes
# them as [precision, recall, fmeasure] arrays - confirm before parsing the file.
os.makedirs(EVALUATION_DIR, exist_ok=True)
with open(os.path.join(EVALUATION_DIR, "europarl.json"), "w") as f:
    json.dump(europarl_scores, f, indent=4)

## Compute average ROUGE scores

In [162]:
def __avg_scores(scores):
    """Average each metric's values and add their overall mean.

    Args:
        scores: Mapping ``metric name -> list of values``.

    Returns:
        Dict with one mean per metric plus ``"avg_rouge"``, the mean of those
        per-metric means.
    """
    averaged = {}
    for metric, values in scores.items():
        averaged[metric] = np.mean(values)

    # Evaluated before the key is inserted, so "avg_rouge" only averages the
    # per-metric means and never itself.
    averaged["avg_rouge"] = np.mean(list(averaged.values()))

    return averaged

def compute_avg_scores(dataset_scores):
    """Average the F-measures per model, per split and over all splits.

    Args:
        dataset_scores: Nested mapping
            ``model -> subdir -> meeting -> metric name -> score``, where each
            score exposes an ``fmeasure`` attribute.

    Returns:
        Nested ``defaultdict`` mapping ``model -> subdir -> averages`` with an
        extra ``"all"`` entry per model that pools the meetings of every
        subdir. Each averages dict has the shape returned by ``__avg_scores``.
    """
    result = defaultdict(lambda: defaultdict(dict))

    for model in dataset_scores:
        pooled = defaultdict(list)

        for subdir, per_meeting in dataset_scores[model].items():
            per_subdir = defaultdict(list)

            for meeting_metrics in per_meeting.values():
                for metric, score in meeting_metrics.items():
                    f_measure = score.fmeasure
                    per_subdir[metric].append(f_measure)
                    pooled[metric].append(f_measure)

            result[model][subdir] = __avg_scores(per_subdir)

        result[model]["all"] = __avg_scores(pooled)

    return result

In [163]:
# Average the ELITR F-measures per model, per subdir and over all subdirs.
avg_elitr_scores = compute_avg_scores(elitr_scores)
# Bare expression: shows the averaged scores as the cell output.
avg_elitr_scores

defaultdict(<function __main__.compute_avg_scores.<locals>.<lambda>()>,
            {'MEETING_SUMMARY_length_1024': defaultdict(dict,
                         {'dev': {'rouge1': 0.3586676195635613,
                           'rouge2': 0.10181949160763178,
                           'rougeL': 0.1880787810842802,
                           'avg_rouge': 0.2161886307518244},
                          'test': {'rouge1': 0.37311435836403345,
                           'rouge2': 0.09721771070416493,
                           'rougeL': 0.18900499015289426,
                           'avg_rouge': 0.2197790197403642},
                          'test2': {'rouge1': 0.41838795180922805,
                           'rouge2': 0.11108731127318655,
                           'rougeL': 0.1945227389116087,
                           'avg_rouge': 0.2413326673313411},
                          'all': {'rouge1': 0.37916217390727885,
                           'rouge2': 0.10157811663713275,
                 

In [164]:
# Persist the averaged ELITR scores. Create the output directory first so the
# cell also works on a fresh checkout where it does not exist yet.
os.makedirs(EVALUATION_DIR, exist_ok=True)
with open(os.path.join(EVALUATION_DIR, "avg_elitr.json"), "w") as f:
    json.dump(avg_elitr_scores, f, indent=4)

In [165]:
# Average the Europarl F-measures per model, per subdir and over all subdirs.
avg_europarl_scores = compute_avg_scores(europarl_scores)
# Bare expression: shows the averaged scores as the cell output.
avg_europarl_scores

defaultdict(<function __main__.compute_avg_scores.<locals>.<lambda>()>,
            {'MEETING_SUMMARY_length_1024': defaultdict(dict,
                         {'dev': {'rouge1': 0.19760616162189448,
                           'rouge2': 0.06646917889294131,
                           'rougeL': 0.13342034639493375,
                           'avg_rouge': 0.13249856230325652},
                          'all': {'rouge1': 0.19760616162189448,
                           'rouge2': 0.06646917889294131,
                           'rougeL': 0.13342034639493375,
                           'avg_rouge': 0.13249856230325652}}),
             'MEETING_SUMMARY_length_512': defaultdict(dict,
                         {'dev': {'rouge1': 0.22545964455542258,
                           'rouge2': 0.07235594732940467,
                           'rougeL': 0.144698509276687,
                           'avg_rouge': 0.14750470038717142},
                          'all': {'rouge1': 0.22545964455542258,
           

In [166]:
# Persist the averaged Europarl scores. Create the output directory first so
# the cell also works on a fresh checkout where it does not exist yet.
os.makedirs(EVALUATION_DIR, exist_ok=True)
with open(os.path.join(EVALUATION_DIR, "avg_europarl.json"), "w") as f:
    json.dump(avg_europarl_scores, f, indent=4)