In [564]:
import os
import re
from lxml import etree
import json

# Directory that receives the preprocessed JSON files written by this notebook.
OUTPUT_DIR = "preprocessed_data/europarl"

# Input dataset roots (relative paths, resolved against the current working
# directory). Each root is expected to contain sub-directories with the
# transcript files.
TRAIN_DATA_PATH = '../../datasets/europarlmin/train/'
DEV_DATA_PATH = '../../datasets/europarlmin/dev/'
TEST_DATA_PATH = '../../datasets/automin-2023-data/Europarlmin/test1/'

In [565]:
def read_data_from_files(data_path):
    """Load transcript files from the immediate sub-directories of ``data_path``.

    Each sub-directory is scanned in sorted order for regular files whose name
    contains ``ep-<date>.txt``. Other entries (differently named files, nested
    directories) are skipped without being opened.

    Args:
        data_path: Root directory of the dataset. A trailing path separator
            is optional.

    Returns:
        dict mapping the ``<date>`` part of the file name to the list of lines
        of that file (line terminators removed). When two files share the
        same date, the one read later replaces the earlier one.

    Raises:
        OSError: If ``data_path`` cannot be listed or a transcript file cannot
            be opened.
        UnicodeDecodeError: If a transcript file is not valid UTF-8.
    """
    transcripts = {}
    # Escaped dot: only a literal ".txt" is accepted after the date.
    transcript_name = re.compile(r'ep-(.+?)\.txt')

    # Visit immediate sub-directories only. Deriving folder names from
    # os.walk() + basename() yields paths that do not exist for nested
    # folders, or for a root that is passed without a trailing slash.
    directories = sorted(
        entry for entry in os.listdir(data_path)
        if os.path.isdir(os.path.join(data_path, entry))
    )

    for directory in directories:
        directory_path = os.path.join(data_path, directory)
        for file_name in sorted(os.listdir(directory_path)):
            match = transcript_name.search(file_name)
            if match is None:
                # Not a transcript file; do not open it.
                continue
            file_path = os.path.join(directory_path, file_name)
            if not os.path.isfile(file_path):
                continue
            # Explicit encoding so the result does not depend on the
            # platform's locale default.
            with open(file_path, "r", encoding="utf-8") as f:
                transcripts[match[1]] = f.read().splitlines()
    return transcripts

In [581]:
def remove_tags(text):
    """Blank out a transcript line that carries no spoken content.

    The line is replaced by an empty string when it is a lone ``<P>`` tag,
    when it consists solely of a parenthesised note, or when it contains
    "Before the ... vote". Any other line is returned unchanged.

    Args:
        text: A single transcript line.

    Returns:
        ``""`` for the line types listed above, otherwise ``text``.
    """
    whole_line_patterns = (
        r'^\s*<P>\s*$',       # paragraph tag
        r'^\s*\(.+\)\s*$',    # notes in parentheses
    )
    for pattern in whole_line_patterns:
        text = re.sub(pattern, '', text)

    # "Before the vote" announcement lines
    if re.match(r"^.*Before the.*vote.*$", text):
        return ""
    return text

def normalize_utterance(utterance):
    """Strip leading metadata from an utterance and collapse its whitespace.

    Three prefixes are removed from the start of the text, in this order:
    everything up to the first ``. -`` / ``. –``, a two-letter upper-case
    language code in parentheses, and any remaining leading whitespace or
    non-word characters. Runs of whitespace are then reduced to single spaces.

    Args:
        utterance: Raw utterance text.

    Returns:
        The cleaned utterance; may be an empty string.
    """
    leading_patterns = (
        # Info before the first occurrence of '. -' or '. –'
        r"^.*?\.\s?(-|–)",
        # Language code, e.g. "(DE)", with surrounding punctuation
        r"^\s*\W*\s*\([A-Z][A-Z]\)\s*\W*\s*",
        # Remaining leading whitespace / punctuation
        r"^\s*\W*\s*",
    )
    for pattern in leading_patterns:
        utterance = re.sub(pattern, "", utterance)

    # split() without arguments drops leading/trailing whitespace as well
    return " ".join(utterance.split())

def preprocess_transcripts(transcripts):
    """Turn raw transcript lines into parallel lists of roles and utterances.

    For every transcript, each line is first passed through ``remove_tags``.
    Lines containing ``SPEAKER`` are treated as speaker tags; the lines between
    one tag and the next (or the end of the transcript) are joined with spaces
    and cleaned with ``normalize_utterance``. Lines before the first speaker
    tag are not used. Segments that end up empty are dropped.

    Args:
        transcripts: dict mapping a transcript key to its list of lines.

    Returns:
        dict mapping each key to ``{"roles": [...], "utterances": [...]}``,
        where a role is ``"PERSON"`` followed by the tag's ``ID`` attribute.
    """
    preprocessed_transcripts = {}

    for key, raw_lines in transcripts.items():
        # Blank out tag/note lines but keep positions intact
        lines = list(map(remove_tags, raw_lines))

        tag_positions = [pos for pos, line in enumerate(lines) if 'SPEAKER' in line]
        # A segment ends where the next tag starts; the last one runs to the end
        segment_ends = tag_positions[1:] + [None]

        roles, utterances = [], []
        for start, end in zip(tag_positions, segment_ends):
            tag = lines[start]
            # Close the element so it parses, unless it is already self-closing
            if '/>' in tag:
                element_source = tag + ''
            else:
                element_source = tag + '</SPEAKER>'
            speaker_id = etree.fromstring(element_source).attrib['ID']

            role = 'PERSON' + speaker_id
            utterance = normalize_utterance(' '.join(lines[start + 1:end]))

            if utterance:
                roles.append(role)
                utterances.append(utterance)

        assert len(utterances) == len(roles)
        preprocessed_transcripts[key] = {"roles": roles, "utterances": utterances}

    return preprocessed_transcripts

In [582]:
def save_preprocessed_data(file_path, preprocessed_transcripts):
    """Write the preprocessed transcripts to ``file_path`` as UTF-8 JSON.

    The parent directory of ``file_path`` is created when it does not exist.
    Non-ASCII characters are written as-is (``ensure_ascii=False``), so the
    file is opened with an explicit UTF-8 encoding rather than the platform's
    locale default.

    Args:
        file_path: Destination path of the JSON file.
        preprocessed_transcripts: JSON-serialisable object to store.

    Raises:
        OSError: If the directory or file cannot be created.
        TypeError: If the data is not JSON-serialisable.
    """
    output_dir = os.path.dirname(file_path)
    # dirname is '' for a bare file name; makedirs('') would raise
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(preprocessed_transcripts, f, indent=4, ensure_ascii=False)

def preprocess_dataset(dataset_path, preprocessed_file_path):
    """Read, preprocess and store one dataset split.

    Args:
        dataset_path: Directory passed to ``read_data_from_files``.
        preprocessed_file_path: Path of the JSON file written by
            ``save_preprocessed_data``.
    """
    save_preprocessed_data(
        preprocessed_file_path,
        preprocess_transcripts(read_data_from_files(dataset_path)),
    )

In [583]:
# Preprocess every split and write it to OUTPUT_DIR under its own file name.
for dataset_path, output_name in (
    (TRAIN_DATA_PATH, "train.json"),
    (DEV_DATA_PATH, "dev.json"),
    (TEST_DATA_PATH, "test1.json"),
):
    preprocess_dataset(dataset_path, os.path.join(OUTPUT_DIR, output_name))