# **Pre-processing Multi-turn Dialogues from MultiWOZ2.2**

In this section, we preprocess the MultiWOZ 2.2 dataset for End-of-Utterance (EOU) detection. The steps are as follows:

1. Extracting Context: We extract the first 3 turns of each dialogue (user, assistant, user) as the context.
2. Formatting Data: The dialogues are restructured into a JSON format containing context (dialogue turns) and a label (1 for EOU, 0 for non-EOU).
3. Dataset Splitting: The dialogues in the existing `train` and `test` folders are processed separately, producing a training set (dataset_train.json) and a test set (dataset_test.json).


In [13]:
import os
# Show every entry found directly under /content
print(os.listdir(path="/content"))

['.config', 'README.md', '.ipynb_checkpoints', 'dialog_acts.json', 'dataset_test.json', 'train', 'multiwoz_output.json', 'dataset_train.json', 'test', 'schema.json', 'requirements.txt', 'dev', 'sample_data']


In [9]:
import json
import os

def _turn_signals_eou(turn):
    """Return True if any frame of *turn* lists a "general-bye" action."""
    # `or []` tolerates a missing key as well as an explicit null value.
    for frame in turn.get("frames") or []:
        actions = frame.get("actions") or []
        if any("general-bye" in action for action in actions):
            return True
    return False


# Handle extracting relevant dialogue context from the data
def extract_dialogue_context(dialogue_data, max_turns=3):
    """Build context/label examples from a list of dialogue dicts.

    For each dialogue, the first ``max_turns`` turns are converted into
    ``{"role": ..., "content": ...}`` entries ("USER" -> "user",
    "SYSTEM" -> "assistant"; turns with any other speaker are left out of
    the context). The label is 1 when any frame of any inspected turn lists
    a "general-bye" action, otherwise 0.

    Args:
        dialogue_data: Iterable of dialogue dicts, each optionally holding a
            "turns" list and a "dialogue_id".
        max_turns: Number of leading turns to keep per dialogue (default 3).

    Returns:
        A list of ``{"context": [...], "label": 0 | 1}`` dicts. Dialogues
        that yield no context entries are omitted.
    """
    role_by_speaker = {"USER": "user", "SYSTEM": "assistant"}
    formatted_data = []

    for dialogue_info in dialogue_data:
        dialogue_turns = dialogue_info.get("turns") or []
        if not dialogue_turns:
            print(f"Warning: 'turns' field is missing or empty in dialogue: {dialogue_info.get('dialogue_id', 'Unknown')}")
            continue  # nothing to extract, so no example is emitted

        dialogue_context = []
        label = 0  # stays 0 unless an EOU signal is found below

        for turn in dialogue_turns[:max_turns]:
            role = role_by_speaker.get(turn.get("speaker", ""))
            if role is not None:
                dialogue_context.append({"role": role, "content": turn.get("utterance", "")})

            # Inspect every frame, not just the first one, so that an EOU
            # action attached to a later frame is not missed.
            if _turn_signals_eou(turn):
                label = 1

        if dialogue_context:
            formatted_data.append({
                "context": dialogue_context,
                "label": label,
            })

    return formatted_data

def _load_dialogue_file(dialogue_file):
    """Return the examples extracted from one dialogue JSON file, or [] if it is missing."""
    if not os.path.exists(dialogue_file):
        print(f"File {dialogue_file} not found.")
        return []

    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(dialogue_file, "r", encoding="utf-8") as f:
        dialogue_data = json.load(f)

    return extract_dialogue_context(dialogue_data)


def main():
    """Build dataset_train.json and dataset_test.json from the dialogue files.

    Training examples come from dialogues_001.json .. dialogues_017.json in
    /content/train. Test examples come from every dialogues_*.json file in
    /content/test, read in sorted filename order. Both lists are written to
    /content as indented JSON.
    """
    # Directories containing the dialogue JSON files for training and testing
    train_folder = '/content/train'
    test_folder = '/content/test'

    # Process training data (dialogues_001.json to dialogues_017.json)
    train_data = []
    for i in range(1, 18):  # From 001 to 017
        dialogue_file = os.path.join(train_folder, f'dialogues_{i:03d}.json')  # Format with leading zeros
        train_data.extend(_load_dialogue_file(dialogue_file))

    # Process testing data (dialogues_xxx.json in /content/test folder).
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # resulting dataset identical from run to run.
    test_data = []
    for filename in sorted(os.listdir(test_folder)):
        if filename.startswith("dialogues_") and filename.endswith(".json"):
            test_data.extend(_load_dialogue_file(os.path.join(test_folder, filename)))

    # Save the results to dataset_train.json
    train_dataset_path = '/content/dataset_train.json'
    with open(train_dataset_path, 'w', encoding="utf-8") as f:
        json.dump(train_data, f, indent=2)

    # Save the results to dataset_test.json
    test_dataset_path = '/content/dataset_test.json'
    with open(test_dataset_path, 'w', encoding="utf-8") as f:
        json.dump(test_data, f, indent=2)

# Run the preprocessing only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

# **Printing Our Dataset**

In [12]:
import json

# Training split: read the saved examples back and report how many there are
dataset_path = '/content/dataset_train.json'
with open(dataset_path, 'r') as file:
    dataset = json.loads(file.read())
print(f"Total entries in the training dataset: {len(dataset)}")

# Test split: same check (dataset_path and dataset are rebound to this split)
dataset_path = '/content/dataset_test.json'
with open(dataset_path, 'r') as file:
    dataset = json.loads(file.read())
print(f"Total entries in the test dataset: {len(dataset)}")

Total entries in the training dataset: 8437
Total entries in the test dataset: 1000
