In [1]:
import sys, pathlib

# Add the open_asr_leaderboard project root to sys.path so that project
# packages can be imported from this notebook.
# NOTE: ROOT is derived from the current working directory, so the kernel must
# be started from the notebook's own folder for this to point at the project root.
ROOT = pathlib.Path().resolve().parent          # one level up from transformers
# Guard the append so re-running this cell does not pile up duplicate entries.
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

In [2]:
import json
import os

# Project-local import (presumably resolved via the project root appended to
# sys.path in the first cell — confirm when moving this notebook).
from normalizer.data_utils import normalizer as data_utils_normalizer
# Expose the imported callable under the short name `normalizer`; this is the
# name create_filtered_file calls on each special word and each record's text.
normalizer = data_utils_normalizer


def create_filtered_file(special_words_name, input_file_name):
    """
    Filter a JSONL results file down to the records that mention a special word.

    One term per line is read from ``{special_words_name}.csv`` (surrounding
    whitespace and trailing commas are dropped) and each term is normalized
    with the module-level ``normalizer``. Every record of
    ``results/{input_file_name}.jsonl`` whose normalized ``text`` shares at
    least one whitespace-separated token with the normalized terms is written
    to the output file, with an added ``matched_special_words`` key holding the
    sorted list of matching tokens.

    The output path is ``results/{input_file_name}-{special_words_name}.jsonl``
    where every occurrence of the dataset tag (``jmci-aispeak-v1`` if the input
    name contains it, otherwise ``aquavoice-cleaned``) is replaced by
    ``special_words_name`` with underscores turned into hyphens.

    Note:
        Matching is done per token, so a term whose normalized form contains
        whitespace can never match any record.

    Args:
        special_words_name: Base name (no extension) of the special-words CSV,
            looked up relative to the current working directory.
        input_file_name: Base name (no extension) of the JSONL file inside the
            ``results`` directory.

    Returns:
        None. Progress is reported with ``print``. If either the special-words
        file or the input file is missing, an error is printed and the function
        returns early.
    """
    INPUT_FILE_PATH = f"results/{input_file_name}.jsonl"
    base = os.path.splitext(INPUT_FILE_PATH)[0]
    # Only the dataset tag to substitute differs between the two naming schemes.
    dataset_tag = "jmci-aispeak-v1" if "jmci-aispeak-v1" in input_file_name else "aquavoice-cleaned"
    output_file_path = f"{base}-{special_words_name}.jsonl".replace(
        dataset_tag, special_words_name.replace("_", "-")
    )

    # 1. Load and normalize special words
    SPECIAL_WORDS_PATH = f"{special_words_name}.csv"
    print(f"Loading special words from {SPECIAL_WORDS_PATH}...")
    try:
        with open(SPECIAL_WORDS_PATH, 'r', encoding='utf-8') as f:
            special_words = [line.strip().rstrip(',') for line in f if line.strip()]
    except FileNotFoundError:
        print(f"Error: Special words file not found at '{SPECIAL_WORDS_PATH}'")
        return

    # Normalize outside the try so an error raised by the normalizer is never
    # misreported as a missing file. Entries that are empty before or after
    # normalization can never match a token, so they are dropped to keep the
    # reported count honest.
    normalized_special_words = {normalizer(word) for word in special_words if word}
    normalized_special_words.discard('')
    print(f"Loaded and normalized {len(normalized_special_words)} special words.")

    # 2. Filter the input file and add matched words
    print(f"Filtering {INPUT_FILE_PATH}...")
    lines_written = 0
    try:
        # The input is opened first, so a missing input never creates an
        # empty output file.
        with open(INPUT_FILE_PATH, 'r', encoding='utf-8') as infile, \
             open(output_file_path, 'w', encoding='utf-8') as outfile:

            for line in infile:
                stripped = line.strip()
                if not stripped:
                    # Blank lines are not records; skip them without a warning.
                    continue

                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    print(f"Warning: Skipping malformed JSON line: {stripped}")
                    continue

                if not isinstance(data, dict):
                    # Valid JSON but not an object (e.g. a list or number):
                    # there is no 'text' field to inspect.
                    print(f"Warning: Skipping non-object JSON line: {stripped}")
                    continue

                text = data.get('text', '')
                if not isinstance(text, str) or not text:
                    continue

                # Find which special words are in the text (token-level match).
                matched_words = normalized_special_words.intersection(normalizer(text).split())

                if matched_words:
                    # Add the matched words to the JSON object and write it out.
                    data['matched_special_words'] = sorted(matched_words)
                    outfile.write(json.dumps(data) + '\n')
                    lines_written += 1
    except FileNotFoundError:
        print(f"Error: Input file not found at '{INPUT_FILE_PATH}'")
        return

    print("\nFiltering complete.")
    print(f" - Total lines written: {lines_written}")
    print(f" - Filtered file saved to: {os.path.abspath(output_file_path)}")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Produce one filtered subset of the same results file per special-word dictionary.
results_name = "MODEL_openai-gpt-4o-transcribe_DATASET_jmci-aispeak-v1_default_test"
for dict_name in ("global_dict_10", "global_dict_50", "global_dict_500"):
    create_filtered_file(dict_name, results_name)


Loading special words from global_dict_10.csv...
Loaded and normalized 10 special words.
Filtering results/MODEL_openai-whisper-large_DATASET_jmci-aispeak-v1_default_test.jsonl...

Filtering complete.
 - Total lines written: 1278
 - Filtered file saved to: /lambda/nfs/jtm/open_asr_leaderboard/transformers/results/MODEL_openai-whisper-large_DATASET_global-dict-10_default_test-global_dict_10.jsonl
Loading special words from global_dict_50.csv...
Loaded and normalized 48 special words.
Filtering results/MODEL_openai-whisper-large_DATASET_jmci-aispeak-v1_default_test.jsonl...

Filtering complete.
 - Total lines written: 3652
 - Filtered file saved to: /lambda/nfs/jtm/open_asr_leaderboard/transformers/results/MODEL_openai-whisper-large_DATASET_global-dict-50_default_test-global_dict_50.jsonl
Loading special words from global_dict_500.csv...
Loaded and normalized 569 special words.
Filtering results/MODEL_openai-whisper-large_DATASET_jmci-aispeak-v1_default_test.jsonl...

Filtering complete.