Necessary (external) libraries:
- jiwer
- python-docx


In [1]:
import jiwer
#import batchalign
import os
import pathlib
from docx import Document
import re
import pandas as pd

#pd.set_option("display.max_columns", 50) 
#pd.set_option("display.max_rows", 50)


# Data Preprocessing

## Setting up the Paths

In [2]:
#setting up the paths

path = pathlib.Path().resolve()

#path to the directory with the Batchalign transcriptions
hypothesis_path = path / "transkrypty_czyszczonych_nagrań"

#path to the directory with the human transcriptions
reference_path = path / "czyszczone_transkrypcje"

#path to the directory where the copy of the human transcriptions is saved
reference_path_copy = path / "czyszczone_transkrypcje_copy"

#path to the directory where the copy of the Batchalign transcriptions is saved
hypothesis_path_copy = path / "transkrypty_czyszczonych_nagrań_copy"


#### Script for reseting the files inside the reference_path

In [None]:
# #ONLY RUN THIS SCRIPT IF YOU WANT TO RESET THE REFERENCE PATH

# # to unncomment the code, select it and press Ctrl + / (or Cmd + / on Mac) 
# # This script will delete all files in the reference_path and copy files from reference_path_copy to reference_path

# import shutil

# # 1. Delete all files from reference_path
# for file in reference_path.iterdir():
#     if file.is_file():
#         file.unlink()

# # 2. Copy all files from reference_path_copy to reference_path
# for file in reference_path_copy.iterdir():
#     if file.is_file():
#         target = reference_path / file.name
#         shutil.copy2(file, target)

#### Script for reseting the files inside the hypothesis_path

In [None]:
# #ONLY RUN THIS SCRIPT IF YOU WANT TO 'RESET' THE HYPOTHESIS PATH

# # to unncomment the code, select it and press Ctrl + / (or Cmd + / on Mac) 
# # This script will delete all files in the hypothesis_path and copy files from hypothesis_path_copy to hypothesis_path

# import shutil

# # 1. Delete all files from hypothesis_path
# for file in hypothesis_path.iterdir():
#     if file.is_file():
#         file.unlink()

# # 2. Copy all files from hypothesis_path_copy to hypothesis_path
# for file in hypothesis_path_copy.iterdir():
#     if file.is_file():
#         target = hypothesis_path / file.name
#         shutil.copy2(file, target)

## Preprocessing the reference_path

### Converting the reference files to .txt

In [3]:
#converting all files in the reference_path to a .txt format

def convert_docx_or_cha_to_txt(file_path):
    """
    
    !!!!!
    Remember to have the backup of the original files before running this function, as it will delete the original files after conversion
    !!!!! 

    Converts .docx or .cha files to .txt format and deletes the original file.

    """
    if file_path.suffix.lower() == '.docx':
        new_file_path = file_path.with_suffix('.txt')
        try:
            doc = Document(file_path)
            content = "\n".join([para.text for para in doc.paragraphs])
            with open(new_file_path, 'w', encoding='utf-8') as f:
                f.write(content)
            os.remove(file_path)
            print(f"Converted and removed: {file_path}")
            return new_file_path
        except Exception as e:
            print(f"Error converting {file_path}: {e}")
            return file_path
        
    elif file_path.suffix.lower() == '.cha':
        new_file_path = file_path.with_suffix('.txt')
        try:
            with open(file_path, 'r', encoding='utf-8') as f_in:
                content = f_in.read()
            with open(new_file_path, 'w', encoding='utf-8') as f_out:
                f_out.write(content)
            os.remove(file_path)
            print(f"Converted and removed: {file_path}")
            return new_file_path
        except Exception as e:
            print(f"Error converting {file_path}: {e}")
            return file_path
    return file_path

In [4]:
# Run the function for all .docx and .cha files in the reference_path
for file in os.listdir(reference_path):
    file_path = reference_path / file
    if file_path.is_file() and file_path.suffix.lower() in ['.docx', '.cha']:
        convert_docx_or_cha_to_txt(file_path)

Converted and removed: D:\GitHub\Repos\Narracje-ASR\czyszczone_transkrypcje\Bf001_NAp_transkrypcja_clean.docx
Converted and removed: D:\GitHub\Repos\Narracje-ASR\czyszczone_transkrypcje\Bf002_NAp_transkrypcja_clean.docx
Converted and removed: D:\GitHub\Repos\Narracje-ASR\czyszczone_transkrypcje\Bf003_NAp_transkrypcja_clean.docx
Converted and removed: D:\GitHub\Repos\Narracje-ASR\czyszczone_transkrypcje\Bf006_NAp_transkrypcja_clean.docx
Converted and removed: D:\GitHub\Repos\Narracje-ASR\czyszczone_transkrypcje\bf008_NAp_transkrypcja_clean.docx
Converted and removed: D:\GitHub\Repos\Narracje-ASR\czyszczone_transkrypcje\bf009_NAp_transkrypcja_clean.docx
Converted and removed: D:\GitHub\Repos\Narracje-ASR\czyszczone_transkrypcje\Bf010_NAp_transkrypcja_clean.docx
Converted and removed: D:\GitHub\Repos\Narracje-ASR\czyszczone_transkrypcje\Bf012_NAp_transkrypcja_clean.docx
Converted and removed: D:\GitHub\Repos\Narracje-ASR\czyszczone_transkrypcje\Bf014_NAp_transkrypcja_clean.docx
Converted 

### Cleaning the reference files

In [5]:
# Cleaning the files in the reference_path

def clean_transcript_file(file_path):
    """
    Cleans transcript files by:
    - Removing all lines before the first *CHI: or *EXP:
    - Removing all lines starting with %com: anywhere in the file
    - Removing trailing empty lines and lines like '@End' or similar markers at the end
    Overwrites the file in place.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Find the index of the first line starting with *CHI: or *EXP:
    start_idx = next(
        (i for i, line in enumerate(lines) if line.lstrip().startswith("*CHI:") or line.lstrip().startswith("*EXP:")),
        None
    )

    if start_idx is not None:
        # Remove %com: lines after trimming the start
        cleaned_lines = [
            line for line in lines[start_idx:]
            if not line.lstrip().startswith("%com:")
        ]
        # Remove trailing empty lines and lines like '@End'
        while cleaned_lines and (cleaned_lines[-1].strip() == "" or cleaned_lines[-1].strip().lower() == "@end"):
            cleaned_lines.pop()
        with open(file_path, 'w', encoding='utf-8') as f:
            f.writelines(cleaned_lines)
        print(f"Cleaned: {file_path}")
    else:
        print(f"No *CHI: or *EXP: found in {file_path}")

In [6]:
# Running the cleaning function for all .txt files in the reference_path
reference_path = pathlib.Path().resolve() / "czyszczone_transkrypcje"
for file in os.listdir(reference_path):
    file_path = reference_path / file
    if file_path.is_file() and file_path.suffix.lower() == '.txt':
        clean_transcript_file(file_path)

Cleaned: D:\GitHub\Repos\Narracje-ASR\czyszczone_transkrypcje\Bf001_NAp_transkrypcja_clean.txt
Cleaned: D:\GitHub\Repos\Narracje-ASR\czyszczone_transkrypcje\Bf002_NAp_transkrypcja_clean.txt
Cleaned: D:\GitHub\Repos\Narracje-ASR\czyszczone_transkrypcje\Bf003_NAp_transkrypcja_clean.txt
Cleaned: D:\GitHub\Repos\Narracje-ASR\czyszczone_transkrypcje\Bf006_NAp_transkrypcja_clean.txt
Cleaned: D:\GitHub\Repos\Narracje-ASR\czyszczone_transkrypcje\bf008_NAp_transkrypcja_clean.txt
Cleaned: D:\GitHub\Repos\Narracje-ASR\czyszczone_transkrypcje\bf009_NAp_transkrypcja_clean.txt
Cleaned: D:\GitHub\Repos\Narracje-ASR\czyszczone_transkrypcje\Bf010_NAp_transkrypcja_clean.txt
Cleaned: D:\GitHub\Repos\Narracje-ASR\czyszczone_transkrypcje\Bf012_NAp_transkrypcja_clean.txt
Cleaned: D:\GitHub\Repos\Narracje-ASR\czyszczone_transkrypcje\Bf014_NAp_transkrypcja_clean.txt
Cleaned: D:\GitHub\Repos\Narracje-ASR\czyszczone_transkrypcje\Bf015_NAp_transkrypcja_clean.txt
Cleaned: D:\GitHub\Repos\Narracje-ASR\czyszczone_t

## Preprocessing the hypothesis_path

### Converting the hypothesis files to .txt

In [7]:
# converting all files in the hypothesis_path to a .txt format

def convert_cha_to_txt(file_path):
    """
    
    !!!!!
    Remember to have the backup of the original files before running this function, as it will delete the original files after conversion
    !!!!!

    Converts .cha files to .txt format and deletes the original file.


    """
    if file_path.suffix.lower() == '.cha':
        new_file_path = file_path.with_suffix('.txt')
        try:
            with open(file_path, 'r', encoding='utf-8') as f_in:
                content = f_in.read()
            with open(new_file_path, 'w', encoding='utf-8') as f_out:
                f_out.write(content)
            os.remove(file_path)
            print(f"Converted and removed: {file_path}")
            return new_file_path
        except Exception as e:
            print(f"Error converting {file_path}: {e}")
            return file_path
    return file_path

In [8]:
# Run the function for all .cha files in the hypothesis_path
for file in os.listdir(hypothesis_path):
    file_path = hypothesis_path / file
    if file_path.is_file() and file_path.suffix.lower() == '.cha':
        convert_cha_to_txt(file_path)

Converted and removed: D:\GitHub\Repos\Narracje-ASR\transkrypty_czyszczonych_nagrań\Bf012_NAp_сlean.cha
Converted and removed: D:\GitHub\Repos\Narracje-ASR\transkrypty_czyszczonych_nagrań\Bf0134_NAp_clean.cha
Converted and removed: D:\GitHub\Repos\Narracje-ASR\transkrypty_czyszczonych_nagrań\Bf0137_NAp_clean.cha
Converted and removed: D:\GitHub\Repos\Narracje-ASR\transkrypty_czyszczonych_nagrań\Bf0148_NAp_clean.cha
Converted and removed: D:\GitHub\Repos\Narracje-ASR\transkrypty_czyszczonych_nagrań\Bf062_NAp_clean.cha
Converted and removed: D:\GitHub\Repos\Narracje-ASR\transkrypty_czyszczonych_nagrań\Bf063_NAp_clean.cha
Converted and removed: D:\GitHub\Repos\Narracje-ASR\transkrypty_czyszczonych_nagrań\Bf068_NAp_clean.cha
Converted and removed: D:\GitHub\Repos\Narracje-ASR\transkrypty_czyszczonych_nagrań\Bf069_NAp_clean.cha
Converted and removed: D:\GitHub\Repos\Narracje-ASR\transkrypty_czyszczonych_nagrań\Bf074_NAp_clean.cha
Converted and removed: D:\GitHub\Repos\Narracje-ASR\transkryp

### Cleaning the hypothesis files

In [9]:
def clean_hypothesis_file(file_path):
    """
    Cleans hypothesis files by:
    - Removing all lines before the first line starting with *PAR (e.g., *PAR0:, *PAR1:, etc.)
    - Removing all timestamp patterns ->  XXXX_XXXX
    - Removing the last line if it is '@End' (case-insensitive) and any trailing empty lines
    Overwrites the file in place.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Find the index of the first line starting with *PAR
    start_idx = next(
        (i for i, line in enumerate(lines) if line.lstrip().startswith("*PAR")),
        None
    )

    if start_idx is not None:
        cleaned_lines = lines[start_idx:]
        # Remove timestamp patterns
        cleaned_lines = [re.sub(r"\x15\d+_\d+\x15", "", line) for line in cleaned_lines]
        
        
        # Remove trailing lines that are empty or '@End'
        while cleaned_lines and (cleaned_lines[-1].strip() == "" or cleaned_lines[-1].strip().lower() == "@end"):
            cleaned_lines.pop()
        with open(file_path, 'w', encoding='utf-8') as f:
            f.writelines(cleaned_lines)
        print(f"Cleaned: {file_path}")
    else:
        print(f"No *PAR found in {file_path}")

In [10]:
# Running the cleaning function for all .txt files in the hypothesis_path
hypothesis_path = pathlib.Path().resolve() / "transkrypty_czyszczonych_nagrań"
for file in os.listdir(hypothesis_path):
    file_path = hypothesis_path / file
    if file_path.is_file() and file_path.suffix.lower() == '.txt':
        clean_hypothesis_file(file_path)

Cleaned: D:\GitHub\Repos\Narracje-ASR\transkrypty_czyszczonych_nagrań\Bf012_NAp_сlean.txt
Cleaned: D:\GitHub\Repos\Narracje-ASR\transkrypty_czyszczonych_nagrań\Bf0134_NAp_clean.txt
Cleaned: D:\GitHub\Repos\Narracje-ASR\transkrypty_czyszczonych_nagrań\Bf0137_NAp_clean.txt
Cleaned: D:\GitHub\Repos\Narracje-ASR\transkrypty_czyszczonych_nagrań\Bf0148_NAp_clean.txt
Cleaned: D:\GitHub\Repos\Narracje-ASR\transkrypty_czyszczonych_nagrań\Bf062_NAp_clean.txt
Cleaned: D:\GitHub\Repos\Narracje-ASR\transkrypty_czyszczonych_nagrań\Bf063_NAp_clean.txt
Cleaned: D:\GitHub\Repos\Narracje-ASR\transkrypty_czyszczonych_nagrań\Bf068_NAp_clean.txt
Cleaned: D:\GitHub\Repos\Narracje-ASR\transkrypty_czyszczonych_nagrań\Bf069_NAp_clean.txt
Cleaned: D:\GitHub\Repos\Narracje-ASR\transkrypty_czyszczonych_nagrań\Bf074_NAp_clean.txt
Cleaned: D:\GitHub\Repos\Narracje-ASR\transkrypty_czyszczonych_nagrań\Bf075_NAp_clean.txt
Cleaned: D:\GitHub\Repos\Narracje-ASR\transkrypty_czyszczonych_nagrań\Bf076_NAp_clean.txt
Cleaned

# Data analysis

## Data transformation

In [11]:
#transformation pipeline

transforms = jiwer.Compose(
    [
        jiwer.ExpandCommonEnglishContractions(), #expands common English contractions like "don't" to "do not"
        jiwer.RemoveEmptyStrings(), #removes empty strings
        jiwer.ToLowerCase(), #converts all characters to lowercase
        jiwer.RemoveMultipleSpaces(), #removes multiple spaces between words
        jiwer.Strip(), #removes leading and trailing spaces
        jiwer.RemovePunctuation(), #removes punctuation marks (e.g., ., !, ?, etc.)
        #jiwer.RemoveKaldiNonWords(), #removes Kaldi non-words (e.g., <unk>, <s>, </s>, etc.)
        jiwer.ReduceToListOfListOfWords(), #reduces the transcription to a list of lists of words
    ]
)

In [12]:
# List all .txt files in both folders
ref_files = [f for f in os.listdir(reference_path) if f.endswith('.txt')]
hyp_files = [f for f in os.listdir(hypothesis_path) if f.endswith('.txt')]

def extract_key(filename):
    # Match 2 letters, 3 or 4 digits, underscore, NAp (e.g., Ab123_NAp or Ab1234_NAp)
    match = re.match(r"^[A-Za-z]{2}\d{3,4}_NAp", filename)
    if match:
        return match.group(0)
    return filename  # fallback: whole filename if not matched

df_ref = pd.DataFrame({
    'key': [extract_key(f) for f in ref_files],
    'ref_file': ref_files
})
df_hyp = pd.DataFrame({
    'key': [extract_key(f) for f in hyp_files],
    'hyp_file': hyp_files
})

# Merge to get only matching pairs
df = pd.merge(df_ref, df_hyp, on='key')

display(df)

Unnamed: 0,key,ref_file,hyp_file
0,Bf012_NAp,Bf012_NAp_transkrypcja_clean.txt,Bf012_NAp_сlean.txt
1,Bf062_NAp,Bf062_NAp_transkrypcja_clean.txt,Bf062_NAp_clean.txt
2,Bf063_NAp,Bf063_NAp_transkrypcja_clean.txt,Bf063_NAp_clean.txt
3,Bf068_NAp,Bf068_NAp_transkrypcja_clean.txt,Bf068_NAp_clean.txt
4,Bf069_NAp,Bf069_NAp_transkrypcja_clean.txt,Bf069_NAp_clean.txt
...,...,...,...
88,Mm800_NAp,Mm800_NAp_transkrypcja_clean.txt,Mm800_NAp_clean.txt
89,Mm801_NAp,Mm801_NAp_transkrypcja_clean.txt,Mm801_NAp_clean.txt
90,Mm802_NAp,Mm802_NAp_transkrypcja_clean.txt,Mm802_NAp_clean.txt
91,Mm803_NAp,Mm803_NAp_transkrypcja_clean.txt,Mm803_NAp_clean.txt


In [13]:
#adding string columns to the DataFrame
df['ref_string'] = df['ref_file'].apply(lambda x: (reference_path / x).read_text(encoding='utf-8'))
df['hyp_string'] = df['hyp_file'].apply(lambda x: (hypothesis_path / x).read_text(encoding='utf-8'))

In [14]:
df

Unnamed: 0,key,ref_file,hyp_file,ref_string,hyp_string
0,Bf012_NAp,Bf012_NAp_transkrypcja_clean.txt,Bf012_NAp_сlean.txt,*EXP:\tdobra Bf012.\n*EXP:\tjuż gotowa?\n*EXP:...,*PAR1:\tDobrze . \n*PAR0:\tJuż . \n*PAR1:\tGot...
1,Bf062_NAp,Bf062_NAp_transkrypcja_clean.txt,Bf062_NAp_clean.txt,"*EXP:\tdobrze.\n*EXP:\tja nazywam się Lucyna, ...",*PAR1:\tDobrze ja nazywam się Lucyna i ty . \n...
2,Bf063_NAp,Bf063_NAp_transkrypcja_clean.txt,Bf063_NAp_clean.txt,*EXP:\tja nazywam się Lucyna.\n*EXP:\tty nazyw...,*PAR1:\tJa nazywam się Lucyna i Ty nazywasz si...
3,Bf068_NAp,Bf068_NAp_transkrypcja_clean.txt,Bf068_NAp_clean.txt,*EXP:\tLiliana.\n*EXP:\ttwój numer to be ef ze...,"*PAR1:\tJa nazywam się Lucyna Jachacy , a Ty n..."
4,Bf069_NAp,Bf069_NAp_transkrypcja_clean.txt,Bf069_NAp_clean.txt,"*EXP:\tpopatrz, mam tutaj trzy koperty.\n*EXP:...","*PAR1:\tJa nazywam się Lucena Jahacy , a ty ? ..."
...,...,...,...,...,...
88,Mm800_NAp,Mm800_NAp_transkrypcja_clean.txt,Mm800_NAp_clean.txt,"*CHI:\tTen…\n*CHI:\tYy, w układanki?\n*EXP:\tY...",*PAR0:\tJak tam powie +... \n*PAR0:\tPoczekaj ...
89,Mm801_NAp,Mm801_NAp_transkrypcja_clean.txt,Mm801_NAp_clean.txt,*EXP:\tDobra.\n*EXP:\tDobra.\n*EXP:\tTy jesteś...,"*PAR0:\tDobra , ty jesteś Edward , MM801 . \n*..."
90,Mm802_NAp,Mm802_NAp_transkrypcja_clean.txt,Mm802_NAp_clean.txt,"*EXP:\tNie, to będzie tylko mikrofon, o.\n*EXP...","*PAR0:\tTak , i ty jesteś Mateusz , MM802 . \n..."
91,Mm803_NAp,Mm803_NAp_transkrypcja_clean.txt,Mm803_NAp_clean.txt,*CHI:\tDługopis?\n*EXP:\tTak.\n*EXP:\tTy jeste...,*PAR0:\tKamera i mikrofon . \n*PAR0:\tMam indy...


In [15]:
#applying the transformation pipeline to the strings in the DataFrame
df['ref_transformed'] = df['ref_string'].apply(lambda x: " ".join(sum(transforms(x), [])))
df['hyp_transformed'] = df['hyp_string'].apply(lambda x: " ".join(sum(transforms(x), [])))


In [16]:
df

Unnamed: 0,key,ref_file,hyp_file,ref_string,hyp_string,ref_transformed,hyp_transformed
0,Bf012_NAp,Bf012_NAp_transkrypcja_clean.txt,Bf012_NAp_сlean.txt,*EXP:\tdobra Bf012.\n*EXP:\tjuż gotowa?\n*EXP:...,*PAR1:\tDobrze . \n*PAR0:\tJuż . \n*PAR1:\tGot...,exp\tdobra bf012\nexp\tjuż gotowa\nexp\tto jes...,par1\tdobrze par0\tjuż par1\tgotowa par1\tdobr...
1,Bf062_NAp,Bf062_NAp_transkrypcja_clean.txt,Bf062_NAp_clean.txt,"*EXP:\tdobrze.\n*EXP:\tja nazywam się Lucyna, ...",*PAR1:\tDobrze ja nazywam się Lucyna i ty . \n...,exp\tdobrze\nexp\tja nazywam się lucyna ty naz...,par1\tdobrze ja nazywam się lucyna i ty par0\t...
2,Bf063_NAp,Bf063_NAp_transkrypcja_clean.txt,Bf063_NAp_clean.txt,*EXP:\tja nazywam się Lucyna.\n*EXP:\tty nazyw...,*PAR1:\tJa nazywam się Lucyna i Ty nazywasz si...,exp\tja nazywam się lucyna\nexp\tty nazywasz s...,par1\tja nazywam się lucyna i ty nazywasz się ...
3,Bf068_NAp,Bf068_NAp_transkrypcja_clean.txt,Bf068_NAp_clean.txt,*EXP:\tLiliana.\n*EXP:\ttwój numer to be ef ze...,"*PAR1:\tJa nazywam się Lucyna Jachacy , a Ty n...",exp\tliliana\nexp\ttwój numer to be ef zero sz...,par1\tja nazywam się lucyna jachacy a ty nazyw...
4,Bf069_NAp,Bf069_NAp_transkrypcja_clean.txt,Bf069_NAp_clean.txt,"*EXP:\tpopatrz, mam tutaj trzy koperty.\n*EXP:...","*PAR1:\tJa nazywam się Lucena Jahacy , a ty ? ...",exp\tpopatrz mam tutaj trzy koperty\nexp\tsą w...,par1\tja nazywam się lucena jahacy a ty par1\t...
...,...,...,...,...,...,...,...
88,Mm800_NAp,Mm800_NAp_transkrypcja_clean.txt,Mm800_NAp_clean.txt,"*CHI:\tTen…\n*CHI:\tYy, w układanki?\n*EXP:\tY...",*PAR0:\tJak tam powie +... \n*PAR0:\tPoczekaj ...,chi\tten\nchi\tyy w układanki\nexp\tyy\nchi\tc...,par0\tjak tam powie + par0\tpoczekaj par1\tbol...
89,Mm801_NAp,Mm801_NAp_transkrypcja_clean.txt,Mm801_NAp_clean.txt,*EXP:\tDobra.\n*EXP:\tDobra.\n*EXP:\tTy jesteś...,"*PAR0:\tDobra , ty jesteś Edward , MM801 . \n*...",exp\tdobra\nexp\tdobra\nexp\tty jesteś edward ...,par0\tdobra ty jesteś edward mm801 par0\tja pa...
90,Mm802_NAp,Mm802_NAp_transkrypcja_clean.txt,Mm802_NAp_clean.txt,"*EXP:\tNie, to będzie tylko mikrofon, o.\n*EXP...","*PAR0:\tTak , i ty jesteś Mateusz , MM802 . \n...",exp\tnie to będzie tylko mikrofon o\nexp\ttak\...,par0\ttak i ty jesteś mateusz mm802 par0\tdobr...
91,Mm803_NAp,Mm803_NAp_transkrypcja_clean.txt,Mm803_NAp_clean.txt,*CHI:\tDługopis?\n*EXP:\tTak.\n*EXP:\tTy jeste...,*PAR0:\tKamera i mikrofon . \n*PAR0:\tMam indy...,chi\tdługopis\nexp\ttak\nexp\tty jesteś paweł ...,par0\tkamera i mikrofon par0\tmam indyko pis p...


In [17]:
# Calculate WER and CER for each pair after transformation
results = []
for _, row in df.iterrows():
    with open(reference_path / row['ref_file'], encoding='utf-8') as f:
        ref_text = f.read()
    with open(hypothesis_path / row['hyp_file'], encoding='utf-8') as f:
        hyp_text = f.read()
    # Apply the transformation pipeline
    ref_transformed = " ".join(sum(transforms(ref_text), []))
    hyp_transformed = " ".join(sum(transforms(hyp_text), []))
    results.append({
        'key': row['key'],
        'ref_file': row['ref_file'],
        'hyp_file': row['hyp_file'],
        'WER': jiwer.wer(ref_transformed, hyp_transformed),
        'CER': jiwer.cer(ref_transformed, hyp_transformed),
        'WIL': jiwer.wil(ref_transformed, hyp_transformed),
        'MER': jiwer.mer(ref_transformed, hyp_transformed),
        'WIP': jiwer.wip(ref_transformed, hyp_transformed)
    })

results_df = pd.DataFrame(results)
print(results_df)

          key                          ref_file             hyp_file  \
0   Bf012_NAp  Bf012_NAp_transkrypcja_clean.txt  Bf012_NAp_сlean.txt   
1   Bf062_NAp  Bf062_NAp_transkrypcja_clean.txt  Bf062_NAp_clean.txt   
2   Bf063_NAp  Bf063_NAp_transkrypcja_clean.txt  Bf063_NAp_clean.txt   
3   Bf068_NAp  Bf068_NAp_transkrypcja_clean.txt  Bf068_NAp_clean.txt   
4   Bf069_NAp  Bf069_NAp_transkrypcja_clean.txt  Bf069_NAp_clean.txt   
..        ...                               ...                  ...   
88  Mm800_NAp  Mm800_NAp_transkrypcja_clean.txt  Mm800_NAp_clean.txt   
89  Mm801_NAp  Mm801_NAp_transkrypcja_clean.txt  Mm801_NAp_clean.txt   
90  Mm802_NAp  Mm802_NAp_transkrypcja_clean.txt  Mm802_NAp_clean.txt   
91  Mm803_NAp  Mm803_NAp_transkrypcja_clean.txt  Mm803_NAp_clean.txt   
92  Mm804_NAp  Mm804_NAp_transkrypcja_clean.txt  Mm804_NAp_clean.txt   

         WER       CER       WIL       MER       WIP  
0   0.558712  0.330821  0.628787  0.462745  0.371213  
1   0.646137  0.411369  0

In [18]:
# calculating mean metrics

mean_metrics = results_df[['WER', 'CER', 'WIL', 'MER', 'WIP']].mean().to_frame(name='Mean').reset_index()
mean_metrics.columns = ['Metric', 'Mean']
print(mean_metrics)
    

  Metric      Mean
0    WER  0.712054
1    CER  0.467787
2    WIL  0.696625
3    MER  0.527852
4    WIP  0.303375
