# d/Deaf & Hard of Hearing ASR study - Groundtruth Collection & Cleaning
### Anna Choi, sc2359@cornell.edu 

In [1]:
import pandas as pd
import os
import re
import json
import glob
from jiwer import wer
from whisper_normalizer.english import EnglishTextNormalizer

## 0. Convert audio files
Some of the normal-hearing recordings have extensions other than `.wav` (e.g. `.WAV`, `.MP3`), so they are converted to `.wav` first.

linux command

`for i in *.WAV *.MP3; do ffmpeg -i "$i" "${i%.*}.wav"; done`

## 1. Create `file_list.csv` and `groundtruth.csv`.

The `deaf_groundtruth.csv` and `hearing_groundtruth.csv` files contain the columns `subject_id`, `filename`, `passage_id`, `size`, `groundtruth`.

Note: `S12NWCDR2.wav` has a wrong filename; its `passage_id` is changed to `NWS`. `S7IPEDR2.wav` is removed because it does not contain the right reading passage. `NH4C16DR2.wav` and `NH4C17DR2.wav` contain each other's reading passage, so their `passage_id`s are swapped. `NH5C4DR2.wav` is a corrupted file, and `NH2CGCDR1.wav` and `NH3DC3DR2.wav` are duplicate files; all three are removed. All files from subject `S11` are excluded as well.

In [2]:
def convert_bytes(num):
    """Scale a byte count to the largest unit that keeps the value below 1024.

    Args:
        num: Size in bytes.

    Returns:
        A ``(value, unit)`` tuple where ``unit`` is one of ``'bytes'``,
        ``'KB'``, ``'MB'``, ``'GB'`` or ``'TB'``. Sizes of 1024 TB or more
        are reported in ``'TB'`` rather than running off the end of the unit
        list and returning ``None``.
    """
    units = ['bytes', 'KB', 'MB', 'GB', 'TB']
    for unit in units[:-1]:
        if num < 1024.0:
            return num, unit
        num /= 1024.0
    # Largest supported unit: report whatever is left, expressed in TB.
    return num, units[-1]

def get_file_size(file_path):
    """Return the size of ``file_path`` in megabytes, or 0 when unavailable.

    The value is only reported when ``convert_bytes`` scales it to the
    ``'MB'`` unit, so files smaller than 1 MB, or of 1 GB and above, yield 0.
    Paths that cannot be inspected also yield 0.

    Args:
        file_path: Path of the file to measure.

    Returns:
        The size in MB (float) or 0.
    """
    try:
        file_size = os.path.getsize(file_path)
        file_size, unit = convert_bytes(file_size)
    except (OSError, TypeError, ValueError):
        # Best effort: a missing/unreadable file or an invalid path counts as
        # size 0, but interrupts and programming errors are no longer hidden.
        return 0
    return file_size if unit == 'MB' else 0

def generate_file_list(root_directory):
    """Collect one record per recognised ``.wav`` recording under a directory.

    A file is recognised when its name contains a subject id (``S<n>`` or
    ``NH<n>``), a passage id (capital letters optionally followed by digits)
    and a ``DR1``/``DR2`` marker directly before ``.wav``.

    Args:
        root_directory: Directory that is walked recursively.

    Returns:
        A list of dicts with the keys ``subject_id``, ``filename``,
        ``passage_id`` and ``size`` (as returned by ``get_file_size``).
    """
    name_pattern = re.compile(r'(S\d+|NH\d+)([A-Z]+(?:[0-9]*))(DR[12])\.wav')
    records = []
    for folder, _subfolders, names in os.walk(root_directory):
        for name in names:
            if not name.endswith(".wav"):
                continue
            parsed = name_pattern.search(name)
            if parsed is None:
                continue

            size = get_file_size(os.path.join(folder, name))
            # This one recording carries the wrong passage code in its name.
            passage = 'NWS' if name == 'S12NWCDR2.wav' else parsed.group(2)

            records.append({
                'subject_id': parsed.group(1),
                'filename': name,
                'passage_id': passage,
                'size': size
            })
    return records

deaf_directory = '../../../deaf/'
hearing_directory = '../../../hearing/'

deaf_file_list = generate_file_list(deaf_directory)
hearing_file_list = generate_file_list(hearing_directory)

deaf_file_df = pd.DataFrame(deaf_file_list)
hearing_file_df = pd.DataFrame(hearing_file_list)

# Deaf group: drop the recording with the wrong reading passage and every file
# of subject S11. `.copy()` makes the filtered frame independent, so the
# `.loc` assignment below does not raise SettingWithCopyWarning.
deaf_keep = (deaf_file_df['filename'] != 'S7IPEDR2.wav') & (deaf_file_df['subject_id'] != 'S11')
deaf_file_df = deaf_file_df[deaf_keep].copy()
deaf_file_df.loc[deaf_file_df['filename'] == 'S12NWCDR2.wav', 'passage_id'] = 'NWS'

# Hearing group: drop the corrupted file and the two duplicates, then swap the
# passage ids of the two recordings that contain each other's passage.
hearing_excluded = ['NH5C4DR2.wav', 'NH2CGCDR1.wav', 'NH3DC3DR2.wav']
hearing_file_df = hearing_file_df[~hearing_file_df['filename'].isin(hearing_excluded)].copy()
hearing_file_df.loc[hearing_file_df['filename'] == 'NH4C16DR2.wav', 'passage_id'] = 'C17'
hearing_file_df.loc[hearing_file_df['filename'] == 'NH4C17DR2.wav', 'passage_id'] = 'C16'

deaf_file_df.to_csv('../../data/deaf_file_list.csv', encoding = 'utf-8-sig', index=False)
hearing_file_df.to_csv('../../data/hearing_file_list.csv', encoding = 'utf-8-sig', index=False)

deaf_file_count = len(deaf_file_df)
hearing_file_count = len(hearing_file_df)

print(f"There is a total of {deaf_file_count} deaf participant files.")
print(f"There is a total of {hearing_file_count} normal hearing files.")

There is a total of 586 deaf participant files.
There is a total of 318 normal hearing files.


In [3]:
def merge_and_save(file_list_path, passage_path, output_path):
    """Attach the reference passage text to every listed file and save it.

    The file list is left-joined with the ``passage_id``/``groundtruth``
    columns of the passage table, so every listed file is kept even when its
    passage is unknown.

    Args:
        file_list_path: CSV with at least a ``passage_id`` column.
        passage_path: CSV with ``passage_id`` and ``groundtruth`` columns.
        output_path: Destination CSV (written as UTF-8 with BOM, no index).

    Returns:
        None. A warning is printed when some files have no matching passage;
        their ``groundtruth`` is set to the empty string, which is written as
        an empty CSV field.
    """
    file_list_df = pd.read_csv(file_list_path)
    passage_df = pd.read_csv(passage_path)
    merged_df = pd.merge(file_list_df, passage_df[['passage_id', 'groundtruth']], on='passage_id', how='left')

    if merged_df['groundtruth'].isnull().any():
        print("Warning: Null values found in the 'groundtruth' column. Handling them...")
        # Assign the result back: an in-place fillna on a selected column is
        # deprecated and has no effect under pandas Copy-on-Write.
        merged_df['groundtruth'] = merged_df['groundtruth'].fillna('')

    merged_df.to_csv(output_path, encoding = 'utf-8-sig', index=False)

# Both groups take their reference text from the same passage table.
deaf_passage_path = hearing_passage_path = '../../data/passage.csv'

deaf_file_list_path = '../../data/deaf_file_list.csv'
deaf_output_path = '../../data/deaf_groundtruth.csv'
hearing_file_list_path = '../../data/hearing_file_list.csv'
hearing_output_path = '../../data/hearing_groundtruth.csv'

merge_and_save(deaf_file_list_path, deaf_passage_path, deaf_output_path)
merge_and_save(hearing_file_list_path, hearing_passage_path, hearing_output_path)

## 2. Create `all.csv` file

`groundtruth_edited.csv` contains manual edits to the groundtruth, especially regarding speech errors or individual variations.

The `deaf_all.csv` and `hearing_all.csv` files contain the transcripts from Amazon Web Services (`AWS`), Microsoft Azure (`Azure`), OpenAI Whisper (`Whisper`), and Google Chirp (`GoogleChirp`).

In [4]:
def generate_final_df(prefix, merged_df):
    """Join the four ASR transcripts onto the groundtruth table and save it.

    For each provider the file
    ``../../data/ASR transcripts/<date>_<prefix>_<provider>_transcript.json``
    is loaded. The AWS file is expected to provide ``segment_name`` and
    ``aws_transcription``; the Azure, Whisper and GoogleChirp files are
    expected to provide ``filename`` and ``transcript``. Each transcript is
    left-joined on the file name, so rows without a transcript are kept.

    Args:
        prefix: Group name used in the file names (e.g. ``'deaf'``).
        merged_df: Groundtruth table with a ``filename`` column; not modified.

    Returns:
        None. The result is written to ``../../data/<prefix>_all.csv``.

    Raises:
        FileNotFoundError: If no transcript file exists for a provider.
    """
    transcript_dir = '../../data/ASR transcripts'

    def _load_provider(provider):
        # Load the transcript JSON of one provider as a DataFrame.
        pattern = f'{transcript_dir}/*_{prefix}_{provider}_transcript.json'
        matches = sorted(glob.glob(pattern))
        if not matches:
            raise FileNotFoundError(f"No transcript file matches {pattern!r}")
        # glob returns matches in arbitrary order; take the lexicographically
        # last one so the choice is deterministic. This is the newest file if
        # the date prefix sorts chronologically (e.g. YYYYMMDD) - TODO confirm.
        with open(matches[-1], 'r', encoding='utf-8') as f:
            return pd.DataFrame(json.load(f))

    # Locate and load every provider first so a missing file fails early.
    aws_df = _load_provider('AWS').rename(columns={'aws_transcription': 'AWS'})
    other_dfs = {
        provider: _load_provider(provider).rename(columns={'transcript': provider})
        for provider in ('Azure', 'Whisper', 'GoogleChirp')
    }

    final_df = merged_df.copy()
    # AWS identifies recordings by 'segment_name' instead of 'filename'.
    final_df = pd.merge(final_df, aws_df, left_on='filename', right_on='segment_name', how='left').drop('segment_name', axis=1)
    for provider_df in other_dfs.values():
        final_df = pd.merge(final_df, provider_df, on='filename', how='left')

    final_df.to_csv(f'../../data/{prefix}_all.csv', encoding = 'utf-8-sig', index=False)

# Start from the manually edited groundtruth tables, one per group, and write
# the combined "<group>_all.csv" files.
merged_df_deaf = pd.read_csv('../../data/deaf_groundtruth_edited.csv')
merged_df_hearing = pd.read_csv('../../data/hearing_groundtruth_edited.csv')

generate_final_df('deaf', merged_df_deaf)
generate_final_df('hearing', merged_df_hearing)

In [5]:
# Sanity check: how many distinct participants ended up in each dataset.
deaf_df = pd.read_csv("../../data/deaf_all.csv", encoding = 'utf-8-sig')
hearing_df = pd.read_csv("../../data/hearing_all.csv", encoding = 'utf-8-sig')

# Missing ids are ignored, as with Series.nunique().
unique_subjects_deaf = len(deaf_df['subject_id'].dropna().unique())
unique_subjects_hearing = len(hearing_df['subject_id'].dropna().unique())

print(f"Unique subjects in deaf dataset: {unique_subjects_deaf}")
print(f"Unique subjects in hearing dataset: {unique_subjects_hearing}")


Unique subjects in deaf dataset: 25
Unique subjects in hearing dataset: 9


## 3. Create `all_calc.csv` file
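The `deaf_all_calc.csv` and `hearing_all_calc.csv` files add, for each API, a `{API}_WER` column holding the word error rate (WER) of that API's transcript against the groundtruth.
The same cleaning is applied to the groundtruth and to the transcripts of all four APIs; the cleaned text is stored in the `{column}_clean` columns.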

In [6]:
# Normaliser applied to the groundtruth and to every ASR transcript.
english_normalizer = EnglishTextNormalizer()

# Hesitation sounds removed by general_clean (compared lower-cased, with
# commas stripped). The duplicated "uh-hm" entry has been removed.
filler_words = [
    'um', 'umm', 'uh', 'mhm', 'mm', 'ugh', 'uhhuh', 'mm-hmm', "uhh",
    "mmhmm", "uh-huh", "uh-hmm", "uh-hm", "hm", "hmm", "emmm",
]

def general_clean(text, fillers=None):
    """Lower-case ``text``, drop filler words and strip punctuation.

    Args:
        text: String to clean.
        fillers: Collection of lower-case filler words to drop. Defaults to
            the module-level ``filler_words``.

    Returns:
        The cleaned string. Every character that is not a word character or
        whitespace, and every underscore, becomes a space; runs of whitespace
        are then collapsed to one space (a leading or trailing space may
        remain).
    """
    if fillers is None:
        fillers = filler_words
    # Commas are ignored when matching fillers, so "um," is dropped too.
    kept_words = [
        word.lower()
        for word in text.split()
        if word.lower().replace(',', '') not in fillers
    ]
    result = ' '.join(kept_words)
    # Raw strings: '\,' and '\s' are invalid escapes in ordinary literals.
    result = re.sub(r'[^\w\s]|_', ' ', result)
    return re.sub(r'\s+', ' ', result)

def apply_contraction(text, replacements):
    """Replace whole whitespace-separated words using a lookup table.

    Each word is looked up lower-cased in ``replacements``; words without an
    entry are kept unchanged. The words are re-joined with single spaces.
    """
    expanded = [replacements.get(token.lower(), token) for token in text.split()]
    return ' '.join(expanded)

def apply_replacements(text, replacements):
    """Apply regex substitutions to ``text`` one after another.

    ``replacements`` maps a regular-expression pattern to its replacement;
    the entries are applied in the mapping's iteration order, each one on the
    output of the previous one.
    """
    result = text
    for regex, substitute in replacements.items():
        result = re.compile(regex).sub(substitute, result)
    return result

def apply_simple_replacements(text, replacements):
    """Apply literal substring replacements to ``text`` one after another.

    Matching is plain ``str.replace``: every occurrence is replaced, including
    occurrences inside longer words.
    """
    result = text
    for needle in replacements:
        result = result.replace(needle, replacements[needle])
    return result

def remove_spaces_between_numbers(text):
    """Join two groups of one or two digits that are separated by whitespace.

    Matches do not overlap, so in a longer run only the pairs found in a
    single left-to-right pass are joined (``'1 2 3'`` becomes ``'12 3'``).
    """
    digit_pair = re.compile(r'(\d{1,2})\s+(\d{1,2})')
    return digit_pair.sub(lambda found: found.group(1) + found.group(2), text)

def clean_text(text, passage_id):
    """Normalise one transcript (groundtruth or ASR output) before WER scoring.

    Null input yields ''. Otherwise the steps below run in a fixed order, each
    on the output of the previous one:

    1. apostrophes become spaces and the "one third" variants become "1/3";
    2. ``english_normalizer`` and then ``general_clean`` are applied;
    3. whole-word ``contraction_replacements``;
    4. substring ``simple_replacements``;
    5. regex ``general_replacements``, then the regexes registered for
       ``passage_id`` (if any);
    6. ``remove_spaces_between_numbers``, followed by hand-written fixes for
       specific digit sequences;
    7. runs of whitespace collapse to a single space.

    Args:
        text: Raw transcript, or a null value.
        passage_id: Passage code used to select passage-specific rules.

    Returns:
        The cleaned transcript as a string.
    """
    if pd.isnull(text):
        return ''
    
    # Whole-word replacements; apply_contraction matches them lower-cased.
    # NOTE(review): general_clean has already replaced every non-word
    # character with a space when this table is applied, so the keys that
    # contain '.' or '-' ("ph.d", "t-e-s-t") look unreachable - confirm.
    contraction_replacements = {
        "gonna":"going to",
        "wanna":"want to",
        "cuz":"cause",
        "ok":"okay",
        "asu":"a s u",
        "otpt":"o t p t",
        "tv":"t v",
        "phd":"p h d",
        "ph.d":"p h d",
        "ot":"o t",
        "pt":"p t",
        "er":"e r",
        "t-e-s-t":"t e s t",
        "function":"func tion",
        "growin":"growing",
        "ft":"feet",
        "dc":"d c",
        "unc":"u n c",
        "kilometers":"km",
        "hafta":"have to",
        "useta":"used to",
        "bout":"about",
        "windowsill":"window sill",
        "heartbeat":"heart beat"
    }

    # Regex replacements applied to every passage; all patterns are anchored
    # on word boundaries and map spelling variants to one canonical form.
    general_replacements = {
        r'\bhol\b': 'hall',
        r'\b(holis|hollice)\b': 'hollis',
        r'\banne\b': 'ann',
        r'\b(wit|witt|whitt)\b': 'whit',
        r'\bstatt\b': 'stat',
        r'\bstatts\b': 'stats',
        r'\bblares\b': 'blairs',
        r'\b(davy|davey|davie)\b': 'david',
        r'\bcrocket\b': 'crockett',
        r'\bax\b': 'axe',
        r'\b(kama|koma)\b': 'comma',
        r'\bchatachuchi\b': 'chattahoochee',
        r'\b(techumish|tekimish|tukamush|tekimish|tookamush)\b': 'tecumseh',
        r'\b(under brush|under rush|underrushed)\b': 'underbrush'
    }
    # Regex replacements that only apply to the passage named by the key.
    passage_specific_replacements = {
        'C1': {
            r'\b(sh sh sh|sh h|sh sh|sh)\b': '',
            r'\b(libs|lips)\b': 'lids',
        },
        'CGC': {r'\b(ms|miss)\b': 'missus'},
        'DC8': {r'\b(grim stools|grimstalls)\b': 'grimstills'},
        'C5': {r'\b(e r oop|ar up|oop|whoop|a whoop|a rope|air up|arup|roop|roof|hoop|erupt|erp)\b': 'oof'}
    }
    
    # Literal substring replacements (str.replace via
    # apply_simple_replacements): they also match inside longer words.
    simple_replacements = {
        ' ve ': ' have ',
        "getup":"get up",
        'shame faced': 'shamefaced',
        'plot': 'plop',
        'sun glow': 'sunglow',
        'billygoat': 'billy goat',
        'bob white': 'bobwhite',
        'hail storm': 'hailstorm',
        'sun bonnet': 'sunbonnet',
        'crisscross': 'criss cross',
        'mole hill': 'molehill',
        'wheat field': 'wheatfield',
        'kitt e e': 'kitty',
        'fella': 'fellow',
        'look out': 'lookout',
        'anymore': 'any more',
        'nightwatch': 'night watch',
        'clothesline': 'clothes line',
        'gol lee': 'golly',
        'dog fights': 'dogfights',
        'cock fights': 'cockfights',
        'any time': 'anytime',
        'grist mills': 'gristmills',
        'grist mill': 'gristmill',
        'superimposition': 'super imposition',
        'cold blooded': 'coldblooded',
    }
    
    # Pre-normalisation fixes on the raw text: apostrophes become spaces and
    # the spelled-out fraction is rewritten before the normaliser sees it.
    text = text.replace("'"," ")
    text = text.replace('one third', '1/3').replace('one-third', '1/3').replace("one. Third", "1/3")
    text = english_normalizer(text)
    text = general_clean(text)
    # Replacement tables, in this order: whole words, substrings, regexes.
    text = apply_contraction(text, contraction_replacements)
    text = apply_simple_replacements(text, simple_replacements)
    text = apply_replacements(text, general_replacements)
    if passage_id in passage_specific_replacements:
        text = apply_replacements(text, passage_specific_replacements[passage_id])
    # Join digit groups first; the literal fixes below operate on the joined
    # digit strings.
    text = remove_spaces_between_numbers(text)
    if passage_id == 'DC9':
        text = text.replace('100 and 7', '107').replace('18111812', '1811 1812')
    if passage_id == 'C14':
        text = re.sub(r'\b150\b', '15 0', text)
    # The remaining number fixes are applied to every passage.
    text = text.replace('318', '3 18')
    text = re.sub(r'\b3 18\b', '3 18th', text)
    text = text.replace('one 24', '124').replace('one 39', '139')
    text = text.replace('1232', '12 32')
    text = text.replace('124139', '124 139')
    text = text.replace('111811', '11 1811')
    text = text.replace('181812', '18 1812')
    
    # The replacements above can leave double spaces behind.
    text = re.sub(r'\s+', ' ', text)
    return text

def clean_final_df(final_df, asr_columns=['groundtruth', 'AWS', 'Azure', 'Whisper', 'GoogleChirp']):
    """Add a ``<column>_clean`` column for every column in ``asr_columns``.

    Each value is passed through ``clean_text`` together with the row's
    ``passage_id``.

    Args:
        final_df: Frame with a ``passage_id`` column and the listed columns.
            It is modified in place.
        asr_columns: Names of the text columns to clean. The default list is
            only read, never mutated.

    Returns:
        The same ``final_df`` object, with the new columns added.
    """
    passage_ids = final_df['passage_id']
    for col in asr_columns:
        clean_col_name = f"{col}_clean"
        # Pairwise iteration avoids building a Series per row as row-wise
        # `apply` does, and it also works on an empty frame, where row-wise
        # `apply` yields a frame rather than a Series.
        final_df[clean_col_name] = [
            clean_text(text, passage_id)
            for text, passage_id in zip(final_df[col], passage_ids)
        ]
    return final_df

In [7]:
def calculate_wer_jiwer(groundtruth, hypothesis):
    """Return the word error rate of ``hypothesis`` against ``groundtruth``.

    A missing or blank hypothesis scores 1.0 without calling ``wer``; every
    other pair is passed to ``wer`` unchanged.
    """
    nothing_transcribed = pd.isna(hypothesis) or hypothesis.strip() == ''
    return 1.0 if nothing_transcribed else wer(groundtruth, hypothesis)

def calculate_wer_for_df(final_df, asr_columns=['AWS_clean', 'Azure_clean', 'Whisper_clean', 'GoogleChirp_clean']):
    """Add a ``<provider>_WER`` column for every column in ``asr_columns``.

    The provider name is the part of the column name before the first
    underscore. Each value is ``calculate_wer_jiwer`` of the row's
    ``groundtruth_clean`` against that column.

    Args:
        final_df: Frame with ``groundtruth_clean`` and the listed columns.
            It is modified in place.
        asr_columns: Names of the cleaned transcript columns to score. The
            default list is only read, never mutated.

    Returns:
        The same ``final_df`` object, with the new columns added.
    """
    references = final_df['groundtruth_clean']
    for col in asr_columns:
        asr_provider = col.split('_')[0]
        wer_col_name = f"{asr_provider}_WER"
        # Pairwise iteration avoids building a Series per row as row-wise
        # `apply` does, and it also works on an empty frame, where row-wise
        # `apply` yields a frame rather than a Series.
        final_df[wer_col_name] = [
            calculate_wer_jiwer(reference, hypothesis)
            for reference, hypothesis in zip(references, final_df[col])
        ]
    return final_df

# Load the combined tables of both groups.
final_df_deaf = pd.read_csv('../../data/deaf_all.csv', encoding = 'utf-8-sig')
final_df_hearing = pd.read_csv('../../data/hearing_all.csv', encoding = 'utf-8-sig')

# clean_final_df and calculate_wer_for_df add columns in place and return the
# frame they were given, so the three names per group refer to one frame.
clean_df_deaf = clean_final_df(final_df_deaf)
clean_df_hearing = clean_final_df(final_df_hearing)

wer_df_deaf = calculate_wer_for_df(clean_df_deaf)
wer_df_hearing = calculate_wer_for_df(clean_df_hearing)

wer_df_deaf.to_csv('../../data/deaf_all_calc.csv', encoding = 'utf-8-sig', index=False)
wer_df_hearing.to_csv('../../data/hearing_all_calc.csv', encoding = 'utf-8-sig', index=False)

In [8]:
# Participant table; the code below uses its `subject_id` and `age_of_onset`
# columns (and `Group`, when present).
demographics_df = pd.read_csv('../../data/demographics.csv')

def clean_age_onset(age_of_onset):
    """Map the free-text age-of-onset labels to a number of years.

    Known labels are converted as listed below; any other value is returned
    unchanged.
    """
    label_groups = (
        (("27 month", "2years"), 2),
        (("at birth", "4 month"), 0),
        (("5 years",), 5),
        (("7 years",), 7),
    )
    for labels, years in label_groups:
        if age_of_onset in labels:
            return years
    return age_of_onset

# The final column selection needs a 'Group' column; create an empty one when
# the demographics file does not provide it.
if 'Group' not in demographics_df.columns:
    demographics_df['Group'] = None

demographics_df['age_onset_clean'] = demographics_df['age_of_onset'].apply(clean_age_onset)

# Inner joins: only subjects present in both the demographics table and the
# WER table are kept.
merged_df_deaf = demographics_df.merge(wer_df_deaf, on='subject_id', how='inner')
merged_df_hearing = demographics_df.merge(wer_df_hearing, on='subject_id', how='inner')

# Stack both groups and mark every missing value with the string 'NA'.
merged_all = pd.concat([merged_df_deaf, merged_df_hearing], ignore_index=True).fillna('NA')

# Number of whitespace-separated words in the cleaned groundtruth.
merged_all['num_words'] = [len(str(text).split()) for text in merged_all['groundtruth_clean']]

column_order = [
    'subject_id', 'filename', 'passage_id', 'size', 'groundtruth', 'AWS', 'Azure', 'Whisper', 'GoogleChirp',
    'groundtruth_clean', 'AWS_clean', 'Azure_clean', 'Whisper_clean', 'GoogleChirp_clean',
    'AWS_WER', 'Azure_WER', 'Whisper_WER', 'GoogleChirp_WER', 'Group', 'age', 'gender', 'age_of_onset',
    'right_amplification_start', 'left_amplification_start', 'right_amplification_type', 'left_amplification_type',
    'right_amplification_model', 'left_amplification_model', 'speech_intelligibility', 'onset_hearing_loss',
    'communication_mode', 'age_onset_clean', 'num_words'
]

merged_all = merged_all[column_order]

merged_all.to_csv('../../data/SPAL_data.csv', encoding = 'utf-8-sig', index=False)