In [20]:
import pandas as pd
import re
from jiwer import wer
from jiwer import compute_measures
import ast
import Levenshtein
import numpy as np
import ast

# normalization library
import unicodedata
import contractions
from num2words import num2words


## Load Results

In [21]:
phi4_whisper_result = pd.read_csv('results/whisper_phi4_asr_results_all.csv')

nvidia_parakeet_results = pd.read_csv('results/nvidia_parakeet_asr_results.csv')
# aisha_results_2 = pd.read_csv('results/us_medical_20_asr - us_medical_20_asr.csv')

# # concat the two results
# aisha_granite_parakeet = pd.concat([ashisa_results_1, aisha_results_2], ignore_index=True, sort=False)

evaluate_data = pd.merge(phi4_whisper_result, nvidia_parakeet_results[['utterance_id', 'Nvidia-Parakeet-ASR']], on='utterance_id', how='inner')
# evaluate_data = phi4_whisper_result
# evaluate_data['Phi-4-ASR'] = evaluate_data['Phi-4-ASR'].apply(lambda x: pd.NA if isinstance(x, str) and "ERROR: CUDA out of memory" in x else x)
## Rename 'transcript' to human transcript
evaluate_data = evaluate_data.rename(columns={'transcript': 'human-transcript'})

# # Merge Primock results
# primock_merged = pd.merge(primock_phi_whisper, primock_granite_parakeet[['utterance_id', 'Nvidia-Parakeet-ASR-doctor', 'Nvidia-Parakeet-ASR-patient', 'IBM-Granite-doctor', 'IBM-Granite-patient']], on='utterance_id', how='inner')

## Text Normalize

In [22]:
def remove_timestamps(text: str) -> str:
    """
    Remove timestamps matching patterns like:
      - 03:18:98
      - 00:00:001
    (pattern: 1-2 digits ':' 1-2 digits ':' 1-3 digits)

    Returns cleaned string with extra spaces collapsed.
    """
    if not isinstance(text, str):
        return text
    # remove timestamp tokens
    cleaned = re.sub(r'\b\d{1,2}:\d{1,2}:\d{1,3}\b', '', text)
    # remove new line characters (convert to spaces), then collapse multiple spaces and trim
    cleaned = cleaned.replace('\n', ' ').replace('\r', ' ')
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    
    # remove speaker tags like: [Speaker 1]:  Speaker 1:  speaker1:  D:  P:
    cleaned = re.sub(r'\[?[Ss]peaker\s*\d+\]?:', '', cleaned)
    cleaned = re.sub(r'\b[Dd]:', '', cleaned)
    cleaned = re.sub(r'\b[Pp]:', '', cleaned)
    cleaned = re.sub(r'\bDOCTOR:\s*', '', cleaned)
    cleaned = re.sub(r'\bPATIENT:\s*', '', cleaned)
    
    # remove tags such <INAUDIBLE_SPEECH/>, <UNSURE> <UNSURE/>,  <UNIN/> etc
    cleaned = re.sub(r'<[^>]+>', '', cleaned)
    
    # expand contractions
    cleaned = contractions.fix(cleaned)
    
    # convert numbers to words using num2words
    cleaned = re.sub(r'\d+', lambda x: num2words(int(x.group())), cleaned)
    
    # # remove all characters that is not A-Z, a-z, 0-9 or space
    cleaned = re.sub(r"[^a-zA-Z0-9\s]", "", cleaned)
    
    # lowercase the text
    cleaned = cleaned.lower()
    
    
    cleaned = re.sub(r'\bok\b', 'okay', cleaned)
    cleaned = re.sub(r'\bohh\b', 'oh', cleaned)
    cleaned = re.sub(r'\bdr\b', 'doctor', cleaned)
    cleaned = re.sub(r'\bpt\b', 'patient', cleaned)
    
    # cleaned = re.sub(r'\bokay\b', 'ok', cleaned)
    # cleaned = re.sub(r'\boh\b', 'ooh', cleaned)
    # cleaned = re.sub(r'\bdoctor\b', 'dr', cleaned)
    # cleaned = re.sub(r'\bpatient\b', 'pt', cleaned)
    # remove extra spaces again after replacements
    # cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    cleaned = re.sub(r'\b(um|uh|erm|uhm|mmhmm|ah|umm)\b', '', cleaned, flags=re.IGNORECASE)
    
    return cleaned
  
  
# def plain_normalize(text: str) -> str:
#     """
#     Normalize text by:
#       - Expanding contractions
#       - Converting numbers to words
#       - Remove disfluencies and timestamps
#       - Modify common abbreviations (e.g., "Dr." to "Doctor", "Pt." to "Patient")
#     """
#     if not isinstance(text, str):
#         return text
    
#     # expand contractions
#     cleaned_text = contractions.fix(text)
    
#     # remove disfluencies
#     cleaned_text = re.sub(r'\b(um|uh|erm|uhm|mmhmm|ah)\b', '', cleaned_text, flags=re.IGNORECASE)
    
#     # common abbreviation replacements
#     cleaned_text = re.sub(r'\bDr\.\s*', 'Doctor ', cleaned_text)
#     cleaned_text = re.sub(r'\bPt\.\s*', 'Patient ', cleaned_text)
#     cleaned_text = re.sub(r'\bOk\.\s*', 'Okay ', cleaned_text)
#     cleaned_text = re.sub(r'\bOhh\.\s*', 'Oh ', cleaned_text)
    
#     return cleaned_text


In [23]:
# parse Turn columns to extract Doctor and Patient utterances
def extract_utterances(turns):
    """Extract and format utterances from turns data."""
    if isinstance(turns, str):
        try:
            turns = ast.literal_eval(turns)
        except (ValueError, SyntaxError):
            return ''
    if not turns or not hasattr(turns, '__iter__'):
        return ''
    
    formatted_transcript = []
    
    for turn in turns:
        # Ensure turn is a dictionary
        if not isinstance(turn, dict):
            continue
            
        speaker = turn.get('speaker', '').upper()
        utterance = turn.get('text', '')
        
        # Format as "SPEAKER: utterance"
        if 'DOCTOR' in speaker:
            formatted_transcript.append(f"DOCTOR: {utterance}")
        elif 'PATIENT' in speaker:
            formatted_transcript.append(f"PATIENT: {utterance}")
    
    return ' '.join(formatted_transcript)

# Apply only to UK-Dataset rows
uk_mask = evaluate_data['source'] == 'UK-Dataset'
evaluate_data.loc[uk_mask, 'human-transcript'] = (evaluate_data.loc[uk_mask, 'human-transcript'].apply(extract_utterances))

In [24]:
# apply remove_timestamps to the human transcript column
evaluate_data['norm_human_transcript'] = evaluate_data['human-transcript'].apply(remove_timestamps)
evaluate_data['norm_whisper_asr'] = evaluate_data['Whisper-ASR'].apply(remove_timestamps)
evaluate_data['norm_phi4_asr'] = evaluate_data['Phi-4-ASR'].apply(remove_timestamps)
evaluate_data['norm_parakeet_asr'] = evaluate_data['Nvidia-Parakeet-ASR'].apply(remove_timestamps)
# evaluate_data['norm_granite'] = evaluate_data['IBM-Granite'].apply(remove_timestamps)

## Calculate WER

In [25]:
norm_columns = ['norm_whisper_asr', 'norm_phi4_asr', 'norm_parakeet_asr'] #'norm_parakeet_asr', 'norm_granite'
# Calculate WER for each ASR column
for norm_col in norm_columns:
    evaluate_data[f'{norm_col}_wer_compute'] = evaluate_data.apply(
        lambda row: compute_measures(
            row['norm_human_transcript'] if pd.notna(row['norm_human_transcript']) else "",
            row[norm_col] if pd.notna(row[norm_col]) else ""
        ), axis=1
    )
    evaluate_data[f'{norm_col}_wer'] = evaluate_data[f'{norm_col}_wer_compute'].apply(lambda measures: measures['wer'])
    evaluate_data[f'{norm_col}_ins'] = evaluate_data[f'{norm_col}_wer_compute'].apply(
        lambda measures: measures['insertions']
    )
    evaluate_data[f'{norm_col}_del'] = evaluate_data[f'{norm_col}_wer_compute'].apply(
        lambda measures: measures['deletions']
    )
    evaluate_data[f'{norm_col}_sub'] = evaluate_data[f'{norm_col}_wer_compute'].apply(
        lambda measures: measures['substitutions']
    )
    evaluate_data[f'{norm_col}_ops'] = evaluate_data[f'{norm_col}_wer_compute'].apply(
        lambda measures: measures['ops']
    )
    

In [26]:
# # apply wer calculation to primock as well
# primock_norm_columns = ['norm_whisper_doctor', 'norm_whisper_patient', 'norm_phi4_doctor', 'norm_phi4_patient',
#                         'norm_granite_doctor', 'norm_granite_patient', 'norm_parakeet_doctor', 'norm_parakeet_patient']
# for norm_col in primock_norm_columns:
#     primock_merged[f'{norm_col}_wer_compute'] = primock_merged.apply(
#         lambda row: compute_measures(
#             row['norm_human_doctor'] if 'doctor' in norm_col else row['norm_human_patient'],
#             row[norm_col] if pd.notna(row[norm_col]) else ""
#         ), axis=1
#     )
#     primock_merged[f'{norm_col}_wer'] = primock_merged[f'{norm_col}_wer_compute'].apply(lambda measures: measures['wer'])
#     primock_merged[f'{norm_col}_ins'] = primock_merged[f'{norm_col}_wer_compute'].apply(
#         lambda measures: measures['insertions']
#     )
#     primock_merged[f'{norm_col}_del'] = primock_merged[f'{norm_col}_wer_compute'].apply(
#         lambda measures: measures['deletions']
#     )
#     primock_merged[f'{norm_col}_sub'] = primock_merged[f'{norm_col}_wer_compute'].apply(
#         lambda measures: measures['substitutions']
#     )
#     primock_merged[f'{norm_col}_ops'] = primock_merged[f'{norm_col}_wer_compute'].apply(
#         lambda measures: measures['ops']
#     )

## Add Alignment for Sub, Del and Ins Words

In [27]:
def preprocess_alignment_ops(alignment_ops):
    # If already a list/dict, return as-is (may be nested list)
    if isinstance(alignment_ops, (list, dict)):
        return alignment_ops
    if alignment_ops is None or (isinstance(alignment_ops, float) and pd.isna(alignment_ops)):
        return []
    s = str(alignment_ops)
    s = s.replace("AlignmentChunk(", "{").replace(")", "}").replace("type=", "'type':")\
         .replace("ref_start_idx=", "'ref_start_idx':").replace("ref_end_idx=", "'ref_end_idx':")\
         .replace("hyp_start_idx=", "'hyp_start_idx':").replace("hyp_end_idx=", "'hyp_end_idx':")
    if s.startswith("[[") and s.endswith("]]"):
        s = s[1:-1]
    return s

def _get_field(op, field, default=None):
    # support dict-like
    if isinstance(op, dict):
        return op.get(field, default)
    # support object with attribute (AlignmentChunk)
    if hasattr(op, field):
        return getattr(op, field, default)
    # support tuple/list with numeric positions (fallback not used here)
    return default

def extract_words_from_alignment(ref_text, hyp_text, alignment_ops):
    deletions, insertions, substitutions, equals = [], [], [], []
    ref_words = ref_text.split() if isinstance(ref_text, str) else []
    hyp_words = hyp_text.split() if isinstance(hyp_text, str) else []

    # if hypothesis empty -> all deleted
    if not hyp_words:
        return ref_words, [], [], []

    processed = preprocess_alignment_ops(alignment_ops)

    # produce a flat list of op entries regardless of input shape
    if isinstance(processed, list):
        # flatten one level (handles [[...]] case)
        flat = []
        for item in processed:
            if isinstance(item, list):
                flat.extend(item)
            else:
                flat.append(item)
        alignment_ops_list = flat
    elif isinstance(processed, dict):
        alignment_ops_list = [processed]
    else:
        if not processed:
            return deletions, insertions, equals, substitutions
        try:
            alignment_ops_list = ast.literal_eval(processed)
            # if parsed to nested list, flatten one level
            if isinstance(alignment_ops_list, list) and any(isinstance(x, list) for x in alignment_ops_list):
                flat = []
                for item in alignment_ops_list:
                    if isinstance(item, list):
                        flat.extend(item)
                    else:
                        flat.append(item)
                alignment_ops_list = flat
        except Exception as e:
            raise ValueError(f"Failed to parse alignment_ops for row: {e}")

    for op in alignment_ops_list:
        typ = _get_field(op, "type")
        if typ == "delete":
            s = _get_field(op, "ref_start_idx", 0)
            e = _get_field(op, "ref_end_idx", 0)
            deletions.extend(ref_words[s:e])
        elif typ == "equal":
            s = _get_field(op, "hyp_start_idx", 0)
            e = _get_field(op, "hyp_end_idx", 0)
            equals.extend(hyp_words[s:e])
        elif typ == "insert":
            s = _get_field(op, "hyp_start_idx", 0)
            e = _get_field(op, "hyp_end_idx", 0)
            insertions.extend(hyp_words[s:e])
        elif typ == "substitute":
            s = _get_field(op, "ref_start_idx", 0)
            e = _get_field(op, "ref_end_idx", 0)
            substitutions.extend(ref_words[s:e])

    return deletions, insertions, equals, substitutions

In [28]:
# apply extract_words_from_alignment to each row in the DataFrame i.e Whisper ASR results and Phi-4 ASR results 

# asr_rows = ['norm_whisper_asr', 'norm_phi4_asr', 'norm_parakeet_asr', 'norm_granite']
asr_rows = ['norm_whisper_asr', 'norm_phi4_asr', 'norm_parakeet_asr']
# align_ops_rows = ['norm_whisper_asr_ops', 'norm_phi4_asr_ops', 'norm_parakeet_ops', 'norm_granite_ops']
align_ops_rows = ['norm_whisper_asr_ops', 'norm_phi4_asr_ops', 'norm_parakeet_asr_ops']

for asr_col, ops_col in zip(asr_rows, align_ops_rows):
    all_deletions = []
    all_insertions = []
    all_equals = []
    all_substitutions = []
    error_messages = []

    for index, row in evaluate_data.iterrows():
        try:
            result = extract_words_from_alignment(row['norm_human_transcript'], row[asr_col], row[ops_col])
            all_deletions.append(result[0])
            all_insertions.append(result[1])
            all_equals.append(result[2])
            all_substitutions.append(result[3])
            error_messages.append("")
        except Exception as e:
            all_deletions.append([])
            all_insertions.append([])
            all_equals.append([])
            all_substitutions.append([])
            error_messages.append(str(e))
            print(f"Error processing row {index} for {asr_col}: {str(e)}")

    # Creating DataFrame for the extracted words and error messages
    extracted_words_df = pd.DataFrame({
        f'{asr_col}_Deletions': all_deletions,
        f'{asr_col}_Insertions': all_insertions,
        # f'{asr_col}_Equals': all_equals,
        f'{asr_col}_Substitutions': all_substitutions,
        # f'{asr_col}_Error_Message': error_messages
    })

    # Concatenate the extracted words DataFrame with the original DataFrame
    evaluate_data = pd.concat([evaluate_data, extracted_words_df], axis=1, join='outer')


In [29]:
# # apply alignment function to primock as well
# primock_asr_rows = ['norm_whisper_doctor', 'norm_whisper_patient', 'norm_phi4_doctor', 'norm_phi4_patient',
#                     'norm_parakeet_doctor', 'norm_parakeet_patient', 'norm_granite_doctor', 'norm_granite_patient']
# primock_align_ops_rows = ['norm_whisper_doctor_ops', 'norm_whisper_patient_ops', 'norm_phi4_doctor_ops', 'norm_phi4_patient_ops', 'norm_parakeet_doctor_ops', 'norm_parakeet_patient_ops', 'norm_granite_doctor_ops', 'norm_granite_patient_ops']
# for asr_col, ops_col in zip(primock_asr_rows, primock_align_ops_rows):
#     all_deletions = []
#     all_insertions = []
#     all_equals = []
#     all_substitutions = []
#     error_messages = []

#     for index, row in primock_merged.iterrows():
#         try:
#             result = extract_words_from_alignment(
#                 row['norm_human_doctor'] if 'doctor' in asr_col else row['norm_human_patient'],
#                 row[asr_col],
#                 row[ops_col]
#             )
#             all_deletions.append(result[0])
#             all_insertions.append(result[1])
#             all_equals.append(result[2])
#             all_substitutions.append(result[3])
#             error_messages.append("")
#         except Exception as e:
#             all_deletions.append([])
#             all_insertions.append([])
#             all_equals.append([])
#             all_substitutions.append([])
#             error_messages.append(str(e))
#             print(f"Error processing row {index} for {asr_col}: {str(e)}")

#     # Creating DataFrame for the extracted words and error messages
#     extracted_words_df = pd.DataFrame({
#         f'{asr_col}_Deletions': all_deletions,
#         f'{asr_col}_Insertions': all_insertions,
#         # f'{asr_col}_Equals': all_equals,
#         f'{asr_col}_Substitutions': all_substitutions,
#         # f'{asr_col}_Error_Message': error_messages
#     })

#     # Concatenate the extracted words DataFrame with the original DataFrame
#     primock_merged = pd.concat([primock_merged, extracted_words_df], axis=1, join='outer')

## Reconstruct Human Transcript

In [30]:
def align_words(ref, hyp, ref_range):
    if pd.isna(ref) or pd.isna(hyp):
        return None
    
    ref = ref.split()
    hyp = hyp.split()
    lexicon = list(set(ref + hyp))
    word2digit = {word: chr(i) for i, word in enumerate(lexicon)}
    ref_uni = [word2digit[w] for w in ref]
    hyp_uni = [word2digit[w] for w in hyp]
    
    edit_ops = pd.DataFrame(Levenshtein.editops(''.join(ref_uni), ''.join(hyp_uni)), columns=['operation', 'ref_ix', 'hyp_ix'])
    aligned_ref, aligned_hyp = ref.copy(), hyp.copy()
    aligned_ops = ['='] * len(ref)
    aligned_ref_ix, aligned_hyp_ix = list(range(len(ref))), list(range(len(hyp)))
    ix_edit_ops = [np.NaN] * len(aligned_ref)

    ins_count, del_count = 0, 0
    for idx, ops in edit_ops.iterrows():
        if ops['operation'] == 'insert':
            aligned_ref.insert(ins_count + ops['ref_ix'], '_')
            aligned_ops.insert(ins_count + ops['ref_ix'], 'ins')
            aligned_ref_ix.insert(ins_count + ops['ref_ix'], None)
            ix_edit_ops.insert(ins_count + ops['ref_ix'], idx)
            ins_count += 1
        elif ops['operation'] == 'delete':
            aligned_hyp.insert(del_count + ops['hyp_ix'], '_')
            aligned_ops[ins_count + ops['ref_ix']] = 'del'
            aligned_hyp_ix.insert(del_count + ops['hyp_ix'], None)
            ix_edit_ops[ins_count + ops['ref_ix']] = idx
            del_count += 1
        elif ops['operation'] == 'replace':
            aligned_ops[ins_count + ops['ref_ix']] = 'sub'
            ix_edit_ops[ins_count + ops['ref_ix']] = idx

    aligned_df = pd.DataFrame({
        'ref_ix': aligned_ref_ix,
        'hyp_ix': aligned_hyp_ix,
        'reference': aligned_ref,
        'hypothesis': aligned_hyp,
        'operation': aligned_ops,
        'index_edit_ops': ix_edit_ops
    }).astype({'ref_ix': 'Int32', 'hyp_ix': 'Int32', 'index_edit_ops': 'Int32'})


    return aligned_df
# apply align_words to whisper_norm_asr and phi4_norm_asr
evaluate_data['whisper_aligned_df'] = evaluate_data.apply(
    lambda row: align_words(row['norm_human_transcript'], row['norm_whisper_asr'], None), axis=1
)
evaluate_data['phi4_aligned_df'] = evaluate_data.apply(
    lambda row: align_words(row['norm_human_transcript'], row['norm_phi4_asr'], None), axis=1
)
evaluate_data['parakeet_aligned_df'] = evaluate_data.apply(
    lambda row: align_words(row['norm_human_transcript'], row['norm_parakeet_asr'], None), axis=1
)
# evaluate_data['granite_aligned_df'] = evaluate_data.apply(
#     lambda row: align_words(row['norm_human_transcript'], row['norm_granite'], None), axis=1
# )

In [31]:
# use output of aligned words to analyze equal, deletions, insertions, substitutions and reconstruct the reference text while marking the errors

def reconstruct_reference_with_errors(aligned_df):
    # handle None, non-DataFrame, and empty
    if aligned_df is None or not isinstance(aligned_df, pd.DataFrame) or aligned_df.empty:
        return ""
    
    reconstructed = []
    for _, row in aligned_df.iterrows():
        op = row.get('operation')
        ref_word = row.get('reference', '')
        hyp_word = row.get('hypothesis', '')
        if op == '=':
            reconstructed.append(ref_word)
        elif op == 'ins':
            # show the inserted hypothesis word instead of '_' placeholder
            reconstructed.append(f"[INS:{hyp_word}]")
        elif op == 'del':
            reconstructed.append(f"[DEL:{ref_word}]")
        elif op == 'sub':
            reconstructed.append(f"[SUB:{ref_word}->{hyp_word}]")
    return ' '.join(w for w in reconstructed if isinstance(w, str))

# apply to all aligned dfs
evaluate_data['whisper_reconstructed_ref'] = evaluate_data['whisper_aligned_df'].apply(reconstruct_reference_with_errors)
evaluate_data['phi4_reconstructed_ref'] = evaluate_data['phi4_aligned_df'].apply(reconstruct_reference_with_errors)
evaluate_data['parakeet_reconstructed_ref'] = evaluate_data['parakeet_aligned_df'].apply(reconstruct_reference_with_errors)
# evaluate_data['granite_reconstructed_ref'] = evaluate_data['granite_aligned_df'].apply(reconstruct_reference_with_errors)



In [32]:
# ## Reconstruct human transcript based on ASR outputs and alignment ops; color-code the errors encountered
# def reconstruct_human_transcript(hyp_text, alignment_ops):
#     reconstructed = []
#     hyp_words = hyp_text.split() if isinstance(hyp_text, str) else []

#     # if hypothesis empty -> return empty
#     if not hyp_words:
#         return ""

#     processed = preprocess_alignment_ops(alignment_ops)

#     # produce a flat list of op entries regardless of input shape
#     if isinstance(processed, list):
#         # flatten one level (handles [[...]] case)
#         flat = []
#         for item in processed:
#             if isinstance(item, list):
#                 flat.extend(item)
#             else:
#                 flat.append(item)
#         alignment_ops_list = flat
#     elif isinstance(processed, dict):
#         alignment_ops_list = [processed]
#     else:
#         if not processed:
#             return ""
#         try:
#             alignment_ops_list = ast.literal_eval(processed)
#             # if parsed to nested list, flatten one level
#             if isinstance(alignment_ops_list, list) and any(isinstance(x, list) for x in alignment_ops_list):
#                 flat = []
#                 for item in alignment_ops_list:
#                     if isinstance(item, list):
#                         flat.extend(item)
#                     else:
#                         flat.append(item)
#                 alignment_ops_list = flat
#         except Exception as e:
#             raise ValueError(f"Failed to parse alignment_ops: {e}")

#     for op in alignment_ops_list:
#         typ = _get_field(op, "type")
#         if typ == "delete":
#             s = _get_field(op, "ref_start_idx", 0)
#             e = _get_field(op, "ref_end_idx", 0)
#             deleted_words = f"[DEL: {' '.join(['__'+w+'__' for w in hyp_words[s:e]])}]"
#             reconstructed.append(deleted_words)
#         elif typ == "equal":
#             s = _get_field(op, "hyp_start_idx", 0)
#             e = _get_field(op, "hyp_end_idx", 0)
#             reconstructed.extend(hyp_words[s:e])
#         elif typ == "insert":
#             s = _get_field(op, "hyp_start_idx", 0)
#             e = _get_field(op, "hyp_end_idx", 0)
#             inserted_words = f"[INS: {' '.join(['++'+w+'++' for w in hyp_words[s:e]])}]"
#             reconstructed.append(inserted_words)
#         elif typ == "substitute":
#             s = _get_field(op, "ref_start_idx", 0)
            
#             e = _get_field(op, "ref_end_idx", 0)
#             substituted_words = f"[SUB: {' '.join(['~~'+w+'~~' for w in hyp_words[s:e]])}]"
#             reconstructed.append(substituted_words)
#     return ' '.join(reconstructed)

# # apply reconstruct_human_transcript to each ASR column
# for asr_col, ops_col in zip(asr_rows, align_ops_rows):
#     evaluate_data[f'{asr_col}_reconstructed_human'] = evaluate_data.apply(
#         lambda row: reconstruct_human_transcript(row['norm_human_transcript'], row[ops_col]), axis=1
#     )

# # # apply to whipser ASR only for now
# # evaluate_data['norm_whisper_asr_reconstructed_human'] = evaluate_data.apply(
# #     lambda row: reconstruct_human_transcript(row['norm_human_transcript'], row['norm_whisper_asr_ops']), axis=1
# # )    

In [33]:
# # reconstruct human transcript for primock as well
# primock_asr_rows = ['norm_whisper_doctor', 'norm_whisper_patient', 'norm_phi4_doctor', 'norm_phi4_patient',
#                     'norm_parakeet_doctor', 'norm_parakeet_patient', 'norm_granite_doctor', 'norm_granite_patient']
# primock_align_ops_rows = ['norm_whisper_doctor_ops', 'norm_whisper_patient_ops', 'norm_phi4_doctor_ops', 'norm_phi4_patient_ops', 'norm_parakeet_doctor_ops', 'norm_parakeet_patient_ops', 'norm_granite_doctor_ops', 'norm_granite_patient_ops']
# for asr_col, ops_col in zip(primock_asr_rows, primock_align_ops_rows):
#     primock_merged[f'{asr_col}_reconstructed_human'] = primock_merged.apply(
#         lambda row: reconstruct_human_transcript(
#             row['norm_human_doctor'] if 'doctor' in asr_col else row['norm_human_patient'],
#             row[ops_col]
#         ), axis=1
#     )

In [34]:
# # apply align_words to primock as well
# primock_merged['whisper_doctor_aligned_df'] = primock_merged.apply(
#     lambda row: align_words(row['norm_human_doctor'], row['norm_whisper_doctor'], None), axis=1
# )
# primock_merged['whisper_patient_aligned_df'] = primock_merged.apply(
#     lambda row: align_words(row['norm_human_patient'], row['norm_whisper_patient'], None), axis=1
# )
# primock_merged['phi4_doctor_aligned_df'] = primock_merged.apply(
#     lambda row: align_words(row['norm_human_doctor'], row['norm_phi4_doctor'], None), axis=1
# )
# primock_merged['phi4_patient_aligned_df'] = primock_merged.apply(
#     lambda row: align_words(row['norm_human_patient'], row['norm_phi4_patient'], None), axis=1
# )
# primock_merged['parakeet_doctor_aligned_df'] = primock_merged.apply(
#     lambda row: align_words(row['norm_human_doctor'], row['norm_parakeet_doctor'], None), axis=1
# )
# primock_merged['parakeet_patient_aligned_df'] = primock_merged.apply(
#     lambda row: align_words(row['norm_human_patient'], row['norm_parakeet_patient'], None), axis=1
# )
# primock_merged['granite_doctor_aligned_df'] = primock_merged.apply(
#     lambda row: align_words(row['norm_human_doctor'], row['norm_granite_doctor'], None), axis=1
# )
# primock_merged['granite_patient_aligned_df'] = primock_merged.apply(
#     lambda row: align_words(row['norm_human_patient'], row['norm_granite_patient'], None), axis=1
# )

# # apply reconstruct_reference_with_errors to all aligned dfs in primock
# primock_merged['whisper_doctor_reconstructed_ref'] = primock_merged['whisper_doctor_aligned_df'].apply(reconstruct_reference_with_errors)
# primock_merged['whisper_patient_reconstructed_ref'] = primock_merged['whisper_patient_aligned_df'].apply(reconstruct_reference_with_errors)
# primock_merged['phi4_doctor_reconstructed_ref'] = primock_merged['phi4_doctor_aligned_df'].apply(reconstruct_reference_with_errors)
# primock_merged['phi4_patient_reconstructed_ref'] = primock_merged['phi4_patient_aligned_df'].apply(reconstruct_reference_with_errors)
# primock_merged['parakeet_doctor_reconstructed_ref'] = primock_merged['parakeet_doctor_aligned_df'].apply(reconstruct_reference_with_errors)
# primock_merged['parakeet_patient_reconstructed_ref'] = primock_merged['parakeet_patient_aligned_df'].apply(reconstruct_reference_with_errors)
# primock_merged['granite_doctor_reconstructed_ref'] = primock_merged['granite_doctor_aligned_df'].apply(reconstruct_reference_with_errors)
# primock_merged['granite_patient_reconstructed_ref'] = primock_merged['granite_patient_aligned_df'].apply(reconstruct_reference_with_errors) 

## Save Files

In [35]:
# remove non-essential columns that ends with _asr_wer_compute, and _asr_ops
cols_to_drop = [col for col in evaluate_data.columns if col.endswith('_asr_wer_compute') or col.endswith('_asr_ops')]
evaluate_data.drop(columns=cols_to_drop, inplace=True)

# save the dataframe to a new excel file
evaluate_data.to_excel('results/all_result_processed_normalized.xlsx', index=False, engine='openpyxl')

In [36]:
# load the excel file and display the first few rows
df = pd.read_excel('results/all_result_processed_normalized.xlsx', engine='openpyxl')

# move each models' results to separate sheets in the excel file
with pd.ExcelWriter('results/all_result_separate_sheets_normalized.xlsx', engine='openpyxl') as writer:
    whisper_cols = ['utterance_id', 'source', 'duration_sec', 'human-transcript', 'Whisper-ASR', 'norm_human_transcript', 'norm_whisper_asr', 
                    'norm_whisper_asr_wer', 'norm_whisper_asr_ins', 'norm_whisper_asr_del', 
                    'norm_whisper_asr_sub', 'whisper_aligned_df',
                    'norm_whisper_asr_Deletions', 'norm_whisper_asr_Insertions', 'norm_whisper_asr_Substitutions', 'whisper_reconstructed_ref']
    phi4_cols = ['utterance_id', 'source', 'duration_sec', 'human-transcript', 'Phi-4-ASR', 'norm_human_transcript', 'norm_phi4_asr', 
                 'norm_phi4_asr_wer', 'norm_phi4_asr_ins', 'norm_phi4_asr_del', 
                 'norm_phi4_asr_sub', 'phi4_aligned_df',
                 'norm_phi4_asr_Deletions', 'norm_phi4_asr_Insertions', 'norm_phi4_asr_Substitutions', 'phi4_reconstructed_ref']
    parakeet_cols = ['utterance_id', 'source', 'duration_sec', 'human-transcript', 'Nvidia-Parakeet-ASR', 'norm_human_transcript', 'norm_parakeet_asr', 
                     'norm_parakeet_wer', 'norm_parakeet_ins', 'norm_parakeet_del', 
                     'norm_parakeet_sub', 'parakeet_aligned_df',
                     'norm_parakeet_Deletions', 'norm_parakeet_Insertions', 'norm_parakeet_Substitutions', 'parakeet_reconstructed_ref']
    # granite_cols = ['utterance_id', 'source', 'duration_sec', 'human-transcript', 'IBM-Granite', 'norm_human_transcript', 'norm_granite', 
    #                 'norm_granite_wer', 'norm_granite_ins', 'norm_granite_del', 
    #                 'norm_granite_sub', 'granite_aligned_df',
    #                 'norm_granite_Deletions', 'norm_granite_Insertions', 'norm_granite_Substitutions', 'granite_reconstructed_ref']
    df_whisper = df.filter(items=whisper_cols, axis=1)
    df_phi4 = df.filter(items=phi4_cols, axis=1)
    df_parakeet = df.filter(items=parakeet_cols, axis=1)
    # df_granite = df.filter(items=granite_cols, axis=1)

    # Only write non-empty DataFrames to avoid invisible sheet error
    if not df_whisper.empty:
        df_whisper.to_excel(writer, sheet_name='Whisper-ASR Results', index=False)
    if not df_phi4.empty:
        df_phi4.to_excel(writer, sheet_name='Phi-4-ASR Results', index=False)
    if not df_parakeet.empty:
        df_parakeet.to_excel(writer, sheet_name='Nvidia-Parakeet-ASR Results', index=False)
    # if not df_granite.empty:
    #     df_granite.to_excel(writer, sheet_name='IBM-Granite Results', index=False)
        
    # resize each row in the sheets to 120px
    for sheet_name in ['Whisper-ASR Results', 'Phi-4-ASR Results', 'Nvidia-Parakeet-ASR Results']: #'Nvidia-Parakeet-ASR Results', 'IBM-Granite Results'
        worksheet = writer.sheets.get(sheet_name)
        if worksheet:
            for row_idx in range(1, len(df) + 2):  # +2 to account for header row and 1-based indexing
                worksheet.row_dimensions[row_idx].height = 120


In [37]:
# # seperate primock results into separate sheets as well
# with pd.ExcelWriter('primock_result_separate_sheets.xlsx', engine='openpyxl') as writer:
#     whisper_cols = ['conversation_id', 'turns', 'doctor_utterances', 'patient_utterances', 'norm_human_doctor', 'norm_human_patient', 'norm_whisper_doctor', 
#                     'norm_whisper_patient', 'norm_whisper_doctor_wer', 'norm_whisper_doctor_ins', 'norm_whisper_doctor_del', 
#                     'norm_whisper_doctor_sub', 'whisper_doctor_aligned_df',
#                     'norm_whisper_doctor_Deletions', 'norm_whisper_doctor_Insertions', 'norm_whisper_doctor_Substitutions', 'whisper_doctor_reconstructed_ref',
#                     'norm_whisper_patient_wer', 'norm_whisper_patient_ins', 'norm_whisper_patient_del', 
#                     'norm_whisper_patient_sub', 'whisper_patient_aligned_df',
#                     'norm_whisper_patient_Deletions', 'norm_whisper_patient_Insertions', 'norm_whisper_patient_Substitutions', 'whisper_patient_reconstructed_ref']
#     phi4_cols = ['conversation_id', 'turns', 'doctor_utterances', 'patient_utterances', 'norm_human_doctor', 'norm_human_patient', 'norm_phi4_doctor', 
#                  'norm_phi4_patient', 'norm_phi4_doctor_wer', 'norm_phi4_doctor_ins', 'norm_phi4_doctor_del', 
#                  'norm_phi4_doctor_sub', 'phi4_doctor_aligned_df',
#                  'norm_phi4_doctor_Deletions', 'norm_phi4_doctor_Insertions', 'norm_phi4_doctor_Substitutions', 'phi4_doctor_reconstructed_ref',
#                  'norm_phi4_patient_wer', 'norm_phi4_patient_ins', 'norm_phi4_patient_del', 
#                  'norm_phi4_patient_sub', 'phi4_patient_aligned_df',
#                  'norm_phi4_patient_Deletions', 'norm_phi4_patient_Insertions', 'norm_phi4_patient_Substitutions', 'phi4_patient_reconstructed_ref']
#     parakeet_cols = ['conversation_id', 'turns', 'doctor_utterances', 'patient_utterances', 'norm_human_doctor', 'norm_human_patient', 'norm_parakeet_doctor', 
#                      'norm_parakeet_patient', 'norm_parakeet_doctor_wer', 'norm_parakeet_doctor_ins', 'norm_parakeet_doctor_del', 
#                      'norm_parakeet_doctor_sub', 'parakeet_doctor_aligned_df',
#                      'norm_parakeet_doctor_Deletions', 'norm_parakeet_doctor_Insertions', 'norm_parakeet_doctor_Substitutions', 'parakeet_doctor_reconstructed_ref',
#                      'norm_parakeet_patient_wer', 'norm_parakeet_patient_ins', 'norm_parakeet_patient_del', 
#                      'norm_parakeet_patient_sub', 'parakeet_patient_aligned_df',
#                      'norm_parakeet_patient_Deletions', 'norm_parakeet_patient_Insertions', 'norm_parakeet_patient_Substitutions', 'parakeet_patient_reconstructed_ref']
#     granite_cols = ['conversation_id', 'turns', 'doctor_utterances', 'patient_utterances', 'norm_human_doctor', 'norm_human_patient', 'norm_granite_doctor', 
#                     'norm_granite_patient', 'norm_granite_doctor_wer', 'norm_granite_doctor_ins', 'norm_granite_doctor_del', 
#                     'norm_granite_doctor_sub', 'granite_doctor_aligned_df',
#                     'norm_granite_doctor_Deletions', 'norm_granite_doctor_Insertions', 'norm_granite_doctor_Substitutions', 'granite_doctor_reconstructed_ref',
#                     'norm_granite_patient_wer', 'norm_granite_patient_ins', 'norm_granite_patient_del', 
#                     'norm_granite_patient_sub', 'granite_patient_aligned_df',
#                     'norm_granite_patient_Deletions', 'norm_granite_patient_Insertions', 'norm_granite_patient_Substitutions', 'granite_patient_reconstructed_ref']
#     df_whisper = primock_merged.filter(items=whisper_cols, axis=1)
#     df_phi4 = primock_merged.filter(items=phi4_cols, axis=1)
#     df_parakeet = primock_merged.filter(items=parakeet_cols, axis=1)
#     df_granite = primock_merged.filter(items=granite_cols, axis=1)
#     if not df_whisper.empty:
#         df_whisper.to_excel(writer, sheet_name='Whisper-ASR Results', index=False)
#     if not df_phi4.empty:
#         df_phi4.to_excel(writer, sheet_name='Phi-4-ASR Results', index=False)
#     if not df_parakeet.empty:
#         df_parakeet.to_excel(writer, sheet_name='Nvidia-Parakeet-ASR Results', index=False)
#     if not df_granite.empty:
#         df_granite.to_excel(writer, sheet_name='IBM-Granite Results', index=False)
    
# # resize each row in the sheets to 120px
#     for sheet_name in ['Whisper-ASR Results', 'Phi-4-ASR Results', 'Nvidia-Parakeet-ASR Results', 'IBM-Granite Results']:
#         worksheet = writer.sheets.get(sheet_name)
#         if worksheet:
#             for row_idx in range(1, len(primock_merged) + 2):  # +2 to account for header row and 1-based indexing
#                 worksheet.row_dimensions[row_idx].height = 120
    
    
                


In [39]:
# select three session with the following utterance ids: day4_consultation07, 1_Malaria, RES0073
selected_utterances = ['2_Diarrhea', '18_Pneumonia', '46aacf84-fdd1-490b-a857-633d2e7763a0_7d4de4c9d3488a4bbd35634cbd3a2b66_l1RjPEwA']
selected_data = evaluate_data[evaluate_data['utterance_id'].isin(selected_utterances)]

# make all columns lowercase for better readability
selected_data.columns = [col.lower() for col in selected_data.columns]

# rename "phi-4-asr" column to "phi4-asr" for better readability
selected_data.rename(columns={'phi-4-asr': 'phi4-asr'}, inplace=True)

# save the selected data to a new excel file
selected_data.to_excel('results/selected_sessions_normalized.xlsx', index=False, engine='openpyxl')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_data.rename(columns={'phi-4-asr': 'phi4-asr'}, inplace=True)
