# Recognizer vs MFA Alignment Scoring
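This notebook compares phonetic recognizer outputs with Montreal Forced Aligner (MFA) alignments using three signals: the token-level edit distance between the two IPA transcriptions, a crude approximation of a soft-DTW cost, and vowel formant validation.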

In [None]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean
import json


In [None]:
def levenshtein_distance(s1, s2):
    """Return the token-level Levenshtein distance between two strings.

    Both inputs are split on whitespace, so every whitespace-delimited token
    (for example one phone in ``"b ə n æ n ə"``) counts as a single symbol.
    Insertions, deletions and substitutions each cost 1.

    Args:
        s1: Whitespace-separated token string.
        s2: Whitespace-separated token string to compare against ``s1``.

    Returns:
        The edit distance as a NumPy integer scalar.
    """
    source_tokens = s1.split()
    target_tokens = s2.split()
    rows = len(source_tokens) + 1
    cols = len(target_tokens) + 1

    # table[i, j] holds the distance between the first i source tokens and
    # the first j target tokens; the borders are the all-insert/all-delete
    # base cases.
    table = np.zeros((rows, cols), dtype=int)
    table[:, 0] = np.arange(rows)
    table[0, :] = np.arange(cols)

    for i, src in enumerate(source_tokens, start=1):
        for j, tgt in enumerate(target_tokens, start=1):
            if src == tgt:
                table[i, j] = table[i - 1, j - 1]
                continue
            cheapest_neighbour = min(
                table[i - 1, j], table[i, j - 1], table[i - 1, j - 1]
            )
            table[i, j] = cheapest_neighbour + 1
    return table[rows - 1, cols - 1]


In [None]:
def soft_dtw_cost(audio_frames, ipa_embeddings):
    """Approximate an alignment cost between audio frames and IPA embeddings.

    This is a crude stand-in for soft-DTW: for every embedding it takes the
    Euclidean distance to the closest audio frame and returns the mean of
    those minima. No monotonic path constraint is applied.

    Args:
        audio_frames: Array-like of shape (T, D), one row per audio frame.
        ipa_embeddings: Array-like of shape (L, D), one row per embedding.

    Returns:
        The mean nearest-frame distance as a NumPy float scalar.

    Raises:
        ValueError: If either input is not 2-D, if the feature dimensions
            differ, or if either input has no rows.
    """
    # Cast to float so integer (notably unsigned) inputs cannot wrap around
    # when subtracted.
    frames = np.asarray(audio_frames, dtype=float)
    embeddings = np.asarray(ipa_embeddings, dtype=float)

    if frames.ndim != 2 or embeddings.ndim != 2:
        raise ValueError(
            "audio_frames and ipa_embeddings must both be 2-D, got shapes "
            f"{frames.shape} and {embeddings.shape}"
        )
    if frames.shape[1] != embeddings.shape[1]:
        # Without this check a feature dimension of 1 would silently
        # broadcast against the other input.
        raise ValueError(
            "feature dimensions differ: "
            f"{frames.shape[1]} (audio_frames) vs {embeddings.shape[1]} (ipa_embeddings)"
        )
    if frames.shape[0] == 0 or embeddings.shape[0] == 0:
        raise ValueError("audio_frames and ipa_embeddings must each contain at least one row")

    # (T, 1, D) - (1, L, D) -> (T, L, D); the norm over the last axis gives
    # the full (T, L) pairwise distance matrix in one vectorised step.
    differences = frames[:, np.newaxis, :] - embeddings[np.newaxis, :, :]
    cost_matrix = np.linalg.norm(differences, axis=-1)

    # Closest frame per embedding, averaged over embeddings.
    return np.mean(np.min(cost_matrix, axis=0))


In [None]:
def compare_vowel_formants(f1, f2, recognizer_ref, mfa_ref):
    """Measure how far an observed (f1, f2) point lies from two references.

    Args:
        f1: First coordinate of the observed formant pair.
        f2: Second coordinate of the observed formant pair.
        recognizer_ref: Two-element reference point for the recognizer.
        mfa_ref: Two-element reference point for MFA.

    Returns:
        A dict with the Euclidean distance to ``recognizer_ref`` under
        ``"target_match"`` and to ``mfa_ref`` under ``"mfa_match"``.
        Smaller values mean the observation is closer to that reference.
    """
    observed = (f1, f2)
    distance_to_recognizer = euclidean(observed, recognizer_ref)
    distance_to_mfa = euclidean(observed, mfa_ref)
    return {
        "target_match": distance_to_recognizer,
        "mfa_match": distance_to_mfa,
    }


In [None]:
def score_alignment(word, recognizer_ipa, mfa_ipa, f1, f2,
                    recognizer_ref_formants, mfa_ref_formants,
                    audio_frames, ipa_embeddings):
    """Score one word's recognizer transcription against its MFA transcription.

    Combines three measurements: the token-level edit distance between the
    two IPA strings, the approximate soft-DTW cost between the audio frames
    and the IPA embeddings, and the distance of the observed formants to the
    recognizer and MFA reference formants.

    The returned ``"flag"`` is chosen in priority order:
      * ``"discrepant_vowel"`` when the observed formants are strictly closer
        to the MFA reference than to the recognizer reference;
      * ``"high_edit_distance"`` when the edit distance exceeds 2;
      * ``"confident"`` otherwise.

    Returns:
        A dict with the keys ``word``, ``recognizer_ipa``, ``mfa_ipa``,
        ``edit_distance`` (int), ``soft_dtw_cost`` (float),
        ``formant_agreement`` (dict of both distances) and ``flag``.
    """
    token_edits = levenshtein_distance(recognizer_ipa, mfa_ipa)
    alignment_cost = soft_dtw_cost(audio_frames, ipa_embeddings)
    formants = compare_vowel_formants(
        f1, f2, recognizer_ref_formants, mfa_ref_formants
    )

    # The vowel check takes precedence: a formant disagreement is reported
    # even when the edit distance is also high.
    if formants["mfa_match"] < formants["target_match"]:
        verdict = "discrepant_vowel"
    elif token_edits > 2:
        verdict = "high_edit_distance"
    else:
        verdict = "confident"

    result = {}
    result["word"] = word
    result["recognizer_ipa"] = recognizer_ipa
    result["mfa_ipa"] = mfa_ipa
    # Cast NumPy scalars to built-in int/float.
    result["edit_distance"] = int(token_edits)
    result["soft_dtw_cost"] = float(alignment_cost)
    result["formant_agreement"] = formants
    result["flag"] = verdict
    return result


In [None]:
def build_scoring_dataframe(words, recognizer_ipas, mfa_ipas, formant_pairs,
                            recognizer_refs, mfa_refs, audio_frame_list, ipa_embedding_list):
    """Score every word and collect the results into a DataFrame.

    The eight inputs are parallel sequences: position ``i`` of each one
    describes the same word. Each position is passed to ``score_alignment``
    and the resulting dicts become the rows of the returned frame.

    Args:
        words: Sequence of word labels.
        recognizer_ipas: Recognizer IPA string per word.
        mfa_ipas: MFA IPA string per word.
        formant_pairs: Per-word pair whose items 0 and 1 are passed as
            ``f1`` and ``f2``.
        recognizer_refs: Recognizer reference formants per word.
        mfa_refs: MFA reference formants per word.
        audio_frame_list: Audio frame array per word.
        ipa_embedding_list: IPA embedding array per word.

    Returns:
        A ``pandas.DataFrame`` with one row per word (empty if there are no
        words).

    Raises:
        ValueError: If the inputs do not all have the same length. Checking
            up front avoids scoring misaligned data or silently ignoring
            trailing entries of a longer input.
    """
    lengths = {
        "words": len(words),
        "recognizer_ipas": len(recognizer_ipas),
        "mfa_ipas": len(mfa_ipas),
        "formant_pairs": len(formant_pairs),
        "recognizer_refs": len(recognizer_refs),
        "mfa_refs": len(mfa_refs),
        "audio_frame_list": len(audio_frame_list),
        "ipa_embedding_list": len(ipa_embedding_list),
    }
    if len(set(lengths.values())) > 1:
        raise ValueError(f"All inputs must have the same length, got {lengths}")

    rows = zip(words, recognizer_ipas, mfa_ipas, formant_pairs,
               recognizer_refs, mfa_refs, audio_frame_list, ipa_embedding_list)
    records = [
        score_alignment(
            word=word,
            recognizer_ipa=recognizer_ipa,
            mfa_ipa=mfa_ipa,
            f1=formants[0],
            f2=formants[1],
            recognizer_ref_formants=recognizer_ref,
            mfa_ref_formants=mfa_ref,
            audio_frames=frames,
            ipa_embeddings=embeddings,
        )
        for (word, recognizer_ipa, mfa_ipa, formants,
             recognizer_ref, mfa_ref, frames, embeddings) in rows
    ]
    return pd.DataFrame.from_records(records)


In [None]:
# Example mock data for testing
# Each list is parallel: index i of every list describes the same word.
words = ["elephant", "banana", "zebra"]
recognizer_ipas = ["ɛ l ə f ə n t", "b ə n æ n ə", "z i b ɹ ə"]
mfa_ipas = ["ɛ l ə f n̩ t", "b ə n ə n ə", "z ɛ b ɹ ə"]
# formant_pairs deliberately equals mfa_refs, so the distance to the MFA
# reference is 0 for every word while the recognizer distance is not.
formant_pairs = [(400, 1800), (500, 1600), (450, 1700)]
recognizer_refs = [(600, 1900), (550, 1500), (480, 1600)]
mfa_refs = [(400, 1800), (500, 1600), (450, 1700)]
# Seeded generator so the random frames/embeddings, and therefore the
# soft_dtw_cost values in the output, are reproducible from run to run.
mock_rng = np.random.default_rng(0)
audio_frame_list = [mock_rng.random((60, 16)) for _ in range(3)]
# NOTE: 7 embeddings per word is a placeholder; it does not track the
# number of phones in each transcription (e.g. "z i b ɹ ə" has 5).
ipa_embedding_list = [mock_rng.random((7, 16)) for _ in range(3)]

df = build_scoring_dataframe(words, recognizer_ipas, mfa_ipas, formant_pairs,
                             recognizer_refs, mfa_refs, audio_frame_list, ipa_embedding_list)
df


In [None]:
# Export to JSON
# One JSON object per scored word (orient="records"), pretty-printed.
output_path = "recognizer_mfa_scoring_output.json"
# force_ascii=False keeps the IPA symbols as readable characters in the file
# instead of \uXXXX escape sequences; the JSON content is otherwise the same.
df.to_json(output_path, orient="records", indent=2, force_ascii=False)
print(f"Saved to {output_path}")
