# 🧠 Recognizer vs MFA Alignment & Acoustic Scoring

This notebook performs detailed comparisons between phonetic recognizer outputs and Montreal Forced Aligner (MFA) results. It includes:

- Edit distance between IPA sequences
- Soft-DTW alignment cost (uses `soft_dtw_cuda` when it is installed; otherwise a simple nearest-frame distance stands in for it)
- Vowel formant validation
- Batch processing
- Visualization of results
- TextGrid generation


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial.distance import euclidean
import json
from textgrid import TextGrid, IntervalTier
import os


In [None]:
# Optional dependency: use the soft_dtw_cuda implementation of Soft-DTW when
# the module can be imported.
try:
    from soft_dtw_cuda import SoftDTW
    use_soft_dtw_cuda = True
except ImportError:
    # The module is missing: record that in the flag, which soft_dtw_cost
    # checks to decide which branch to run, and tell the user.
    use_soft_dtw_cuda = False
    print("⚠️ soft_dtw_cuda not found. Falling back to mock DTW.")


In [None]:
def levenshtein_distance(s1, s2):
    """Return the token-level Levenshtein distance between two strings.

    Both inputs are split on whitespace, so every whitespace-separated symbol
    counts as one unit. Insertions, deletions and substitutions each cost 1.

    Args:
        s1: First whitespace-delimited symbol sequence.
        s2: Second whitespace-delimited symbol sequence.

    Returns:
        The minimum number of token edits turning ``s1`` into ``s2``, as a
        NumPy integer scalar.
    """
    source_tokens = s1.split()
    target_tokens = s2.split()
    rows = len(source_tokens) + 1
    cols = len(target_tokens) + 1

    # table[r, c] holds the distance between the first r source tokens and
    # the first c target tokens; the borders are the all-insert/all-delete
    # cases.
    table = np.zeros((rows, cols), dtype=int)
    table[0, :] = np.arange(cols)
    table[:, 0] = np.arange(rows)

    for row, source_token in enumerate(source_tokens, start=1):
        for col, target_token in enumerate(target_tokens, start=1):
            if source_token == target_token:
                # Matching tokens cost nothing extra.
                table[row, col] = table[row - 1, col - 1]
                continue
            table[row, col] = 1 + min(
                table[row - 1, col],      # deletion
                table[row, col - 1],      # insertion
                table[row - 1, col - 1],  # substitution
            )
    return table[rows - 1, cols - 1]


In [None]:
def soft_dtw_cost(audio_frames, ipa_embeddings, gamma=0.1):
    """Return an alignment cost between audio frames and IPA embeddings.

    When the module-level flag ``use_soft_dtw_cuda`` is true, the cost is the
    Soft-DTW loss from the ``SoftDTW`` class. Otherwise a stand-in is used:
    for every IPA embedding, take the Euclidean distance to its nearest audio
    frame, and return the mean of those distances.

    Args:
        audio_frames: 2-D array indexed as ``[frame, feature]``.
        ipa_embeddings: 2-D array indexed as ``[symbol, feature]``; the
            feature axis must match ``audio_frames``.
        gamma: Smoothing parameter passed to ``SoftDTW``; ignored by the
            fallback.

    Returns:
        The cost as a Python ``float``. The fallback returns ``nan`` when
        either input has zero rows, since no alignment exists.

    Raises:
        ValueError: In the fallback, if either input is not 2-D.
    """
    if use_soft_dtw_cuda:
        import torch

        # The package can be installed on a machine without a GPU; in that
        # case run SoftDTW on the CPU instead of crashing on a CUDA transfer.
        cuda_available = torch.cuda.is_available()
        device = torch.device("cuda" if cuda_available else "cpu")
        # SoftDTW works on batches, so add a leading batch axis of size 1.
        x = torch.tensor(audio_frames[None, :, :], dtype=torch.float32, device=device)
        y = torch.tensor(ipa_embeddings[None, :, :], dtype=torch.float32, device=device)
        loss_fn = SoftDTW(use_cuda=cuda_available, gamma=gamma)
        return float(loss_fn(x, y).item())

    frames = np.asarray(audio_frames)
    embeddings = np.asarray(ipa_embeddings)
    # Unpacking enforces that both inputs are 2-D.
    num_frames, _ = frames.shape
    num_symbols, _ = embeddings.shape
    if num_frames == 0 or num_symbols == 0:
        return float("nan")

    # Vectorize over frames but loop over symbols, so memory stays at
    # O(frames * features) rather than O(frames * symbols * features).
    nearest_frame_distances = [
        np.linalg.norm(frames - embedding, axis=1).min()
        for embedding in embeddings
    ]
    return float(np.mean(nearest_frame_distances))


In [None]:
def compare_vowel_formants(f1, f2, recognizer_ref, mfa_ref):
    """Measure how far a formant pair lies from two reference points.

    Args:
        f1: First formant value of the measured vowel.
        f2: Second formant value of the measured vowel.
        recognizer_ref: Two-element reference point for the recognizer's vowel.
        mfa_ref: Two-element reference point for the MFA's vowel.

    Returns:
        A dict with the Euclidean distance from ``(f1, f2)`` to
        ``recognizer_ref`` under ``"target_match"`` and to ``mfa_ref`` under
        ``"mfa_match"``. Smaller values mean a closer match.
    """
    measured = (f1, f2)
    distance_to_recognizer = euclidean(measured, recognizer_ref)
    distance_to_mfa = euclidean(measured, mfa_ref)
    return dict(target_match=distance_to_recognizer, mfa_match=distance_to_mfa)


In [None]:
def score_alignment(word, recognizer_ipa, mfa_ipa, f1, f2,
                    recognizer_ref_formants, mfa_ref_formants,
                    audio_frames, ipa_embeddings):
    """Score one word's recognizer output against its MFA alignment.

    Combines three measurements: the token edit distance between the two IPA
    strings, the alignment cost between ``audio_frames`` and
    ``ipa_embeddings``, and the distances from the measured formants
    ``(f1, f2)`` to each reference point.

    The flag is chosen in priority order:
      1. ``"discrepant_vowel"`` if the formants are strictly closer to the
         MFA reference than to the recognizer reference;
      2. ``"high_edit_distance"`` if the edit distance exceeds 2;
      3. ``"confident"`` otherwise.

    Returns:
        A dict with keys ``word``, ``recognizer_ipa``, ``mfa_ipa``,
        ``edit_distance`` (int), ``soft_dtw_cost`` (float),
        ``formant_agreement`` (the formant-distance dict) and ``flag``.
    """
    token_edits = levenshtein_distance(recognizer_ipa, mfa_ipa)
    alignment_cost = soft_dtw_cost(audio_frames, ipa_embeddings)
    formant_distances = compare_vowel_formants(
        f1, f2, recognizer_ref_formants, mfa_ref_formants
    )

    mfa_is_closer = (
        formant_distances["mfa_match"] < formant_distances["target_match"]
    )
    if mfa_is_closer:
        flag = "discrepant_vowel"
    elif token_edits > 2:
        flag = "high_edit_distance"
    else:
        flag = "confident"

    return dict(
        word=word,
        recognizer_ipa=recognizer_ipa,
        mfa_ipa=mfa_ipa,
        edit_distance=int(token_edits),
        soft_dtw_cost=float(alignment_cost),
        formant_agreement=formant_distances,
        flag=flag,
    )


In [None]:
def build_scoring_dataframe(words, recognizer_ipas, mfa_ipas, formant_pairs,
                            recognizer_refs, mfa_refs, audio_frame_list, ipa_embedding_list):
    """Score a batch of words and collect the results in a DataFrame.

    All eight arguments are parallel sequences: position ``i`` of each one
    describes the same word. Each position is passed to ``score_alignment``.

    Args:
        words: Word labels.
        recognizer_ipas: Recognizer IPA strings.
        mfa_ipas: MFA IPA strings.
        formant_pairs: Measured ``(f1, f2)`` pairs.
        recognizer_refs: Reference formant points for the recognizer vowels.
        mfa_refs: Reference formant points for the MFA vowels.
        audio_frame_list: Per-word audio frame arrays.
        ipa_embedding_list: Per-word IPA embedding arrays.

    Returns:
        A ``pandas.DataFrame`` with one row per word, built from the dicts
        returned by ``score_alignment``.

    Raises:
        ValueError: If the inputs do not all have the same length.
    """
    inputs = {
        "words": words,
        "recognizer_ipas": recognizer_ipas,
        "mfa_ipas": mfa_ipas,
        "formant_pairs": formant_pairs,
        "recognizer_refs": recognizer_refs,
        "mfa_refs": mfa_refs,
        "audio_frame_list": audio_frame_list,
        "ipa_embedding_list": ipa_embedding_list,
    }
    # Validate up front: misaligned inputs would otherwise fail part-way
    # through the batch or silently drop trailing items.
    lengths = {name: len(values) for name, values in inputs.items()}
    if len(set(lengths.values())) > 1:
        raise ValueError(
            f"All inputs must have the same length, got {lengths}"
        )

    records = [
        score_alignment(
            word=word,
            recognizer_ipa=recognizer_ipa,
            mfa_ipa=mfa_ipa,
            f1=formants[0],
            f2=formants[1],
            recognizer_ref_formants=recognizer_ref,
            mfa_ref_formants=mfa_ref,
            audio_frames=frames,
            ipa_embeddings=embeddings,
        )
        for (word, recognizer_ipa, mfa_ipa, formants,
             recognizer_ref, mfa_ref, frames, embeddings) in zip(
            words, recognizer_ipas, mfa_ipas, formant_pairs,
            recognizer_refs, mfa_refs, audio_frame_list, ipa_embedding_list,
        )
    ]
    return pd.DataFrame.from_records(records)


In [None]:
def save_textgrid(word, recognizer_ipa, mfa_ipa, flag, duration, output_dir="textgrids"):
    """Write a three-tier TextGrid for one word and return its path.

    The grid spans ``0`` to ``duration`` and holds the tiers ``Recognizer``,
    ``MFA`` and ``Flag``, in that order. Each tier contains a single interval
    covering the whole span, labelled with ``recognizer_ipa``, ``mfa_ipa``
    and ``flag`` respectively.

    Args:
        word: Used as the file stem of the output file.
        recognizer_ipa: Label for the ``Recognizer`` tier.
        mfa_ipa: Label for the ``MFA`` tier.
        flag: Label for the ``Flag`` tier.
        duration: End time of the grid and of every interval.
        output_dir: Target directory; created if it does not exist.

    Returns:
        The path of the written ``<word>.TextGrid`` file.
    """
    grid = TextGrid(minTime=0, maxTime=duration)

    tier_marks = (
        ("Recognizer", recognizer_ipa),
        ("MFA", mfa_ipa),
        ("Flag", flag),
    )
    for tier_name, mark in tier_marks:
        tier = IntervalTier(name=tier_name, minTime=0, maxTime=duration)
        tier.add(0, duration, mark)
        grid.append(tier)

    os.makedirs(output_dir, exist_ok=True)
    destination = os.path.join(output_dir, f"{word}.TextGrid")
    grid.write(destination)
    return destination


In [None]:
def plot_formant_scatter(formants, refs, labels):
    """Show a scatter plot of measured formants against reference formants.

    F2 is drawn on the x-axis and F1 on an inverted y-axis. Measured points
    are blue dots annotated with their label; reference points are orange
    crosses.

    Args:
        formants: Iterable of measured ``(f1, f2)`` pairs.
        refs: Iterable of reference ``(f1, f2)`` pairs.
        labels: Text labels; label ``i`` is placed at measured point ``i``.
    """
    measured_f1, measured_f2 = zip(*formants)
    reference_f1, reference_f2 = zip(*refs)

    plt.figure(figsize=(6, 5))
    plt.scatter(measured_f2, measured_f1, label="Measured", color="blue")
    plt.scatter(reference_f2, reference_f1, label="Reference", color="orange", marker='x')

    # Annotate each measured point with its label.
    for index, text in enumerate(labels):
        plt.text(measured_f2[index], measured_f1[index], text, fontsize=9)

    # Flip the F1 axis after the data has been drawn.
    plt.gca().invert_yaxis()

    plt.title("Formant Comparison")
    plt.xlabel("F2")
    plt.ylabel("F1")
    plt.legend()
    plt.grid(True)
    plt.show()
