In [1]:
# Install the required packages (only needs to be run once)
# %pip install pydub praatio matplotlib g2pk
# !brew install ffmpeg  # required on macOS (Linux: sudo apt install ffmpeg)

In [2]:
import os
import shutil
import subprocess
from pydub import AudioSegment
from praatio import textgrid
import matplotlib.pyplot as plt

# === 1. Path configuration ===
# Everything is resolved relative to the directory the kernel was started in.
base_dir = os.getcwd()
m4a_dir = os.path.join(base_dir, "m4a")  # original .m4a recordings
wav_dir = os.path.join(base_dir, "wav")  # converted WAV files
# Same folder as wav_dir: the transcripts (.txt) sit alongside the audio.
txt_dir = os.path.join(base_dir, "wav")
lexicon_path = os.path.join(base_dir, "lexicon.txt")

# Pretrained acoustic model. It is looked up under the MFA root directory
# instead of a hard-coded /Users/<name>/... path so the notebook also runs on
# other machines: MFA_ROOT_DIR is used when set, otherwise ~/Documents/MFA.
# Alternative (model stored next to the notebook):
# model_path = os.path.join(base_dir, "korean_model.zip")
mfa_root_dir = os.environ.get("MFA_ROOT_DIR") or os.path.expanduser(
    os.path.join("~", "Documents", "MFA")
)
model_path = os.path.join(
    mfa_root_dir, "pretrained_models", "acoustic", "korean_mfa.zip"
)
output_dir = os.path.join(base_dir, "aligned")

# exist_ok=True makes this safe to re-run (and harmless for the duplicate
# wav_dir / txt_dir path).
for directory in (wav_dir, txt_dir, output_dir):
    os.makedirs(directory, exist_ok=True)

In [3]:
def convert_m4a_to_wav(m4a_path, wav_path, target_rate=16000):
    """Convert one .m4a recording into a mono WAV file.

    Args:
        m4a_path: Path of the input file; it is opened with format="m4a".
        wav_path: Path the WAV file is exported to.
        target_rate: Frame rate in Hz set on the audio before export
            (default 16000).
    """
    audio = (
        AudioSegment.from_file(m4a_path, format="m4a")
        .set_channels(1)  # single channel (mono)
        .set_frame_rate(target_rate)
    )
    audio.export(wav_path, format="wav")
    print(f"✅ 변환 완료: {wav_path}")


# Run the conversion for every .m4a file in the m4a folder
for fname in os.listdir(m4a_dir):
    if not fname.endswith(".m4a"):
        continue  # skip anything that is not an .m4a recording
    name = os.path.splitext(fname)[0]
    m4a_path, wav_path = (
        os.path.join(m4a_dir, fname),
        os.path.join(wav_dir, name + ".wav"),
    )
    convert_m4a_to_wav(m4a_path, wav_path)

✅ 변환 완료: /Users/jlee/JDrvie/Dev/Koach/Trash/wav/native.wav
✅ 변환 완료: /Users/jlee/JDrvie/Dev/Koach/Trash/wav/learner.wav


In [5]:
from g2pk import G2p


def generate_lexicon_with_g2pk(script_path, lexicon_path):
    """Write a tab-separated pronunciation lexicon for a transcript using g2pk.

    The UTF-8 text at ``script_path`` is split on whitespace; each unique
    token (in sorted order) becomes one line of ``lexicon_path`` consisting of
    the token, a tab, and the characters of ``G2p()``'s output for that token
    separated by single spaces.

    Args:
        script_path: Path of the transcript text file to read.
        lexicon_path: Path of the lexicon file to create or overwrite.
    """
    converter = G2p()

    with open(script_path, "r", encoding="utf-8") as script_file:
        tokens = script_file.read().strip().split()
    vocabulary = sorted(set(tokens))

    def lexicon_line(token):
        # Pronounced form with spaces removed, then one symbol per character.
        # NOTE(review): this splits per Unicode character, not per jamo;
        # confirm these symbols match the phone set the acoustic model expects.
        spoken = converter(token).replace(" ", "")
        phonemes = " ".join(spoken)
        return f"{token}\t{phonemes}\n"

    with open(lexicon_path, "w", encoding="utf-8") as lexicon_file:
        # Generator keeps conversion and writing interleaved, token by token.
        lexicon_file.writelines(lexicon_line(token) for token in vocabulary)

    print("✅ 정확한 lexicon.txt 생성 완료 (g2pk 사용)")


# Build the lexicon from the learner transcript. The configured paths are used
# instead of cwd-relative string literals so that the file written here is the
# same lexicon_path that is later handed to the aligner.
generate_lexicon_with_g2pk(os.path.join(txt_dir, "learner.txt"), lexicon_path)

✅ 정확한 lexicon.txt 생성 완료 (g2pk 사용)


In [6]:
import subprocess
import shutil
import os


def run_mfa_alignment(wav_dir, lexicon_path, model_path, output_dir):
    """Run ``mfa align`` and print the outcome.

    An existing ``output_dir`` is deleted before the command is started. The
    command's stdout and stderr are captured as text: stdout is printed on
    success; on a non-zero exit status both streams are printed and the
    function returns normally instead of raising.

    Args:
        wav_dir: Corpus directory passed as the first positional argument.
        lexicon_path: Pronunciation dictionary path.
        model_path: Acoustic model path.
        output_dir: Directory the alignment output is written to.

    Raises:
        EnvironmentError: If no ``mfa`` executable can be found.
    """
    mfa_executable = shutil.which("mfa")
    if not mfa_executable:
        raise EnvironmentError(
            "❌ MFA 명령어를 찾을 수 없습니다. 설치 여부를 확인하세요."
        )

    # Start from a clean slate: drop results of any previous run.
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
        print(f"🧹 기존 정렬 결과 삭제: {output_dir}")

    positional_args = [wav_dir, lexicon_path, model_path, output_dir]
    command = ["mfa", "align", *positional_args, "--clean", "-o"]

    print("🚀 MFA 정렬 시작...\n" + " ".join(command))
    try:
        completed = subprocess.run(
            command,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except subprocess.CalledProcessError as err:
        # Report the failure together with everything the tool printed.
        for line in ("❌ 정렬 실패!", "🔻 STDOUT:", err.stdout, "🔻 STDERR:", err.stderr):
            print(line)
    else:
        print("✅ 정렬 완료!")
        print(completed.stdout)


# Run the alignment with the paths from the configuration cell
run_mfa_alignment(
    wav_dir=wav_dir,
    lexicon_path=lexicon_path,
    model_path=model_path,
    output_dir=output_dir,
)

🧹 기존 정렬 결과 삭제: /Users/jlee/JDrvie/Dev/Koach/Trash/aligned
🚀 MFA 정렬 시작...
mfa align /Users/jlee/JDrvie/Dev/Koach/Trash/wav /Users/jlee/JDrvie/Dev/Koach/Trash/lexicon.txt /Users/jlee/Documents/MFA/pretrained_models/acoustic/korean_mfa.zip /Users/jlee/JDrvie/Dev/Koach/Trash/aligned --clean -o


KeyboardInterrupt: 

In [None]:
def load_textgrid(file_path):
    """Open a TextGrid file with praatio.

    Args:
        file_path: Path of the ``.TextGrid`` file to read.

    Returns:
        The object returned by ``textgrid.openTextgrid``; empty intervals are
        kept (``includeEmptyIntervals=True``).
    """
    # The notebook imports `textgrid` from praatio; the previously used name
    # `tgio` is never imported, so the call failed with NameError.
    return textgrid.openTextgrid(file_path, includeEmptyIntervals=True)


def plot_word_tiers(tg, speaker_label, tier_name="word"):
    """Draw the labelled intervals of one tier as a horizontal timeline.

    Each interval with a non-blank label becomes a bar from its start to its
    end time, with the label centred on the bar.

    Args:
        tg: TextGrid object providing ``getTier(name)``.
        speaker_label: Text used in the figure title.
        tier_name: Name of the tier to plot (default "word").
    """
    # praatio 6 API: tiers are fetched with getTier() (tierDict is no longer
    # public) and each entry is an Interval(start, end, label).
    tier = tg.getTier(tier_name)

    words = []
    start_times = []
    durations = []
    for interval in tier.entries:
        if interval.label.strip() == "":
            continue  # skip silences / empty intervals
        words.append(interval.label)
        start_times.append(interval.start)
        durations.append(interval.end - interval.start)

    fig, ax = plt.subplots(figsize=(10, 1.5))
    ax.barh(
        [0] * len(words),
        durations,
        left=start_times,
        height=0.4,
        align="center",
        color="skyblue",
    )
    for word, start, duration in zip(words, start_times, durations):
        ax.text(
            start + duration / 2,
            0,
            word,
            ha="center",
            va="center",
            fontsize=10,
        )

    ax.set_title(f"{speaker_label} 발화 - 단어 정렬")
    ax.set_xlabel("시간 (초)")
    ax.set_yticks([])
    # default=0.0 keeps an all-silence tier from crashing max() on an empty
    # sequence; the axis then simply spans 0..0.5 s.
    last_end = max(
        (start + duration for start, duration in zip(start_times, durations)),
        default=0.0,
    )
    ax.set_xlim(0, last_end + 0.5)
    plt.tight_layout()
    plt.show()

In [None]:
def analyze_word_timings(native_tg, learner_tg, tier_name="word"):
    """Print a per-word duration comparison between two TextGrids.

    For every non-blank interval of the native tier one row is printed with
    the native duration, the learner duration of the matching word and the
    difference (learner - native). Words are matched by label and occurrence
    number, so the second "a" of the native speaker is compared with the
    second "a" of the learner. If the learner has no matching occurrence the
    native duration is still shown and the other columns are placeholders.

    Args:
        native_tg: Reference TextGrid object providing ``getTier(name)``.
        learner_tg: TextGrid object to compare against the reference.
        tier_name: Name of the tier to read from both grids (default "word").
    """

    def durations_by_occurrence(tg):
        # Key every duration by (label, n-th occurrence) so that a word spoken
        # more than once keeps all of its durations instead of only the last.
        seen = {}
        durations = {}
        # praatio 6 API: getTier() + Interval(start, end, label) entries.
        for interval in tg.getTier(tier_name).entries:
            label = interval.label
            if label.strip() == "":
                continue  # skip silences / empty intervals
            nth = seen.get(label, 0)
            seen[label] = nth + 1
            durations[(label, nth)] = interval.end - interval.start
        return durations

    native_map = durations_by_occurrence(native_tg)
    learner_map = durations_by_occurrence(learner_tg)

    print("\n📊 단어별 발화 시간 비교:")
    print(
        "{:<10} {:>10} {:>10} {:>10}".format(
            "단어", "원어민(s)", "학습자(s)", "차이(s)"
        )
    )
    print("-" * 40)
    for key, native_dur in native_map.items():
        word = key[0]
        learner_dur = learner_map.get(key)
        if learner_dur is not None:
            diff = learner_dur - native_dur
            print(f"{word:<10} {native_dur:>10.2f} {learner_dur:>10.2f} {diff:>+10.2f}")
        else:
            # Only the learner side is missing: keep the known native duration
            # in its column rather than printing a placeholder there too.
            print(f"{word:<10} {native_dur:>10.2f} {'(없음)':>10} {'(불가)':>10}")

In [None]:
# Load the TextGrids from the alignment output directory
learner_tg_path = os.path.join(output_dir, "learner.TextGrid")
native_tg_path = os.path.join(output_dir, "native.TextGrid")

learner_tg = load_textgrid(learner_tg_path)
native_tg = load_textgrid(native_tg_path)

# MFA names its word-level tier "words", whereas the helper functions default
# to "word"; pass the tier name explicitly so the lookup does not fail.
WORD_TIER = "words"

# Visualisation
plot_word_tiers(native_tg, "원어민", tier_name=WORD_TIER)
plot_word_tiers(learner_tg, "학습자", tier_name=WORD_TIER)

# Duration comparison
analyze_word_timings(native_tg, learner_tg, tier_name=WORD_TIER)