In [1]:
%pip install jiwer

Collecting jiwer
  Using cached jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting click>=8.1.8 (from jiwer)
  Downloading click-8.2.0-py3-none-any.whl.metadata (2.5 kB)
Using cached jiwer-3.1.0-py3-none-any.whl (22 kB)
Downloading click-8.2.0-py3-none-any.whl (102 kB)
Installing collected packages: click, jiwer
  Attempting uninstall: click
    Found existing installation: click 8.0.4
    Uninstalling click-8.0.4:
      Successfully uninstalled click-8.0.4
Successfully installed click-8.2.0 jiwer-3.1.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import jiwer
from jiwer import cer, wer, Compose, RemoveWhiteSpace
import os
import re

In [None]:
# Load both STT result tables and stack them into a single frame with a
# fresh 0..n-1 index.
df = pd.read_csv('stt_results.csv')
dfs = pd.read_csv('stt_whisper_results.csv')
data_list = pd.concat((df, dfs), ignore_index=True)

# Combined row count of the two CSV files, then the engine column.
print(data_list.shape[0])
print(data_list["engine"])

# Previously exported CER results (this file is written by a later cell of
# this notebook, so it must already exist on disk when this cell runs).
gdfs = pd.read_csv('stt_cer_results.csv')

137
0        CLOVA
1        CLOVA
2        CLOVA
3        CLOVA
4        CLOVA
        ...   
132    Whisper
133    Whisper
134    Whisper
135    Whisper
136    Whisper
Name: engine, Length: 137, dtype: object


In [24]:
def colculate_cer(ref, hyp_txt):
    """Return the score of jiwer's ``cer`` for a reference/hypothesis pair.

    Args:
        ref: Reference (ground-truth) text, passed as the first argument.
        hyp_txt: Hypothesis (recognised) text, passed as the second argument.

    Returns:
        Whatever ``cer(ref, hyp_txt)`` returns.
    """
    return cer(ref, hyp_txt)

In [19]:
# Patterns for the annotation marks found in the transcript scripts.
# "(A)/(B)" dual notation; group 1 captures A.
p_double = re.compile(r'\(([^)]+)\)/\([^)]+\)')
# "@" followed by a Hangul run and optional digits; group 1 captures the Hangul.
p_mention = re.compile(r'@([가-힣]+)\d*')
# "/(" followed by non-")" characters and any number of closing parens.
p_slash = re.compile(r'/\(([^)]+)\)*')
# Runs of "(" and ")" respectively. Both patterns can also match the empty
# string, which is harmless when substituting with ''.
p_del1 = re.compile(r'\(*')
p_del2 = re.compile(r'\)*')
# A number followed by ":" and optional whitespace, e.g. "4: " or "10:".
p_speaker = re.compile(r'\b\d+:\s*')
# Any run of whitespace.
p_space = re.compile(r'\s+')


def clean_line(line: str) -> str:
    """Strip transcript annotations from one line and normalise its spacing.

    The substitutions run in a fixed order:

    1. ``(A)/(B)`` is reduced to ``A``.
    2. ``@name12`` is reduced to ``name`` (the "@" and digits are dropped,
       the Hangul text is kept).
    3. ``/(...)`` fragments are removed.
    4. Any remaining "(" and ")" characters are removed.
    5. Speaker labels such as ``4:`` are removed.
    6. Whitespace runs collapse to one space and the ends are trimmed.

    Args:
        line: One raw script line.

    Returns:
        The cleaned line; may be an empty string.
    """
    # Built at call time so the module-level patterns are looked up late,
    # exactly as a sequence of direct ``p_x.sub(...)`` calls would do.
    steps = (
        (p_double, r'\1'),
        (p_mention, r'\1'),
        (p_slash, ''),
        (p_del1, ''),
        (p_del2, ''),
        (p_speaker, ''),
        (p_space, ' '),
    )
    for pattern, replacement in steps:
        line = pattern.sub(replacement, line)
    return line.strip()


def clean_text(infile: str = "text.txt", outfile: str = "text.txt"):
    """Clean every non-blank line of a script file and write the result.

    Each non-blank line of ``infile`` is passed through ``clean_line``;
    lines that come back empty are dropped. Every kept line is printed and
    then written to ``outfile`` followed by a newline.

    The input is read completely and closed before the output is opened,
    so ``infile`` and ``outfile`` may be the same path (in-place cleaning).

    Earlier revisions overwrote both arguments with ``"text.txt"``, which
    silently ignored the caller's paths. The arguments are now honoured;
    ``"text.txt"`` remains the default for both.

    Args:
        infile: Path of the UTF-8 text file to read.
        outfile: Path of the UTF-8 text file to write (overwritten).
    """
    with open(infile, 'r', encoding='utf-8') as fin:
        raws = [line.rstrip('\n') for line in fin if line.strip()]

    rst = []
    for s in raws:
        out = clean_line(s)
        if out:
            rst.append(out + '\n')
            print(out)

    with open(outfile, 'w', encoding='utf-8') as fout:
        fout.writelines(rst)

In [None]:

def process_one_(script_dir, basename, engine):
    """
    Build the evaluation record for one (script file, engine) pair.

    The reference text is read from ``<script_dir>/<basename>.txt``; the
    hypothesis transcript, WER, STT time and cost are looked up in the
    module-level ``data_list`` frame (first row whose "basename" and
    "engine" match). CER is computed here with ``colculate_cer``; WER is
    taken from ``data_list`` as-is, not recomputed.

    Args:
        script_dir: Folder holding the ground-truth script (.txt) files.
            Its last path component is reported as the domain.
        basename: File name without extension, shared by the script file
            and the "basename" column of ``data_list``.
        engine: Engine name, matched against the "engine" column.

    Returns:
        dict with keys 'basename', 'domain', 'engine', 'stt_time_sec',
        'cost', 'wer' and 'cer' ('wer' and 'cer' rounded to 3 decimals).

    Raises:
        OSError: if the script file cannot be opened.
        IndexError: if ``data_list`` has no row for (basename, engine).
    """
    domain = os.path.basename(script_dir)

    # Ground-truth script: keep non-blank lines, stripped.
    txt_path = os.path.join(script_dir, f"{basename}.txt")
    with open(txt_path, encoding='utf-8') as f:
        lines = [line.strip() for line in f if line.strip()]

    # Filter once and reuse; every field below comes from the same first
    # matching row.
    matches = data_list[
        (data_list["basename"] == basename) & (data_list["engine"] == engine)
    ]
    if matches.empty:
        # Same exception type an out-of-range .iloc[0] raises, but naming
        # the pair that is missing.
        raise IndexError(
            f"no STT result in data_list for basename={basename!r}, engine={engine!r}"
        )

    # Join the reference into one string and flatten newlines in the
    # hypothesis before scoring.
    ref = " ".join(lines)
    hyp_txt = matches["transcript"].iloc[0].replace('\n', ' ')
    file_cer = colculate_cer(ref, hyp_txt)

    file_wer = matches["wer"].iloc[0]
    stt_time_sec = matches["stt_time_sec"].iloc[0]
    cost = matches["cost_usd"].iloc[0]

    return {
        'basename':      basename,
        'domain':        domain,
        'engine':        engine,
        'stt_time_sec': stt_time_sec,
        'cost' : cost,
        'wer':           round(file_wer, 3),
        'cer':  round(file_cer, 3)
    }

In [21]:
root_audio_dir = './VS1/data'
root_script_dir = './VS1/data'

# Each immediate sub-directory of the script root counts as one domain
# (order is whatever os.listdir returns).
domains = []
for d in os.listdir(root_script_dir):
    if os.path.isdir(os.path.join(root_script_dir, d)):
        domains.append(d)

# Engine names to evaluate.
engines = ["Whisper", "CLOVA", "Google STT", "AWS", "Azure"]

print(domains)

['경제', '교육', '사회', '생활', '세계', '스포츠', '연예', '의료', '정치']


In [29]:
import glob
# One record per (script file, engine) pair, across every domain folder.
results = []

for domain in domains:
    script_dir = os.path.join(root_script_dir, domain)

    # File names without extension for every .txt script in this domain.
    basenames = []
    for p in glob.glob(os.path.join(script_dir, '*.txt')):
        stem, _ext = os.path.splitext(os.path.basename(p))
        basenames.append(stem)

    for b in basenames:
        results.extend(process_one_(script_dir, b, eng) for eng in engines)


In [30]:
# Turn the collected per-file records into a frame and save them to their
# own CSV (no index column, UTF-8 with BOM).
gdf = pd.DataFrame(results)
gdf.to_csv('stt_cer_results.csv', index=False, encoding='utf-8-sig')

In [None]:

def plot_stt_summary_wer(summary_df):
    """
    Show a per-engine summary table and a WER / time / cost comparison chart.

    WER is drawn as bars on the left axis; processing time and cost are
    drawn as lines on a twin right axis. The table is shown with
    ``display`` before plotting.

    Args:
        summary_df: Frame with columns "engine", "wer", "stt_time_sec" and
            "cost_usd". The caller's frame is not modified; a copy is used.

    Notes:
        - Uses ``plt``, ``fm`` and ``display`` from the surrounding
          namespace. NOTE(review): no visible cell of this notebook imports
          ``plt`` or ``fm`` (presumably ``matplotlib.pyplot`` and
          ``matplotlib.font_manager``) -- confirm they are imported before
          this cell runs.
        - The legend uses the bar *container* as its handle. With a string
          ``label=``, the label is attached to the container rather than
          to the individual bar patches, so using ``bar_container[0]``
          dropped the "wer" entry from the legend.
    """
    # Korean font auto-setup: use the first system TTF whose path contains
    # "SUITE" or "Malgun", if any.
    font_path = ""
    font_list = fm.findSystemFonts(fontpaths=None, fontext='ttf')
    for font in font_list:
        if "SUITE" in font or "Malgun" in font:
            font_path = font
            break
    if font_path:
        font_name = fm.FontProperties(fname=font_path).get_name()
        plt.rcParams["font.family"] = font_name
        plt.rcParams["axes.unicode_minus"] = False
    print(font_path)

    # Unit clean-up on a copy. cost_usd is scaled by a fixed factor of 1000
    # and labelled as won below.
    # NOTE(review): this is not an exchange-rate conversion -- confirm the
    # intended unit.
    summary_df = summary_df.copy()
    summary_df["wer"] = summary_df["wer"].round(4)
    summary_df["cost_usd"] = (summary_df["cost_usd"] * 1000).round(2)

    # Summary table
    print("📋 STT 시스템별 wer / 처리 시간 / 비용 요약 (단위: %, 초, 원)")
    display(summary_df[["engine", "wer", "stt_time_sec", "cost_usd"]])

    # Chart
    fig, ax1 = plt.subplots(figsize=(12, 6))

    # WER (bars, left axis) with 20% headroom above the tallest bar
    bar_container = ax1.bar(summary_df["engine"], summary_df["wer"],
                            color="salmon", label="wer")
    ax1.set_ylabel("wer", color="salmon")
    ax1.tick_params(axis='y', labelcolor="salmon")
    ax1.set_ylim(0, max(summary_df["wer"]) * 1.2)

    # Processing time / cost (lines, right axis)
    ax2 = ax1.twinx()
    line1 = ax2.plot(summary_df["engine"], summary_df["stt_time_sec"],
                     color="blue", marker="o", label="처리 시간 (초)")
    line2 = ax2.plot(summary_df["engine"], summary_df["cost_usd"],
                     color="green", marker="o", label="비용 (원)")
    ax2.set_ylabel("처리 시간 / 비용", color="black")
    ax2.tick_params(axis='y')

    # Merge both axes' entries into one legend. The container (not its
    # first patch) carries the "wer" label.
    handles = [bar_container] + line1 + line2
    labels = [handle.get_label() for handle in handles]
    ax1.legend(handles, labels, loc="upper center")

    plt.title("STT 시스템별 성능 비교 (wer + 처리 시간 + 비용)")
    plt.grid(axis="y")
    plt.tight_layout()
    plt.show()


# Per-engine means of processing time, WER and cost (rounded to 4
# decimals), then plot the summary.
df_results = pd.DataFrame(data_list)
summary_df = (
    df_results
    .groupby(["engine"])
    .agg({"stt_time_sec": "mean", "wer": "mean", "cost_usd": "mean"})
    .reset_index()
    .round(4)
)

plot_stt_summary_wer(summary_df)