In [None]:
import os
import unicodedata

import google.genai as genai
import pandas as pd

# PROJECT_ID = '$YOUR_PROJECT_ID'

# Read the Gemini API key from the GEMINI_API_KEY environment variable so the
# secret does not have to be pasted into (and saved with) the notebook.
# The original placeholder is kept as the fallback, so replacing it by hand
# still works exactly as before when the variable is not set.
client = genai.Client(
    vertexai=False,
    api_key=os.environ.get("GEMINI_API_KEY", "$YOUR_GEMINI_API_KEY"),
)

In [None]:
import evaluate
# Load the word-error-rate and character-error-rate metrics (in that order)
# through `evaluate.load`; both objects are reused by every scoring cell below.
wer_metric, cer_metric = (evaluate.load(metric_name) for metric_name in ("wer", "cer"))

In [None]:
# Read the three test sets (Korean, English, Mandarin) from CSV files in the
# working directory. Later cells read the 'audio' and 'target' columns.
df_test_kr, df_test_en, df_test_cmn = map(
    pd.read_csv,
    ('./test-kr.csv', './test-en.csv', './test-cmn.csv'),
)

### Test Korean STT

In [None]:
# Baseline prompt written in English: transcript only, no extra text.
initial_prompt = 'Generate a transcript of the speech. Only include the transcript in your response, and do not provide any other answer'
# Baseline prompt written in Korean: convert the given audio file to text and
# add nothing else to the response.
initial_prompt_korean = '주어진 음성 파일을 텍스트로 변환하세요. 응답에는 다른 어떠한 내용도 추가하지 마십시오.'

# exact_match -- NOTE(review): presumably the metric this prompt was optimized
# against; confirm with the prompt-optimization run.
# Optimized Korean prompt. It instructs the model to:
#   1. emit no punctuation marks,
#   2. write numbers in the form they were pronounced,
#   3. follow the vocabulary, sentence structure and word spacing of the
#      speech as faithfully as possible (compound-word spacing in particular),
# and to answer with the transcript only.
optimized_prompt = '''음성 내용을 텍스트로 변환하세요. 다음 지침을 엄격히 준수하여 응답을 생성하십시오:
1. 텍스트 변환 내용에는 어떠한 문장 부호(마침표, 쉼표 등)도 포함하지 마십시오.
2. 숫자는 원본 음성에서 발음된 형태를 그대로 따르십시오. (예: '이백만'이 발음되면 '이백만'으로, '열여덟'이 발음되면 '열여덟'으로, '8명'이 발음되면 '8명'으로 표기)
3. 어휘, 문장 구성, 그리고 단어 간 띄어쓰기는 원본 음성의 표현 방식을 최대한 충실하고 정확하게 따르십시오. 특히, 복합어의 띄어쓰기에 유의하십시오.
응답에는 텍스트 변환 내용만 포함하고, 다른 답변은 제공하지 마세요.'''

In [None]:
# Pre-create one None-filled result column per prompt variant; the
# transcription loop fills them in row by row.
for result_col in ('initial_result', 'initial_korean_result', 'optimized_result'):
    df_test_kr[result_col] = None

In [None]:
# Transcribe every Korean clip once per prompt variant and store the returned
# text in the matching result column of df_test_kr.
prompt_by_column = {
    'initial_result': initial_prompt,
    'initial_korean_result': initial_prompt_korean,
    'optimized_result': optimized_prompt,
}

# Iterate over (index label, audio path) pairs so the .loc write below always
# targets the row the path came from, whatever index the frame carries.
for idx, audio_path in df_test_kr['audio'].items():
    # Upload once per clip; the uploaded file is reused for every prompt.
    myfile = client.files.upload(file=audio_path)

    for column, prompt in prompt_by_column.items():
        response = client.models.generate_content(
            model='gemini-2.5-flash',
            contents=[prompt, myfile],
        )
        # response.text can be None when the response carries no text part.
        # Store '' instead so the string clean-up and WER/CER cells that
        # follow do not crash on a None entry (an empty hypothesis is scored
        # as a fully missed transcript).
        df_test_kr.loc[idx, column] = response.text or ''

In [None]:
# Score the raw model outputs against the raw 'target' transcripts.
col_list = ['initial_result', 'initial_korean_result', 'optimized_result']
references = df_test_kr['target']

for col in col_list:
    predictions = df_test_kr[col]
    # WER first, then CER -- same evaluation order as the printed lines.
    wer_score, cer_score = (
        metric.compute(predictions=predictions, references=references)
        for metric in (wer_metric, cer_metric)
    )

    print(f"{col} WER : {wer_score:.4f}")
    print(f"{col} CER : {cer_score:.4f}")

In [None]:
import string

def remove_punctuation(text):
    """Return ``text`` with every punctuation character removed.

    Two groups of characters are dropped:

    * everything in ``string.punctuation`` (ASCII punctuation and symbols), and
    * every character whose Unicode general category starts with ``P``, such
      as the full-width comma and period used in Chinese text, curly quotes
      or the ellipsis character. ``string.punctuation`` alone leaves these in
      place, so non-English transcripts were not actually punctuation-free.

    Args:
        text (str): Text to clean.

    Returns:
        str: ``text`` without punctuation. All other characters, whitespace
        included, are kept in their original order.
    """
    ascii_punctuation = set(string.punctuation)

    # Filter character by character. The explicit ASCII set keeps symbols such
    # as '$' or '+' being removed (their Unicode category is S*, not P*); the
    # category check covers punctuation outside ASCII.
    return ''.join(
        ch for ch in text
        if ch not in ascii_punctuation and not unicodedata.category(ch).startswith('P')
    )


# Build a punctuation-free copy of each prediction column and of the target.
# Mapping: new column name -> source column name.
no_punc_sources = {
    'initial_result_no_punc': 'initial_result',
    'initial_korean_result_no_punc': 'initial_korean_result',
    'optimized_result_no_punc': 'optimized_result',
    'target_no_punc': 'target',
}

for new_col, src_col in no_punc_sources.items():
    df_test_kr[new_col] = df_test_kr[src_col].apply(remove_punctuation)

In [None]:
# Score the punctuation-free predictions against the punctuation-free target.
col_list = ['initial_result_no_punc', 'initial_korean_result_no_punc', 'optimized_result_no_punc']
references = df_test_kr['target_no_punc']

for col in col_list:
    predictions = df_test_kr[col]
    # WER first, then CER -- same evaluation order as the printed lines.
    wer_score, cer_score = (
        metric.compute(predictions=predictions, references=references)
        for metric in (wer_metric, cer_metric)
    )

    print(f"{col} WER : {wer_score:.4f}")
    print(f"{col} CER : {cer_score:.4f}")

In [None]:
# df_test_kr['initial_wer'] = df_test_kr.apply(lambda x: wer_metric.compute(predictions=[x.initial_result_no_punc],references=[x.target_no_punc]) ,axis=1)
# df_test_kr['initial_cer'] = df_test_kr.apply(lambda x: cer_metric.compute(predictions=[x.initial_result_no_punc],references=[x.target_no_punc]) ,axis=1)


# df_test_kr['optimized_wer'] = df_test_kr.apply(lambda x: wer_metric.compute(predictions=[x.optimized_result_no_punc],references=[x.target_no_punc]) ,axis=1)
# df_test_kr['optimized_cer'] = df_test_kr.apply(lambda x: cer_metric.compute(predictions=[x.optimized_result_no_punc],references=[x.target_no_punc]) ,axis=1)

# print(f"Optimized showed better WER : {df_test_kr[df_test_kr['initial_wer'] > df_test_kr['optimized_wer']].shape[0]}")
# print(f"Initial showed better WER : {df_test_kr[df_test_kr['initial_wer'] < df_test_kr['optimized_wer']].shape[0]}")
# print(f"Same WER : {df_test_kr[df_test_kr['initial_wer'] == df_test_kr['optimized_wer']].shape[0]}")

# print(f"Optimized showed better CER : {df_test_kr[df_test_kr['initial_cer'] > df_test_kr['optimized_cer']].shape[0]}")
# print(f"Initial showed better CER : {df_test_kr[df_test_kr['initial_cer'] < df_test_kr['optimized_cer']].shape[0]}")
# print(f"Same CER : {df_test_kr[df_test_kr['initial_cer'] == df_test_kr['optimized_cer']].shape[0]}")

In [None]:
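# NOTE: 'cer_diff' and 'wer_diff' are not computed for df_test_kr in the cells above;
# create them first (e.g. initial_* minus optimized_*) before enabling these lines.
# df_test_kr.sort_values(by='cer_diff', ascending=False)[:5]
# df_test_kr.sort_values(by='wer_diff', ascending=False)[:5]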


### Test English STT

In [None]:
# Baseline English prompt: transcript only, no extra text.
initial_prompt_en = 'Generate a transcript of the speech. Only include the transcript in your response, and do not provide any other answer'
# Optimized English prompt. It asks for an all-lowercase, verbatim transcript
# with correctly spelled proper nouns, numbers written as digits, and no
# punctuation except what is part of a word's spelling (apostrophes in
# contractions, hyphens in compound words).
optimized_prompt_en = "Generate a transcript of the speech. The transcript must be in all lowercase. The transcript must precisely reflect the spoken content, transcribing only the exact words spoken without adding, omitting, or substituting any words. Strive for accurate phonetic transcription and correct spelling of all words. It is paramount to correctly identify and transcribe proper nouns, ensuring their exact and accurate spelling based on common or established forms, even when they sound phonetically similar to common phrases or words. Numerical values, including years, must be transcribed as digits (e.g., '1767', not 'seventeen sixty-seven'). Retain punctuation only when it is an intrinsic part of a word's spelling, such as apostrophes in contractions or hyphens in compound words, especially for compound adjectives that modify a noun (e.g., 'full-iron'). Omit all other punctuation. Only include the transcript in your response, and do not provide any other answer."

In [None]:
# Pre-create one None-filled result column per prompt variant; the
# transcription loop fills them in row by row.
for result_col in ('initial_result', 'optimized_result'):
    df_test_en[result_col] = None

In [None]:
# Transcribe every English clip once per prompt variant and store the returned
# text in the matching result column of df_test_en.
prompt_by_column = {
    'initial_result': initial_prompt_en,
    'optimized_result': optimized_prompt_en,
}

# Iterate over (index label, audio path) pairs so the .loc write below always
# targets the row the path came from, whatever index the frame carries.
for idx, audio_path in df_test_en['audio'].items():
    # Upload once per clip; the uploaded file is reused for every prompt.
    myfile = client.files.upload(file=audio_path)

    for column, prompt in prompt_by_column.items():
        response = client.models.generate_content(
            model='gemini-2.5-flash',
            contents=[prompt, myfile],
        )
        # response.text can be None when the response carries no text part.
        # Store '' instead so the `.lower()` / punctuation clean-up and the
        # WER/CER cells that follow do not crash on a None entry.
        df_test_en.loc[idx, column] = response.text or ''

In [None]:
# Score the raw model outputs against the raw 'target' transcripts.
col_list = ['initial_result', 'optimized_result']
references = df_test_en['target']

for col in col_list:
    predictions = df_test_en[col]
    # WER first, then CER -- same evaluation order as the printed lines.
    wer_score, cer_score = (
        metric.compute(predictions=predictions, references=references)
        for metric in (wer_metric, cer_metric)
    )

    print(f"{col} WER : {wer_score:.4f}")
    print(f"{col} CER : {cer_score:.4f}")

In [None]:
# Normalize predictions and target the same way -- lowercase first, then strip
# punctuation -- and score the normalized columns.
def _lower_no_punc(text):
    """Lowercase ``text`` and pass it through ``remove_punctuation``."""
    return remove_punctuation(text.lower())


# Mapping: new column name -> source column name.
no_punc_sources = {
    'initial_result_no_punc': 'initial_result',
    'optimized_result_no_punc': 'optimized_result',
    'target_no_punc': 'target',
}
for new_col, src_col in no_punc_sources.items():
    df_test_en[new_col] = df_test_en[src_col].apply(_lower_no_punc)

col_list = ['initial_result_no_punc', 'optimized_result_no_punc']
references = df_test_en['target_no_punc']

for col in col_list:
    predictions = df_test_en[col]
    # WER first, then CER -- same evaluation order as the printed lines.
    wer_score, cer_score = (
        metric.compute(predictions=predictions, references=references)
        for metric in (wer_metric, cer_metric)
    )

    print(f"{col} WER : {wer_score:.4f}")
    print(f"{col} CER : {cer_score:.4f}")

In [None]:
# df_test_en['initial_wer'] = df_test_en.apply(lambda x: wer_metric.compute(predictions=[x.initial_result_no_punc],references=[x.target_no_punc]) ,axis=1)
# df_test_en['initial_cer'] = df_test_en.apply(lambda x: cer_metric.compute(predictions=[x.initial_result_no_punc],references=[x.target_no_punc]) ,axis=1)

# df_test_en['optimized_wer'] = df_test_en.apply(lambda x: wer_metric.compute(predictions=[x.optimized_result_no_punc],references=[x.target_no_punc]) ,axis=1)
# df_test_en['optimized_cer'] = df_test_en.apply(lambda x: cer_metric.compute(predictions=[x.optimized_result_no_punc],references=[x.target_no_punc]) ,axis=1)

# df_test_en['wer_diff'] = df_test_en['initial_wer'] - df_test_en['optimized_wer']
# df_test_en['cer_diff'] = df_test_en['initial_cer'] - df_test_en['optimized_cer']


### Test Chinese (Mandarin) STT

In [None]:
# Baseline English prompt with an added request to answer in Simplified Chinese.
# NOTE(review): this rebinds `initial_prompt`, the same name the Korean section
# defines and its transcription loop reads. After this cell has run, re-running
# the Korean loop would send this Mandarin-specific prompt unless the Korean
# prompt cell is executed again first.
initial_prompt = 'Generate a transcript of the speech. Only include the transcript in your response, and do not provide any other answer. Please answer in Simplified Chinese.'
# Baseline prompt written in Chinese: transcript only, no other answer, reply
# in Simplified Chinese.
initial_prompt_cmn = '生成演讲稿的文字记录。在您的回复中只包含文字记录，不要提供任何其他答案。请使用简体中文回答。'
# Optimized Chinese prompt. It asks for no punctuation, Arabic digits for
# numbers except where Chinese convention writes them as characters, exactly
# one space between every Chinese character, and the transcript only.
# NOTE(review): the per-character spacing presumably mirrors how the 'target'
# column is formatted -- confirm against the test CSV.
optimized_prompt_cmn = '请根据提供的音频标识符，生成对应的标准文本内容。转录文本应不包含任何标点符号。数字应优先使用阿拉伯数字（0-9）转录。但对于中文习惯中以汉字形式出现的数字（如序数词、量词或固定搭配），应保留其汉字形式。文本中，每个汉字之间（包括组成词语的汉字）都必须用一个空格隔开。您的回复中只包含符合此格式要求的转录文本，不要提供任何其他信息。'

In [None]:
# Pre-create one None-filled result column per prompt variant; the
# transcription loop fills them in row by row.
for result_col in ('initial_result', 'initial_mandarin_result', 'optimized_result'):
    df_test_cmn[result_col] = None

In [None]:
# Transcribe every Mandarin clip once per prompt variant and store the
# returned text in the matching result column of df_test_cmn.
prompt_by_column = {
    'initial_result': initial_prompt,
    'initial_mandarin_result': initial_prompt_cmn,
    'optimized_result': optimized_prompt_cmn,
}

# Iterate over (index label, audio path) pairs so the .loc write below always
# targets the row the path came from, whatever index the frame carries.
for idx, audio_path in df_test_cmn['audio'].items():
    # Upload once per clip; the uploaded file is reused for every prompt.
    myfile = client.files.upload(file=audio_path)

    for column, prompt in prompt_by_column.items():
        response = client.models.generate_content(
            model='gemini-2.5-flash',
            contents=[prompt, myfile],
        )
        # response.text can be None when the response carries no text part.
        # Store '' instead so the whitespace/punctuation clean-up and the
        # WER/CER cells that follow do not crash on a None entry.
        df_test_cmn.loc[idx, column] = response.text or ''

In [None]:
# Score the raw model outputs against the raw 'target' transcripts.
col_list = ['initial_result', 'initial_mandarin_result', 'optimized_result']
references = df_test_cmn['target']

for col in col_list:
    predictions = df_test_cmn[col]
    # WER first, then CER -- same evaluation order as the printed lines.
    wer_score, cer_score = (
        metric.compute(predictions=predictions, references=references)
        for metric in (wer_metric, cer_metric)
    )

    print(f"{col} WER : {wer_score:.4f}")
    print(f"{col} CER : {cer_score:.4f}")

In [None]:
def _strip_spaces(text):
    """Return ``text`` with every ' ' (space) character removed."""
    return text.replace(' ', '')


# Build a space-free copy of the target and of each prediction column.
# Mapping: new column name -> source column name.
nospace_sources = {
    'target_nospace': 'target',
    'initial_result_nospace': 'initial_result',
    'initial_mandarin_result_nospace': 'initial_mandarin_result',
    'optimized_result_nospace': 'optimized_result',
}

for new_col, src_col in nospace_sources.items():
    df_test_cmn[new_col] = df_test_cmn[src_col].apply(_strip_spaces)

In [None]:
# Character error rate on the space-free columns.
col_list = ['initial_result_nospace', 'initial_mandarin_result_nospace', 'optimized_result_nospace']
references = df_test_cmn['target_nospace']

for col in col_list:
    predictions = df_test_cmn[col]
    cer_score = cer_metric.compute(references=references, predictions=predictions)
    print(f"{col} CER : {cer_score:.4f}")

In [None]:
# Remove punctuation from the whitespace-free columns and lowercase the text.
# Lowercasing is applied to the predictions AND the target alike: previously
# only the target was lowercased, so any Latin letters in a transcript were
# counted as character errors purely because of case.
# Mapping: new column name -> source (space-free) column name.
no_punc_sources = {
    'initial_result_no_punc': 'initial_result_nospace',
    'initial_mandarin_result_no_punc': 'initial_mandarin_result_nospace',
    'optimized_result_no_punc': 'optimized_result_nospace',
    'target_no_punc': 'target_nospace',
}
for new_col, src_col in no_punc_sources.items():
    df_test_cmn[new_col] = df_test_cmn[src_col].apply(lambda x: remove_punctuation(x.lower()))

col_list = ['initial_result_no_punc', 'initial_mandarin_result_no_punc', 'optimized_result_no_punc']

# Character error rate of each normalized prediction column against the
# identically normalized target.
for col in col_list:
    cer_score = cer_metric.compute(predictions=df_test_cmn[col], references=df_test_cmn['target_no_punc'])
    print(f"{col} CER : {cer_score:.4f}")