In [63]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torch.nn.functional as F
import torch
import torchaudio


In [64]:

# 모델 및 프로세서 로드
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3-turbo")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3-turbo")



In [65]:
from datasets import load_dataset, Audio


dataset = load_dataset("CAiRE/ASCEND")

In [66]:
# 음성 파일 로드 및 전처리
audio_data = dataset['train'][41]['audio']  # audio 데이터 가져오기
waveform = audio_data["array"]  # waveforms 추출
sample_rate = audio_data["sampling_rate"]  # 샘플링 레이트 추출

# 다중 채널 오디오를 단일 채널로 변환 (평균)
if len(waveform.shape) > 1 and waveform.shape[0] > 1:
    waveform = waveform.mean(axis=0)

# 샘플링 레이트 변환
waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(torch.tensor(waveform))
waveform_np = waveform.numpy()

# 입력 데이터 생성
inputs = processor(waveform_np, sampling_rate=16000, return_tensors="pt")

# 언어 프롬프트 설정
forced_decoder_ids_en = processor.get_decoder_prompt_ids(language="english", task="transcribe")
forced_decoder_ids_zh = processor.get_decoder_prompt_ids(language="chinese", task="transcribe")


In [67]:
# 영어 logits 계산
with torch.no_grad():
    outputs_en = model.generate(
        inputs.input_features,
        forced_decoder_ids=forced_decoder_ids_en,
        return_dict_in_generate=True,
        output_scores=True
    )
logits_en = outputs_en.scores  # 영어 확률 분포

# 한국어 logits 계산
with torch.no_grad():
    outputs_zh = model.generate(
        inputs.input_features,
        forced_decoder_ids=forced_decoder_ids_zh,
        return_dict_in_generate=True,
        output_scores=True
    )
logits_zh = outputs_zh.scores  # 한국어 확률 분포



In [68]:
penalty_value = 5.0  # 영어에 부여할 패널티 값 (적절한 값으로 설정)

# 영어 로짓에 패널티 적용
english_logits = [logit - penalty_value for logit in logits_en]

# 확률 계산
english_probs = [F.softmax(logit, dim=-1) for logit in english_logits]
chinese_probs = [F.softmax(logit, dim=-1) for logit in logits_zh]

# Code-switching 비교
inference_text = []
code_switching_result = []
for frame_idx in range(len(english_probs)):
    english_prob = english_probs[frame_idx].max().item()
    chinese_prob = chinese_probs[frame_idx].max().item()

    if english_prob > chinese_prob:
        selected_lang = "English"
        selected_word = processor.tokenizer.decode(english_probs[frame_idx].argmax().item())
    else:
        selected_lang = "Chinese"
        selected_word = processor.tokenizer.decode(chinese_probs[frame_idx].argmax().item())
    
    inference_text.append(selected_word)
    code_switching_result.append((frame_idx, selected_lang, selected_word))

In [69]:
from sentence_transformers import SentenceTransformer, util

# Inference 텍스트 결합
inference_text = " ".join(inference_text)

# Gold 텍스트 가져오기
gold_text = dataset['train'][41]['transcription']

# 의미적 유사도 계산
model2 = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
gold_embedding = model2.encode(gold_text, convert_to_tensor=True)
inference_embedding = model2.encode(inference_text, convert_to_tensor=True)
similarity = util.cos_sim(gold_embedding, inference_embedding).item()

# 결과 출력
print(f"Original transcription (Gold): {gold_text}")
print(f"Inference transcription: {inference_text}")
print(f"Semantic Similarity: {similarity:.4f}")

# 프레임별 결과 출력
for frame, lang, word in code_switching_result:
    print(f"Frame {frame}: {lang} -> {word}")

Original transcription (Gold): i remember呃在我本科的时候我花过挺长的时间
Inference transcription:  I  remember , 在 我 本 科  I , 我有 花 過  time 長
Semantic Similarity: 0.6353
Frame 0: English ->  I
Frame 1: English ->  remember
Frame 2: Chinese -> ,
Frame 3: Chinese -> 在
Frame 4: Chinese -> 我
Frame 5: Chinese -> 本
Frame 6: Chinese -> 科
Frame 7: English ->  I
Frame 8: Chinese -> ,
Frame 9: Chinese -> 我有
Frame 10: Chinese -> 花
Frame 11: Chinese -> 過
Frame 12: English ->  time
Frame 13: Chinese -> 長


•    The Whisper model uses Byte Pair Encoding (BPE) to tokenize text.
    <br>•    BPE breaks down text into frequently used subword units, which can cause some words to be split or appear shorter.
    <br>•    Especially for languages like Chinese, where each character carries meaning, BPE may fail to combine tokens appropriately.
<br>If we can find some other language dataset, we may get much better result