In [1]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, WhisperProcessor, AutoTokenizer
from datasets import load_dataset
import evaluate


In [33]:
# 저장된 모델과 프로세서 로드
model = AutoModelForSpeechSeq2Seq.from_pretrained("./checkpoint-5000")
processor = WhisperProcessor.from_pretrained("openai/whisper-small")

In [3]:
dataset = load_dataset("CAiRE/ASCEND")

In [5]:
metric = evaluate.load("wer")

In [11]:
feature_extractor = processor.feature_extractor
tokenizer = processor.tokenizer

In [19]:
def prepare_dataset(batch):
    audio = batch["audio"]
    input_features = feature_extractor(
        audio["array"], sampling_rate=audio["sampling_rate"]
    ).input_features[0]
    batch["input_features"] = input_features
    batch["labels"] = tokenizer(batch["transcription"]).input_ids
    return batch

In [None]:
# Ascend 데이터셋 전처리
dataset1 = dataset["test"].map(prepare_dataset, remove_columns=["audio", "transcription"])

In [None]:
dataset2 = dataset["test"].map(prepare_dataset, remove_columns=["audio", "transcription"])

In [30]:
def evaluate(batch):
    model.eval()
    with torch.no_grad():
        # 이미 제공된 input_features와 labels 사용
        input_values = torch.tensor(batch["input_features"]).unsqueeze(0).to("cuda" if torch.cuda.is_available() else "cpu")
        labels = torch.tensor(batch["labels"]).unsqueeze(0).to("cuda" if torch.cuda.is_available() else "cpu")
        
        # 디코더 입력값 생성
        decoder_input_ids = labels[:, :-1]  # 마지막 토큰 제외

        # 모델 추론
        outputs = model(input_values, decoder_input_ids=decoder_input_ids)
        logits = outputs.logits
        predicted_ids = torch.argmax(logits, dim=-1)

        # 텍스트 디코딩
        predicted_texts = processor.batch_decode(predicted_ids, skip_special_tokens=True)
        label_texts = processor.batch_decode(labels, skip_special_tokens=True)
        
        # WER 계산
        batch["wer"] = metric.compute(predictions=predicted_texts, references=label_texts)
        return batch


In [None]:
# 6. 평가 실행
results1 = dataset1.map(evaluate, batched=False)
average_wer1 = sum(results1["wer"]) / len(results1["wer"])

Map:   0%|          | 0/1315 [00:00<?, ? examples/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


In [None]:
average_wer1

1.0

In [34]:
model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small")

In [None]:
# 6. 평가 실행
results2 = dataset2.map(evaluate, batched=False)
average_wer2 = sum(results2["wer"]) / len(results2["wer"])



Map:   0%|          | 0/1315 [00:00<?, ? examples/s]

In [None]:
average_wer2

2.0231580203576938

In [37]:
wer_debug = metric.compute(
    predictions=["hello world"],
    references=["hello there"]
)
print("Sample WER Calculation:", wer_debug)

Sample WER Calculation: 0.5


In [15]:

import torch

# 1. 모델 및 프로세서 로드
model = AutoModelForSpeechSeq2Seq.from_pretrained("./finetune_model_cs")
processor = WhisperProcessor.from_pretrained("openai/whisper-small")

model.config.suppress_tokens = []
model.config.forced_decoder_ids = None

# 2. 데이터셋에서 하나의 오디오 샘플 가져오기
sample = dataset['test'][2]  # 첫 번째 샘플 (index 조정 가능)
audio = sample["audio"]

# 3. 오디오 데이터를 모델 입력값으로 변환
inputs = processor(
    audio["array"],
    sampling_rate=audio["sampling_rate"],
    return_tensors="pt"
).to("cuda" if torch.cuda.is_available() else "cpu")

# inputs = processor(audio, return_tensors="pt", sampling_rate=16000)
attention_mask = inputs.get("attention_mask", None)

# 4. 모델 추론
model.eval()
with torch.no_grad():
    generated_ids = model.generate(inputs["input_features"], attention_mask=attention_mask, language=None)

# 5. 예측 텍스트 디코딩
transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

# 6. 결과 출력
print(f"Original Audio Path: {sample['path']}")
print(f"Transcription: {transcription}")

Original Audio Path: /storage/hf-datasets-cache/all/datasets/16739474757983-config-parquet-and-info-CAiRE-ASCEND-5c1abf9c/downloads/extracted/f0790e45797bd654a35ecd1eb4865fa761f1cbd842b674e0defb6812ae8cffbf/waves/ses1_spk17_L3825_16.5740_2.8760.wav
Transcription: !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!


In [51]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("automatic-speech-recognition", model="openai/whisper-small")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [52]:
pipe(audio)



{'text': ' with your major and your home base.'}

pipe(audio)