<a href="https://colab.research.google.com/github/kimseungwoo99/fine-tuning-whisper/blob/main/finetuning_whipser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets>=2.6.1
!pip install git+https://github.com/huggingface/transformers
!pip install evaluate>=0.30
!pip install jiwer
!pip install accelerate -U
!pip install transformers[torch]
!pip install wandb

In [None]:
# Log in so that the trained model can be uploaded to the Hugging Face Hub.
# Logging in is also required to access private or gated datasets.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
from transformers import WhisperFeatureExtractor
# Load the feature extractor of the model that will be fine-tuned.
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base")

In [None]:
from transformers import WhisperTokenizer
# Load the tokenizer of the model that will be fine-tuned, set up for Korean transcription.
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base", language="Korean", task="transcribe")

In [None]:
# Round-trip a sample sentence through the tokenizer and show the decoded text
# with and without special tokens next to the original input.
input_str = "안녕하세요. 시작해보겠습니다."
labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

print(
    f"Input:                 {input_str}",
    f"Decoded w/ special:    {decoded_with_special}",
    f"Decoded w/out special: {decoded_str}",
    f"Are equal:             {input_str == decoded_str}",
    sep="\n",
)

In [None]:
from transformers import WhisperProcessor

# Load the processor of the same checkpoint, set up for Korean transcription.
processor = WhisperProcessor.from_pretrained("openai/whisper-base", language="Korean", task="transcribe")

In [None]:
from tqdm import tqdm
import pandas as pd

In [None]:
# Load the CSV file into a DataFrame, using its first column as the index.
df = pd.read_csv("/content/drive/MyDrive/dataframe/df_total.csv", index_col=0)

In [None]:
# Check for null data: total number of missing cells in the DataFrame.
df.isnull().values.sum()

In [None]:
# Drop the 'interval' column.
df = df.drop('interval', axis=1)

In [None]:
# Preview the first rows of the DataFrame.
df.head()

In [None]:
import pandas as pd
import re

# Clean every column except 'audio_file' (the audio path column is left as is).
columns_to_preprocess = df.columns[df.columns != 'audio_file']

for column in columns_to_preprocess:
    # Stringify each cell, remove every character that is neither a word
    # character nor whitespace, turn newlines into spaces, then trim the ends.
    stringified = df[column].apply(str)
    without_punctuation = stringified.apply(lambda text: re.sub(r'[^\w\s]', '', text))
    df[column] = without_punctuation.apply(lambda text: text.replace('\n', ' ').strip())

# Show the cleaned DataFrame.
print(df)


In [None]:
from datasets import Dataset, DatasetDict
from datasets import Audio

In [None]:
# Build a dataset whose "audio" column holds the audio file paths and whose
# "Text" column holds the matching transcripts, then cast "audio" to an Audio
# feature at 16,000 Hz, the sampling rate Whisper requires.
ds = Dataset.from_dict(
    {
        "audio": list(df["audio_file"]),
        "Text": list(df["sentence"]),
    }
)
ds = ds.cast_column("audio", Audio(sampling_rate=16000))

In [None]:
# Split the dataset into train (80%), test (10%) and validation (10%) subsets.
# A fixed seed makes the split reproducible, so the held-out subsets stay the
# same every time this cell is re-run.
# NOTE: the variable name `datasets` is kept because later cells refer to it.
train_testvalid = ds.train_test_split(test_size=0.2, seed=42)
test_valid = train_testvalid["test"].train_test_split(test_size=0.5, seed=42)
datasets = DatasetDict({
    "train": train_testvalid["train"],
    "test": test_valid["test"],
    "valid": test_valid["train"]})


In [None]:
def prepare_dataset(batch):
    """Turn one raw example into model inputs.

    Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"`` entries)
    and ``batch["Text"]``, then stores two new keys on ``batch``:

    * ``"input_features"``: the first entry of the ``input_features`` produced
      by the module-level ``feature_extractor`` for the audio array.
    * ``"labels"``: the ``input_ids`` produced by the module-level
      ``tokenizer`` for the transcript.

    Returns the same ``batch`` object that was passed in.
    """
    sample = batch["audio"]

    # Audio array -> log-Mel spectrogram features.
    extracted = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
    # Target text -> label ids.
    encoded = tokenizer(batch["Text"])

    batch["input_features"] = extracted.input_features[0]
    batch["labels"] = encoded.input_ids
    return batch

In [None]:
# Apply the preprocessing function to the whole dataset, removing the original
# columns (taken from the "train" split).
# NOTE(review): `datasets` here is the DatasetDict variable assigned in the split
# cell of this notebook, not the `datasets` library module.
low_call_voices = datasets.map(prepare_dataset, remove_columns=datasets.column_names["train"], num_proc=None)

In [None]:
from datasets import load_dataset
from datasets import load_from_disk

# Load a dataset previously saved to Google Drive (presumably the output of the
# preprocessing step; confirm against whatever wrote this directory).
# The library-level `load_from_disk` is used because the bare name `datasets`
# is only bound in this notebook once the split cell has run (where it is a
# DatasetDict); in a fresh session `datasets.load_from_disk(...)` raises NameError.
low_call_voices = load_from_disk("/content/drive/MyDrive/hi")

In [None]:
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union

In [None]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate preprocessed speech examples into one padded batch.

    Each example must provide ``"input_features"`` and ``"labels"``. The audio
    features and the label ids have different lengths and need different
    padding, so they are padded separately: features through
    ``processor.feature_extractor.pad`` and labels through
    ``processor.tokenizer.pad``.

    Calling the collator returns the padded feature batch with an extra
    ``"labels"`` tensor in which padded positions are set to -100 so that the
    loss ignores them.

    NOTE(review): when the tokenizer's bos token is not the token the label
    sequences actually begin with, the default check below never fires; pass
    the model's decoder start token id as ``decoder_start_token_id`` in that
    case (confirm against the tokenizer/model configuration).
    """

    # Object exposing `.feature_extractor.pad(...)` and `.tokenizer.pad(...)`.
    processor: Any
    # Token id removed from the front of the labels when every sequence in the
    # batch starts with it. None keeps the original behaviour of checking the
    # tokenizer's `bos_token_id`.
    decoder_start_token_id: Union[int, None] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Audio inputs: simply pad and convert to torch tensors.
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Tokenized label sequences, padded to the longest one in the batch.
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Replace padding with -100 so it is ignored when computing the loss.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # If every sequence starts with the start token, cut it off here;
        # it can be added back at any later point.
        start_token_id = self.decoder_start_token_id
        if start_token_id is None:
            start_token_id = self.processor.tokenizer.bos_token_id
        if (labels[:, 0] == start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch


In [None]:
# Load the processor, tokenizer and feature extractor of the model to train.
from transformers import WhisperTokenizer,  WhisperFeatureExtractor
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-base", language="Korean", task="transcribe")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base", language="Korean", task="transcribe")
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base")

In [None]:
# Initialise the data collator with the processor loaded above.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [None]:
import evaluate

# Load the 'cer' metric that compute_metrics uses.
metric = evaluate.load('cer')

In [None]:
def compute_metrics(pred):
    """Compute the 'cer' score, scaled to percent, for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` and ``label_ids``.

    Returns:
        ``{"cer": value}`` where ``value`` is 100 times the result of the
        module-level ``metric``.

    Note:
        ``pred.label_ids`` is modified in place: every -100 entry is
        overwritten with the tokenizer's pad token id so it can be decoded.
    """
    references = pred.label_ids
    # Put the pad token id back wherever the labels hold the -100 placeholder.
    references[references == -100] = tokenizer.pad_token_id

    # Special tokens are skipped so they do not count towards the metric.
    decode = tokenizer.batch_decode
    predicted_text = decode(pred.predictions, skip_special_tokens=True)
    reference_text = decode(references, skip_special_tokens=True)

    score = metric.compute(predictions=predicted_text, references=reference_text)
    return {"cer": 100 * score}

In [None]:
from transformers import WhisperForConditionalGeneration

# Load the pretrained whisper-base checkpoint that the trainer below fine-tunes.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")

In [None]:
# Clear the forced decoder ids and the suppressed-token list on the model config.
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

In [None]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    # Output location and Hub publishing.
    output_dir="repo_name",  # enter the repository name you want to use
    push_to_hub=True,
    # Optimisation schedule.
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=4000,  # set instead of a number of epochs
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # double this each time the batch size is halved
    gradient_checkpointing=True,
    fp16=True,
    # Evaluation and generation.
    # NOTE(review): newer transformers releases rename `evaluation_strategy` to
    # `eval_strategy`; confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=1000,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    # Checkpointing and best-model selection.
    save_steps=1000,
    load_best_model_at_end=True,
    metric_for_best_model="cer",  # for Korean, 'cer' is likely a better fit than 'wer'
    greater_is_better=False,
    # Logging.
    logging_steps=25,
    report_to=["tensorboard"],
)


In [None]:
from transformers import Seq2SeqTrainer

# Assemble the trainer: train on the "train" split, evaluate on the "valid" split.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=low_call_voices["train"],
    eval_dataset=low_call_voices["valid"],  # or "test"
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)


In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()

In [None]:
# Keyword arguments that are forwarded to `trainer.push_to_hub`.
kwargs = dict(
    dataset_tags="https://huggingface.co/datasets/aoome123/important",
    dataset="important",  # a 'pretty' name for the training dataset
    dataset_args="config: ko, split: valid",
    language="ko",
    model_name="ft_model",  # a 'pretty' name for your model
    finetuned_from="openai/whisper-base",
    tasks="automatic-speech-recognition",
    tags="hf-asr-leaderboard",
)


In [None]:
# Push to the Hub through the trainer, forwarding the entries of `kwargs`.
trainer.push_to_hub(**kwargs)

In [None]:
# Also upload the processor and the tokenizer to the "repo_name" repository.
processor.push_to_hub("repo_name")
tokenizer.push_to_hub("repo_name")

In [None]:
# Load the fine-tuned model together with its feature extractor, tokenizer and processor.
from transformers import WhisperForConditionalGeneration, WhisperProcessor, WhisperFeatureExtractor, WhisperTokenizer

fine_model = WhisperForConditionalGeneration.from_pretrained("aoome123/repo_name")


fine_feature_extractor = WhisperFeatureExtractor.from_pretrained("aoome123/repo_name")
fine_tokenizer = WhisperTokenizer.from_pretrained("aoome123/repo_name", language="Korean", task="transcribe")
fine_processor = WhisperProcessor.from_pretrained("aoome123/repo_name")


In [None]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    # Output location; nothing is pushed to the Hub from this configuration.
    output_dir="repo_name",  # enter the repository name you want to use
    push_to_hub=False,
    # Optimisation schedule.
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=4000,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # double this each time the batch size is halved
    gradient_checkpointing=True,
    fp16=True,
    # Evaluation and generation.
    evaluation_strategy="steps",
    eval_steps=1000,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    # Checkpointing and best-model selection.
    save_steps=1000,
    load_best_model_at_end=True,
    metric_for_best_model="cer",  # for Korean, 'cer' is likely a better fit than 'wer'
    greater_is_better=False,
    # Logging.
    logging_steps=25,
    report_to=["tensorboard"],
)


In [None]:
from transformers import Seq2SeqTrainer

# Trainer wrapping the fine-tuned model, evaluated on the "valid" split.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=fine_model,
    train_dataset=low_call_voices["train"],
    eval_dataset=low_call_voices["valid"],  # for evaluation(not validation)
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=fine_processor.feature_extractor,  # changed: taken from the fine-tuned processor
)



In [None]:
# NOTE(review): the `trainer` defined in the cell above wraps `fine_model`; to score the
# stock whisper-base checkpoint the trainer has to be rebuilt with that model first. Confirm.
trainer.evaluate() # original whisper-base model

In [None]:
trainer.evaluate() # fine-tuned model

In [None]:
from transformers import DataCollatorForSeq2Seq

# Rebind `data_collator` to a DataCollatorForSeq2Seq tied to the fine-tuned model,
# padding each batch to its longest example.
# NOTE(review): the `tokenizer` argument receives the feature extractor rather than a
# text tokenizer; confirm this is intended.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=fine_processor.feature_extractor,
    model=fine_model,
    padding='longest',
    pad_to_multiple_of=None
)


In [None]:
from transformers import DataCollatorForSeq2Seq

# Rebind `data_collator` to a DataCollatorForSeq2Seq tied to `model` and `processor`,
# padding each batch to its longest example.
# NOTE(review): the `tokenizer` argument receives the feature extractor rather than a
# text tokenizer; confirm this is intended.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=processor.feature_extractor,
    model=model,
    padding='longest',
    pad_to_multiple_of=None
)

In [None]:
from transformers import DataCollatorForSeq2Seq

# Load the whisper-small model, processor, tokenizer and feature extractor.
# Note that this rebinds the notebook-level `tokenizer` and `feature_extractor`.
small_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
small_processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Korean", task="transcribe")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="Korean", task="transcribe")
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")

# NOTE(review): the `tokenizer` argument receives the feature extractor rather than a
# text tokenizer; confirm this is intended.
small_data_collator = DataCollatorForSeq2Seq(
    tokenizer=small_processor.feature_extractor,  # feature extractor of the whisper-small processor
    model=small_model,  # the whisper-small model loaded above
    padding='longest',
    pad_to_multiple_of=None
)


In [None]:
# tiny model
from transformers import WhisperTokenizer,  WhisperFeatureExtractor
from transformers import WhisperProcessor
from transformers import Seq2SeqTrainer
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="repo_name",  # enter the repository name you want to use
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # double this each time the batch size is halved
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=4000,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="cer",  # for Korean, 'cer' is likely a better fit than 'wer'
    greater_is_better=False,
    push_to_hub=False,
)

# Pair the tiny model with its own processor and a collator built from it, so
# this cell no longer borrows the whisper-small processor/collator created in a
# different cell. The collator class is the one defined earlier in this
# notebook and used for training.
tiny_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
tiny_processor = WhisperProcessor.from_pretrained("openai/whisper-tiny", language="Korean", task="transcribe")
tiny_data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=tiny_processor)
tiny_trainer = Seq2SeqTrainer(
    model=tiny_model,
    args=training_args,
    eval_dataset=low_call_voices['test'],  # evaluation dataset
    compute_metrics=compute_metrics,  # metric function
    tokenizer=tiny_processor,  # pass the full processor
    data_collator=tiny_data_collator,  # batch padding
)


# Evaluate the tiny model
tiny_eval_result = tiny_trainer.evaluate()
print(tiny_eval_result)

In [None]:
from transformers import DataCollatorForSeq2Seq

# Load the whisper-base model, processor, tokenizer and feature extractor.
# Note that this rebinds the notebook-level `feature_extractor`.
base_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
base_processor = WhisperProcessor.from_pretrained("openai/whisper-base", language="Korean", task="transcribe")
base_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base", language="Korean", task="transcribe")
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base")

# NOTE(review): the `tokenizer` argument receives the feature extractor rather than a
# text tokenizer; confirm this is intended.
base_data_collator = DataCollatorForSeq2Seq(
    tokenizer=base_processor.feature_extractor,  # feature extractor of the whisper-base processor
    model=base_model,  # the whisper-base model loaded above
    padding='longest',
    pad_to_multiple_of=None
)

In [None]:
# base model
from transformers import WhisperTokenizer,  WhisperFeatureExtractor
from transformers import WhisperProcessor
from transformers import Seq2SeqTrainer

training_args = Seq2SeqTrainingArguments(
    # Output location; nothing is pushed to the Hub from this configuration.
    output_dir="repo_name",  # enter the repository name you want to use
    push_to_hub=False,
    # Optimisation schedule.
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=4000,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # double this each time the batch size is halved
    gradient_checkpointing=True,
    fp16=True,
    # Evaluation and generation.
    evaluation_strategy="steps",
    eval_steps=1000,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    # Checkpointing and best-model selection.
    save_steps=1000,
    load_best_model_at_end=True,
    metric_for_best_model="cer",  # for Korean, 'cer' is likely a better fit than 'wer'
    greater_is_better=False,
    # Logging.
    logging_steps=25,
    report_to=["tensorboard"],
)

# A fresh copy of the base checkpoint is loaded for this evaluation run.
base_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
base_trainer = Seq2SeqTrainer(
    args=training_args,
    model=base_model,
    data_collator=base_data_collator,  # batch padding
    tokenizer=base_processor,  # pass the full processor
    compute_metrics=compute_metrics,  # metric function
    eval_dataset=low_call_voices['test'],  # evaluation dataset
)


# Evaluate the base model
base_eval_result = base_trainer.evaluate()
print(base_eval_result)

In [None]:
from transformers import DataCollatorForSeq2Seq

# Load the whisper-medium model, processor, tokenizer and feature extractor.
# Note that this rebinds the notebook-level `feature_extractor`.
medium_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-medium")
medium_processor = WhisperProcessor.from_pretrained("openai/whisper-medium", language="Korean", task="transcribe")
medium_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-medium", language="Korean", task="transcribe")
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-medium")

# NOTE(review): the `tokenizer` argument receives the feature extractor rather than a
# text tokenizer; confirm this is intended.
medium_data_collator = DataCollatorForSeq2Seq(
    tokenizer=medium_processor.feature_extractor,
    model=medium_model,  # medium_model is used here instead of fine_model
    padding='longest',
    pad_to_multiple_of=None
)

In [None]:
# medium model
from transformers import WhisperTokenizer,  WhisperFeatureExtractor
from transformers import WhisperProcessor
from transformers import Seq2SeqTrainer

training_args = Seq2SeqTrainingArguments(
    # Output location; nothing is pushed to the Hub from this configuration.
    output_dir="repo_name",  # enter the repository name you want to use
    push_to_hub=False,
    # Optimisation schedule.
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=4000,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # double this each time the batch size is halved
    gradient_checkpointing=True,
    fp16=True,
    # Evaluation and generation.
    evaluation_strategy="steps",
    eval_steps=1000,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    # Checkpointing and best-model selection.
    save_steps=1000,
    load_best_model_at_end=True,
    metric_for_best_model="cer",  # for Korean, 'cer' is likely a better fit than 'wer'
    greater_is_better=False,
    # Logging.
    logging_steps=25,
    report_to=["tensorboard"],
)

medium_trainer = Seq2SeqTrainer(
    args=training_args,
    model=medium_model,
    data_collator=medium_data_collator,  # batch padding
    tokenizer=medium_processor,
    compute_metrics=compute_metrics,  # metric function
    eval_dataset=low_call_voices['test'],  # evaluation dataset
)


# Evaluate the medium model
medium_eval_result = medium_trainer.evaluate()
print(medium_eval_result)

In [None]:
from transformers import WhisperTokenizer,  WhisperFeatureExtractor
from transformers import WhisperProcessor
from transformers import Seq2SeqTrainer
from transformers import WhisperForConditionalGeneration
from transformers import Seq2SeqTrainer

In [None]:
# Reload the fine-tuned checkpoint with its feature extractor, tokenizer and processor.
# Note that this rebinds the notebook-level `feature_extractor` and `tokenizer`.
fine_model = WhisperForConditionalGeneration.from_pretrained("aoome123/repo_name")
feature_extractor = WhisperFeatureExtractor.from_pretrained("aoome123/repo_name")
tokenizer = WhisperTokenizer.from_pretrained("aoome123/repo_name", language="Korean", task="transcribe")
fine_processor = WhisperProcessor.from_pretrained("aoome123/repo_name")

In [None]:
from transformers import WhisperForConditionalGeneration
from transformers import DataCollatorForSeq2Seq

# Collator for evaluating the fine-tuned model; pads each batch to its longest example.
# NOTE(review): the `tokenizer` argument receives the feature extractor rather than a
# text tokenizer; confirm this is intended.
fine_data_collator = DataCollatorForSeq2Seq(
    tokenizer=fine_processor.feature_extractor,  # feature extractor of the fine-tuned processor
    model=fine_model,  # the fine-tuned model
    padding='longest',
    pad_to_multiple_of=None
)

In [None]:
# fine-tuned model
from transformers import WhisperTokenizer,  WhisperFeatureExtractor
from transformers import WhisperProcessor
from transformers import Seq2SeqTrainer

training_args = Seq2SeqTrainingArguments(
    # Output location; nothing is pushed to the Hub from this configuration.
    output_dir="repo_name",  # enter the repository name you want to use
    push_to_hub=False,
    # Optimisation schedule.
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=4000,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # double this each time the batch size is halved
    gradient_checkpointing=True,
    fp16=True,
    # Evaluation and generation.
    evaluation_strategy="steps",
    eval_steps=1000,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    # Checkpointing and best-model selection.
    save_steps=1000,
    load_best_model_at_end=True,
    metric_for_best_model="cer",  # for Korean, 'cer' is likely a better fit than 'wer'
    greater_is_better=False,
    # Logging.
    logging_steps=25,
    report_to=["tensorboard"],
)

# A fresh copy of the fine-tuned checkpoint is loaded for this evaluation run.
fine_model = WhisperForConditionalGeneration.from_pretrained("aoome123/repo_name")
fine_trainer = Seq2SeqTrainer(
    args=training_args,
    model=fine_model,
    data_collator=fine_data_collator,  # batch padding
    tokenizer=fine_processor,
    compute_metrics=compute_metrics,  # metric function
    eval_dataset=low_call_voices['test'],  # evaluation dataset
)


# Evaluate the fine-tuned model
fine_eval_result = fine_trainer.evaluate()
print(fine_eval_result)