In [1]:
from transformers import WhisperFeatureExtractor

# Load the feature extractor that ships with the base Whisper checkpoint.
feature_extractor = WhisperFeatureExtractor.from_pretrained(
    "openai/whisper-base"
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [2]:
import torch

# Report whether PyTorch can see a CUDA device and, if so, describe it.
print("CUDA 사용 가능 여부:", torch.cuda.is_available())
if torch.cuda.is_available():
    # Index 0 is the first visible CUDA device.
    print("GPU 이름:", torch.cuda.get_device_name(0))
    print("GPU 개수:", torch.cuda.device_count())


CUDA 사용 가능 여부: True
GPU 이름: NVIDIA RTX 6000 Ada Generation
GPU 개수: 1


In [3]:
from transformers import WhisperTokenizer

# Load the tokenizer of the model that is going to be fine-tuned.
tokenizer = WhisperTokenizer.from_pretrained(
    "openai/whisper-base",
    language="Korean",
    task="transcribe",
)

In [4]:
# Round-trip a sample sentence through the tokenizer to check that decoding
# without special tokens gives back the original text.
input_str = "저는 서울중앙지검 지능범죄수사팀 최인호 검사입니다."
labels = tokenizer(input_str).input_ids

decoded_str = tokenizer.decode(labels, skip_special_tokens=True)
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)

# One print call, one line per item; the labels are padded so values align.
print(
    f"Input:                 {input_str}",
    f"Decoded w/ special:    {decoded_with_special}",
    f"Decoded w/out special: {decoded_str}",
    f"Are equal:             {input_str == decoded_str}",
    sep="\n",
)

Input:                 저는 서울중앙지검 지능범죄수사팀 최인호 검사입니다.
Decoded w/ special:    <|startoftranscript|><|ko|><|transcribe|><|notimestamps|>저는 서울중앙지검 지능범죄수사팀 최인호 검사입니다.<|endoftext|>
Decoded w/out special: 저는 서울중앙지검 지능범죄수사팀 최인호 검사입니다.
Are equal:             True


In [5]:
from transformers import WhisperProcessor

# Load the processor for the same checkpoint, language and task as the
# tokenizer above.
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-base",
    language="Korean",
    task="transcribe",
)

In [None]:
import os
import librosa

def _audio_duration(file_path):
    """Return the duration of one audio file, in seconds, without decoding it."""
    try:
        # Passing the file location lets librosa query the duration from the
        # file itself rather than loading every sample into memory first.
        return librosa.get_duration(path=file_path)
    except TypeError:
        # Older librosa releases (before 0.10) name this keyword ``filename``.
        return librosa.get_duration(filename=file_path)


def get_total_duration_recursive(base_dir, extension=".wav"):
    """Sum the durations of all matching audio files under ``base_dir``.

    Args:
        base_dir: Directory to walk recursively.
        extension: File-name suffix (or tuple of suffixes) to include.
            Matching is case-insensitive, so ``".wav"`` also picks up
            ``".WAV"``.

    Returns:
        Total duration in seconds as a float; ``0.0`` when nothing matches
        (including when ``base_dir`` does not exist, because ``os.walk``
        then yields nothing).

    Files that cannot be read are reported on stdout and skipped.
    """
    suffixes = (extension,) if isinstance(extension, str) else tuple(extension)
    suffixes = tuple(suffix.lower() for suffix in suffixes)

    total_duration = 0.0
    for root, _dirs, files in os.walk(base_dir):
        for file in files:
            if not file.lower().endswith(suffixes):
                continue
            file_path = os.path.join(root, file)
            try:
                total_duration += _audio_duration(file_path)
            except Exception as e:
                # Best effort: one unreadable file must not abort the scan.
                print(f"파일 오류: {file_path} → {e}")
    return total_duration

# Directory to measure: audio/KsponSpeech_01
base_path = "./data/audio/KsponSpeech_01"
total_seconds = get_total_duration_recursive(base_path)

# Show the same total in minutes, seconds and hours.
print(
    f"총 오디오 길이: {total_seconds / 60:.2f} 분 "
    f"({total_seconds:.2f} 초), {total_seconds / 60 / 60:.2f} 시간"
)


총 오디오 길이: 11583.26 분 (694995.49 초), 193.05


### Main Reference : https://huggingface.co/datasets/google/fleurs
### Sub Reference  : https://huggingface.co/blog/audio-datasets

- Make sure to read the sub reference as well.

In [33]:
from datasets import load_dataset

# Load the training split of the "ko_kr" configuration of FLEURS.
fleurs = load_dataset(
    "google/fleurs",
    name="ko_kr",
    split="train",
)
