In [None]:
from transformers import WhisperFeatureExtractor

feature_extractor = WhisperFeatureExtractor.from_pretrained('openai/whisper-base')

In [None]:
import torch

print("CUDA 사용 가능 여부:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU 이름:", torch.cuda.get_device_name(0))
    print("GPU 개수:", torch.cuda.device_count())
    print


In [None]:
from transformers import WhisperTokenizer
# 파인튜닝을 진행하고자 하는 모델의 tokenizer를 로드
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base", language="Korean", task="transcribe")

In [None]:
input_str = "저는 서울중앙지검 지능범죄수사팀 최인호 검사입니다."
labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

print(f"Input:                 {input_str}")
print(f"Decoded w/ special:    {decoded_with_special}")
print(f"Decoded w/out special: {decoded_str}")
print(f"Are equal:             {input_str == decoded_str}")

In [None]:
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-base", language="Korean", task="transcribe")

In [9]:
import os
import librosa

def get_total_duration_recursive(base_dir, extension=".wav"):
    total_duration = 0.0

    for root, dirs, files in os.walk(base_dir):
        for file in files:
            if file.endswith(extension):
                file_path = os.path.join(root, file)
                try:
                    y, sr = librosa.load(file_path, sr=None)
                    duration = librosa.get_duration(y=y, sr=sr)
                    total_duration += duration
                except Exception as e:
                    print(f"파일 오류: {file_path} → {e}")
    return total_duration

# 경로 지정: audio/KsponSpeech_01
base_path = "./data/audio/KsponSpeech_01"
total_seconds = get_total_duration_recursive(base_path)
print(f"총 오디오 길이: {total_seconds/60:.2f} 분 ({total_seconds:.2f} 초), {total_seconds/60/60:.2f} 시간")


총 오디오 길이: 11583.26 분 (694995.49 초), 193.05 시간


### Main Reference : https://huggingface.co/datasets/google/fleurs
### Sub Reference  : https://huggingface.co/blog/audio-datasets

- You need to watch sub reference

In [None]:
from datasets import load_dataset
fleurs = load_dataset("google/fleurs", "ko_kr", split="train")


In [None]:
from datasets import load_dataset
dataset = load_dataset("parquet", data_files=r"D:\Whisper\data\fleurs\test-00000-of-00006.parquet")

In [None]:
import re
unique_lst = ['o/', 'b/', 'l/','/n','+']
sentence = 'o/ (그러니까)/(그니까*). 상관 없어 그러면.'

if any(tag in sentence for tag in unique_lst):
    included_tags = [tag for tag in unique_lst if tag in sentence]
    for tag in included_tags:
        sentence = sentence.replace(tag,'')
# 2. 괄호 쌍 구조가 있을 때만 처리
if re.search(r'\([^)]+\)/\([^)]+\)', sentence):

    groups = re.findall(r'\(([^)]+)\)/\([^)]+\)', sentence)
    sentence = re.sub(r'\([^)]+\)/\([^)]+\)', '', sentence)

    # 2-3. 단일 괄호도 제거 (혹시 남아있을 경우)
    sentence = re.sub(r'\([^)]+\)', '', sentence)

    # 2-4. 특수문자 제거
    sentence = re.sub(r'[^\w가-힣\s]', '', sentence)

    # 2-5. 앞에 추출된 그룹 붙이기
    final_sentence = ' '.join(groups) + ' ' + sentence.strip()
else:
    # 괄호 구조 없으면 단순히 특수문자만 제거
    sentence = re.sub(r'[^\w가-힣\s]', '', sentence)
    final_sentence = sentence.strip()

print(final_sentence)

In [None]:
import re

sentence = '(그러니까)/(그니까*). 상관 없어 그러면.'

# 1. 괄호 그룹 찾기 (예: (그러니까)/(그니까*))
groups = re.findall(r'\(([^)]+)\)/\([^)]+\)', sentence)
# print(groups)  # ['그러니까'] 만 남겨짐

# 2. 괄호 블록 전체 제거
sentence = re.sub(r'\([^)]+\)/\([^)]+\)', '', sentence)

# 3. 괄호 1개짜리도 제거 (혹시 남아있을 경우)
sentence = re.sub(r'\([^)]+\)', '', sentence)

# 4. 특수문자 제거 (여기선 . 만)
sentence = re.sub(r'[^\w가-힣\s]', '', sentence)

# 5. 정제된 표현 붙이기
final = ' '.join(groups) + ' ' + sentence.strip()

print('최종 문장:', final)


In [None]:
unique_lst = ['o/', 'b/', 'l/', '/n', '+', '/', 'n', 'u', '*']
pattern = '|'.join(map(re.escape, unique_lst))  # 정규식 패턴 생성

print(pattern)

In [None]:
import re
target_file = r'D:\Whisper\data\info\train_KsponSpeech_01_test.csv'
unique_lst = ['o/', 'b/', 'l/', '/n', '+', 'n', 'u', '*',]
pattern = '|'.join(map(re.escape, unique_lst))  # 정규식 패턴 생성
with open(target_file, 'rt', encoding='utf-8') as f:
            data = f.readlines()
if target_file.endswith('.csv'):
    header = data[:1]
    lines = data[1:]
else:
    header = []
    lines = data

for i in lines:
    path, original_sentence = i.split(',',1)
    original_sentence = original_sentence.strip().replace('"', '')
    original_sentence = re.sub(pattern, '', original_sentence)

    if re.search(r'\([^)]+\)/\([^)]+\)', original_sentence):
                groups = re.findall(r'\(([^)]+)\)/\([^)]+\)', original_sentence)
                sentence = re.sub(r'\([^)]+\)/\([^)]+\)', '', original_sentence)
                final_sentence = ' '.join(groups) + ' ' + sentence.strip()
                
    else:
        final_sentence = original_sentence.strip()

    final_sentence = re.sub(r'\([^)]+\)\([^)]+\)', lambda m: m.group(0).split(')(')[0] + ')', final_sentence)
    final_sentence = re.sub(r'\(([^()]+)\)', r'\1', final_sentence)
    
    
    print(final_sentence)

In [8]:
import re
target_file = r'D:\Whisper\data\info\train_KsponSpeech_01_test.csv'
unique_lst = ['o/', 'b/', 'l/', '/n', '+', 'n', 'u', '*','/',')']
all_pattern = '|'.join(map(re.escape, unique_lst))  # 정규식 패턴 생성
print(all_pattern)

with open(target_file, 'rt', encoding='utf-8') as f:
            data = f.readlines()
if target_file.endswith('.csv'):
    header = data[:1]
    lines = data[1:]

new_sentences = []
for i in lines:
    path, original_sentence = i.split(',',1)
    original_sentence = original_sentence.strip().replace('"','')
    sub_pattern = r'\(([^/]+)/[^)]*\)' # 앞에 것만 남기기기
    result = re.sub(sub_pattern, r'\1', original_sentence)
    final_sentence = re.sub(all_pattern,'', result)
    final_sentence = re.sub('  ',' ',final_sentence)
    final_sentence = final_sentence.strip()
    final_sentence = f'"{final_sentence}"'


    new_sentences.append(f"{path},{final_sentence}\n")

print(new_sentences[262:263])
    # with open(target_file, 'wt', encoding='utf-8') as f:
    #     if header:
    #         f.writelines(header)
    #     f.writelines(new_sentences)






o/|b/|l/|/n|\+|n|u|\*|/|\)
['D:\\Whisper\\data/audio\\KsponSpeech_01/KsponSpeech_0002/KsponSpeech_001270.wav,"작년 아 생각도 하기 싫어 이전을 생각하면 진짜 아침 8시부터 밤 10시까지 어떻게 학원에 있었나 싶다니까"\n']


In [1]:
import pandas as pd

try:
    df = pd.read_csv('data/info/train_KsponSpeech_01_train.csv')
    print("Train CSV OK:", df.shape)
except Exception as e:
    print("Train CSV Error:", e)

try:
    df = pd.read_csv('data/info/train_KsponSpeech_01_test.csv')
    print("Valid CSV OK:", df.shape)
except Exception as e:
    print("Valid CSV Error:", e)

try:
    df = pd.read_csv('data/info/fleurs_transcription_test.csv')
    print("Test CSV OK:", df.shape)
except Exception as e:
    print("Test CSV Error:", e)


Train CSV OK: (99200, 2)
Valid CSV OK: (24800, 2)
Test CSV OK: (382, 2)
