<a href="https://colab.research.google.com/github/kth0522/AI_news/blob/main/voice_recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**라이브러리 설치**

In [2]:
!pip install -U openai-whisper
!pip install pydub
!pip install pathlib



**코드**

In [3]:
import whisper
import csv
from pydub import AudioSegment
import os
from pathlib import Path

In [4]:
# Load the Whisper "medium" checkpoint once at module level;
# process_audio_files() reads this global `model` for every file it transcribes.
model = whisper.load_model("medium")

def process_audio_files(input_folder, transcription_folder, segments_folder):
    """Transcribe every ``*.wav`` file in a folder and cut it into per-segment clips.

    For each input file ``<name>.wav`` this writes:

    * ``<transcription_folder>/<name>.csv`` with the header
      ``id, start, end, text`` followed by one row per transcribed segment;
    * ``<segments_folder>/<name>/<id>.wav`` holding the audio between that
      segment's ``start`` and ``end`` timestamps.

    Transcription is done with the module-level ``model`` object.

    Args:
        input_folder: Directory searched (non-recursively) for ``*.wav`` files.
        transcription_folder: Directory that receives the CSV files; created
            if it does not exist.
        segments_folder: Directory that receives one sub-directory of clips
            per input file; created if it does not exist.

    Returns:
        None. If no ``*.wav`` file is found, nothing is created.
    """
    transcription_dir = Path(transcription_folder)
    segments_root = Path(segments_folder)

    # glob() yields files in arbitrary order; sort for a reproducible run.
    for audio_file_path in sorted(Path(input_folder).glob("*.wav")):
        file_name = audio_file_path.stem
        output_csv_path = transcription_dir / f"{file_name}.csv"
        output_segments_dir = segments_root / file_name

        output_csv_path.parent.mkdir(parents=True, exist_ok=True)
        output_segments_dir.mkdir(parents=True, exist_ok=True)

        segments = model.transcribe(str(audio_file_path))["segments"]

        # newline='' lets the csv module control line endings itself.
        with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(['id', 'start', 'end', 'text'])
            csvwriter.writerows(
                [seg['id'], seg['start'], seg['end'], seg['text']] for seg in segments
            )

        audio = AudioSegment.from_wav(str(audio_file_path))

        for seg in segments:
            # Timestamps are scaled by 1000 before slicing (presumably
            # seconds from Whisper -> milliseconds for pydub indexing).
            start_ms = int(seg["start"] * 1000)
            end_ms = int(seg["end"] * 1000)
            clip_path = output_segments_dir / f"{seg['id']}.wav"

            # export() hands back the open output file; close it explicitly so
            # long recordings with many segments do not leak file handles.
            exported = audio[start_ms:end_ms].export(clip_path, format="wav")
            exported.close()


100%|█████████████████████████████████████| 1.42G/1.42G [00:16<00:00, 90.3MiB/s]


**실행**

In [6]:
# Input folder
INPUT_DIR = './vocal_samples'

# Folder where transcription results are saved
TRANSCRIPTION_DIR = './transcriptions'

# Folder where voice segment results are saved
SEGMENTS_DIR = './segments'

In [7]:
# Run the transcription/segmentation pipeline over the configured folders.
process_audio_files(
    input_folder=INPUT_DIR,
    transcription_folder=TRANSCRIPTION_DIR,
    segments_folder=SEGMENTS_DIR,
)

