In [1]:
import os
import shutil

# Directory holding the local FFmpeg build. Set the FFMPEG_BIN_DIR environment
# variable to use a different install without editing this cell; when it is
# unset or empty the original hard-coded location is used.
bin_path = os.environ.get("FFMPEG_BIN_DIR") or (
    r"D:\Study\GPT_AGENT_2025_BOOK\ffmpeg-2025-11-17-git-e94439e49b-full_build\bin"
)


def prepend_to_path(directory):
    """Put ``directory`` at the front of ``PATH`` exactly once.

    Any existing ``PATH`` entry that names the same directory (compared after
    ``normpath``/``normcase``) is removed first, so re-running the cell does
    not keep growing ``PATH`` with duplicates.

    Args:
        directory: Directory to place first on ``PATH``.

    Returns:
        The new value of ``os.environ["PATH"]``.
    """
    wanted = os.path.normcase(os.path.normpath(directory))
    remaining = [
        entry
        for entry in os.environ.get("PATH", "").split(os.pathsep)
        if os.path.normcase(os.path.normpath(entry)) != wanted
    ]
    os.environ["PATH"] = os.pathsep.join([directory] + remaining)
    return os.environ["PATH"]


prepend_to_path(bin_path)

# Check which ffmpeg executable now resolves through PATH.
print("shutil.which('ffmpeg') ->", shutil.which("ffmpeg"))

shutil.which('ffmpeg') -> D:\Study\GPT_AGENT_2025_BOOK\ffmpeg-2025-11-17-git-e94439e49b-full_build\bin\ffmpeg.EXE


In [16]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import json, pprint, os

# On a GTX 1660 Ti (6 GB VRAM), float16 inference on the GPU produced NaNs and
# no transcription came out, so float32 is used unconditionally.
# The abandoned variant, kept for reference:
# torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
torch_dtype = torch.float32
model_id = "openai/whisper-large-v3-turbo"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True)

# Try the first CUDA device when one is available; on any failure (the observed
# case was a CUDA out-of-memory error) fall back to the CPU.
# `device` always names the device the model actually ends up on.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
try:
    model.to(device)
    print(f"Moved model to {device}")
except Exception as e:
    print('Warning: failed to move model to GPU (falling back to CPU). Exception:', e)
    try:
        # Best effort: release cached GPU memory before continuing on the CPU.
        torch.cuda.empty_cache()
    except Exception as cache_err:
        print('Warning: could not clear the CUDA cache:', cache_err)
    device = "cpu"
    model.to(device)

# Integer device index for the pipeline: 0 for the first CUDA device, -1 for CPU.
# It is derived from `device` after the try/except so it is defined on BOTH the
# success path and the fallback path.
device_idx = 0 if device.startswith("cuda") else -1

processor = AutoProcessor.from_pretrained(model_id)

# Build the ASR pipeline. The tokenizer and feature extractor are passed
# explicitly alongside the processor, and the device is given as an integer index.
pipe = pipeline("automatic-speech-recognition",
                model=model,
                tokenizer=processor.tokenizer,
                feature_extractor=processor.feature_extractor,
                processor=processor,
                dtype=torch_dtype,
                device=device_idx,
                return_timestamps=True,
                chunk_length_s=10,
                stride_length_s=2,
                )

# Transcribe the sample recording (path is relative to the notebook's working directory).
sample = "../audio/lsy_audio_2023_58s.mp3"
result = pipe(sample)

# Pretty-print result to notebook output
pprint.pprint(result)

# Save the result as JSON and as plain text in ./output, relative to the
# notebook's working directory.
out_dir = os.path.join('output')
os.makedirs(out_dir, exist_ok=True)
json_path = os.path.join(out_dir, 'whisper_result.json')
txt_path = os.path.join(out_dir, 'whisper_result.txt')

# Serialise before touching the file. `default=str` stringifies any value the
# json module cannot encode, so the .json file stays valid JSON instead of
# receiving a Python repr. str(result) remains only as a last resort for a
# TypeError that `default` cannot resolve.
try:
    json_payload = json.dumps(result, ensure_ascii=False, indent=2, default=str)
    saved_note = 'Saved JSON to'
except TypeError:
    json_payload = str(result)
    saved_note = 'Saved str(result) to'

with open(json_path, 'w', encoding='utf-8') as f:
    f.write(json_payload)
print(saved_note, json_path)

# Collect the plain transcription text: `result` may be a single dict or a
# list of dicts, and only dict entries that carry a 'text' key contribute.
if isinstance(result, dict):
    candidates = [result]
elif isinstance(result, list):
    candidates = result
else:
    candidates = []
texts = [c['text'] for c in candidates if isinstance(c, dict) and 'text' in c]

if texts:
    with open(txt_path, 'w', encoding='utf-8') as f:
        f.write('\n\n'.join(texts))
    print('Saved transcription text to', txt_path)
else:
    print('No plain transcription text found in result.')

Search for `cudaErrorMemoryAllocation' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.



Device set to use cpu


{'chunks': [{'text': ' 안녕하세요. 이 강의는 GPT-API로 챗봇 만들기 라는 내용을 다루는 강의입니다.',
             'timestamp': (0.0, 6.3)},
            {'text': ' GPT-API에 대해서 생소하신 분들도 있을텐데', 'timestamp': (7.18, 10.0)},
            {'text': ' 우리가 잘 알고 있는 ChatGPT, ChatGPT 기능을 이용해서',
             'timestamp': (11.0, 17.0)},
            {'text': ' 우리가 원하는 프로그램을 어떻게 만드는지에 대해서', 'timestamp': (17.0, 20.0)},
            {'text': ' 이야기할 거예요.', 'timestamp': (20.0, 22.0)},
            {'text': ' 그래서 이런 강의들이 사실 많이 있습니다.', 'timestamp': (22.0, 24.0)},
            {'text': ' 그래서 여러가지들이 있는데 이 강의 특징이라고 한다면',
             'timestamp': (24.0, 27.48)},
            {'text': ' GPT로 명확한 미션을 달성하는', 'timestamp': (27.48, 29.58)},
            {'text': ' 챕터 프로그램을 만드는게 사실', 'timestamp': (29.58, 31.66)},
            {'text': ' 쉽지는 않은데 이걸 어떻게 해서', 'timestamp': (31.66, 34.32)},
            {'text': ' 구현을 하는지 그리고 그게 왜 필요한지에 대해서', 'timestamp': (34.32, 36.4)},
            {'text': ' 좀 이야기를 할 거고요.', 'timestamp': (36.4, 37.34)},
            {'text':

In [17]:
# Build an SRT (.srt) subtitle file from the ASR pipeline output.
# A variable named `result` must already exist in the notebook namespace.
import os

# Resolve both paths first, then make sure the output directory exists.
out_dir = os.path.join('output')
srt_path = os.path.join(out_dir, 'whisper_result.srt')
os.makedirs(out_dir, exist_ok=True)

def _to_srt_time(t):
    # t in seconds (float) -> SRT timestamp HH:MM:SS,mmm
    ms = int(round((t - int(t)) * 1000))
    h = int(t // 3600)
    m = int((t % 3600) // 60)
    s = int(t % 60)
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

segments = []

# Normalize various result formats (dict with 'chunks'/'segments' or list of dicts)
def _extract_from_obj(obj):
    segs = []
    if not obj:
        return segs
    if isinstance(obj, dict):
        if 'chunks' in obj and obj['chunks']:
            for ch in obj['chunks']:
                start = ch.get('start') or ch.get('begin') or None
                end = ch.get('end') or ch.get('finish') or None
                if start is None or end is None:
                    ts = ch.get('timestamp')
                    if isinstance(ts, (list, tuple)) and len(ts) >= 2:
                        start, end = float(ts[0]), float(ts[1])
                text = ch.get('text') or ch.get('chunk_text') or ''
                if start is None:
                    start = 0.0
                if end is None:
                    end = start + 1.0
                segs.append({'start': float(start), 'end': float(end), 'text': str(text).strip()})
        elif 'segments' in obj and obj['segments']:
            for s in obj['segments']:
                start = s.get('start', 0.0)
                end = s.get('end', start + 1.0)
                text = s.get('text', '')
                segs.append({'start': float(start), 'end': float(end), 'text': str(text).strip()})
    return segs

# Gather segments from `result`, which may be one dict or a list of result objects.
if isinstance(result, dict):
    sources = [result]
elif isinstance(result, list):
    sources = result
else:
    sources = []
for source in sources:
    segments.extend(_extract_from_obj(source))

# No timed segments found: fall back to one cue holding the whole text, with
# a duration of len(text) / 15 seconds and a floor of one second.
if not segments and isinstance(result, dict):
    whole_text = result.get('text')
    if whole_text:
        segments.append({
            'start': 0.0,
            'end': max(1.0, len(whole_text) / 15.0),
            'text': whole_text.strip(),
        })

# Drop empty-text segments, order by start time, then fold every segment that
# starts within 50 ms of the previous cue's end (or overlaps it) into that cue.
timed = [s for s in segments if s.get('text')]
timed.sort(key=lambda s: s['start'])
merged = []
for cue in timed:
    if merged and cue['start'] <= merged[-1]['end'] + 0.050:
        previous = merged[-1]
        previous['end'] = max(previous['end'], cue['end'])
        previous['text'] = (previous['text'] + ' ' + cue['text']).strip()
    else:
        merged.append(cue)

# Emit the cues: index line, time range line, single-line text, blank separator.
with open(srt_path, 'w', encoding='utf-8') as srt_file:
    for index, cue in enumerate(merged, start=1):
        begin = _to_srt_time(cue['start'])
        finish = _to_srt_time(cue['end'])
        body = cue['text'].replace('\n', ' ').strip()
        srt_file.write(f"{index}\n{begin} --> {finish}\n{body}\n\n")

print('Wrote SRT to', srt_path)
print('SRT segments written:', len(merged))

Wrote SRT to output\whisper_result.srt
SRT segments written: 6
