<a href="https://colab.research.google.com/github/jasial2/JapaneseTranscription/blob/main/DWT_Transcription.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Added 'onnxruntime-gpu' to fix the VAD warning and speed up processing
!pip install whisper-timestamped srt ffmpeg-python tqdm onnxruntime-gpu

Collecting whisper-timestamped
  Downloading whisper_timestamped-1.15.9-py3-none-any.whl.metadata (1.4 kB)
Collecting srt
  Downloading srt-3.5.3.tar.gz (28 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting onnxruntime-gpu
  Downloading onnxruntime_gpu-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.4 kB)
Collecting dtw-python (from whisper-timestamped)
  Downloading dtw_python-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (7.5 kB)
Collecting openai-whisper (from whisper-timestamped)
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyp

In [None]:
# --- STEP 2: SETUP & IMPORTS ---
import whisper_timestamped as whisper
import torch
import os
import srt
import datetime
import ffmpeg
import re
import sys
import warnings
from tqdm import tqdm

# Silence UserWarning messages attributed to torch.hub so they do not clutter
# the cell output.
warnings.filterwarnings(action="ignore", category=UserWarning, module="torch.hub")

# Pick the compute device once: use CUDA when torch reports it is available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Importing the dtw module. When using in academic works please cite:
  T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.
  J. Stat. Soft., doi:10.18637/jss.v031.i07.

Using device: cuda


In [None]:
# Load the Whisper checkpoint onto the device selected earlier.
# Trade-off: 'turbo' is the faster checkpoint, while 'large-v3' usually yields
# better timestamps.
print("Loading model...")
model = whisper.load_model(
    "turbo",
    device=device,
)

Loading model...


100%|██████████████████████████████████████| 1.51G/1.51G [00:11<00:00, 145MiB/s]


In [None]:
# --- CONFIGURATION ---
input_video = "1234.mp3"  # <--- REPLACE THIS WITH YOUR FILE
output_srt = "output.srt"
language = "ja"

# --- HELPER: AUDIO PROCESSING ---
# Intermediate file written by ffmpeg and consumed by the transcription step.
processed_audio = "temp_clean_audio.wav"

# Fail early with an actionable message instead of an opaque ffmpeg error.
if not os.path.exists(input_video):
    raise FileNotFoundError(f"File '{input_video}' not found! Upload it to the Files tab.")

print(f"1. Optimizing audio for Voice Frequencies ({input_video})...")
try:
    # Output format: mono (ac=1), 16-bit PCM, resampled to 16 kHz (ar=16000).
    # highpass=f=100 attenuates low-frequency rumble below ~100 Hz.
    # lowpass=f=8000 sits at the Nyquist limit of the 16 kHz output, so it mostly
    # acts as a guard before resampling rather than an audible filter.
    (
        ffmpeg.input(input_video)
        .output(processed_audio, acodec="pcm_s16le", ac=1, ar="16000", af="highpass=f=100,lowpass=f=8000")
        .overwrite_output()
        .run(quiet=True)
    )
except ffmpeg.Error as e:
    # With quiet=True the ffmpeg log is captured as bytes; decode it so the
    # message prints as readable multi-line text instead of a b'...' repr.
    stderr_text = e.stderr
    if isinstance(stderr_text, bytes):
        stderr_text = stderr_text.decode("utf-8", errors="replace")
    print("FFmpeg error:", stderr_text)
    raise

# --- 2. RUN WHISPER-TIMESTAMPED (The Sync Fix) ---
print("2. Transcribing with Forced Alignment (DTW)...")

# The temporary WAV is only needed while decoding. Removing it in `finally`
# guarantees it does not linger on disk when transcription raises.
try:
    result = whisper.transcribe(
        model,
        processed_audio,
        language=language,

        # --- SYNC SETTINGS ---
        # Beam search at temperature 0 (single temperature, so no fallback schedule).
        # NOTE(review): best_of is a sampling option and presumably has no effect
        # while temperature is 0.0 — confirm against the whisper decoding docs.
        beam_size=5,
        best_of=5,
        temperature=0.0,

        # Do not rely on Whisper's own segment timestamps as the starting estimate;
        # this is the setting the notebook relies on to fix "out of sync" subtitles.
        trust_whisper_timestamps=False,

        # Voice-activity detection is switched OFF for this run. Set vad=True to
        # have non-speech (silence/breathing) handled before decoding.
        vad=False,

        # Detects hesitancy (uh, um) separately (optional, helps precision)
        detect_disfluencies=True,

        # Decode each window without feeding the previous text back as a prompt.
        condition_on_previous_text=False,
        # Japanese for: "Ignore groans and breathing sounds; transcribe only the
        # conversation." NOTE(review): Whisper treats the prompt as prior context,
        # not as an instruction — verify it actually helps.
        initial_prompt="うめき声や呼吸音を無視して、会話のみを書き起こしてください。"
    )
finally:
    if os.path.exists(processed_audio):
        os.remove(processed_audio)

# --- 3. ADVANCED FILTERING ---
print("3. Applying Japanese Garbage & Duration Filters...")

# A. Hallucination triggers: lowercase substrings; a segment whose lowercased
#    text contains any of them is treated as hallucinated outro/credit text.
hallucination_triggers = [
    # English
    "thank you for watching",
    "thanks for watching",
    "please subscribe",
    "subscribe",
    "sub by",
    "translated by",
    "amara",
    "viewing",
    "see you next",
    "bye",
    "the end",
    "like and",
    "follow me",
    # Japanese
    "字幕",
    "視聴",
    "チャンネル",
    "登録",
    "高評価",
]

# B. Garbage sounds: tokens that are dropped when a segment consists of nothing
#    else. Written as space-separated text and split into a set.
garbage_exact_matches = set(
    (
        # English
        "a aa ah ahh ha haa hah haha "
        "mm mmm hmm mh oh huh o m "
        "h eh uh uhh "
        # Japanese
        "あ ああ あっ あー "
        "ん んん んっ う うっ "
        "は はぁ はあ ふ ふぅ "
        "く くっ や いや お おっ"
    ).split()
)

# Build the subtitle list from the transcription result, skipping segments that
# look like noise, hallucinated credits, filler sounds, or decoding loops.
final_subs = []
sub_index = 1

for segment in result["segments"]:
    text = segment["text"].strip()
    text_lower = text.lower()

    # 1. DURATION CHECK
    # Segments shorter than 0.4 s are treated as noise.
    duration = segment["end"] - segment["start"]
    if duration < 0.4:
        continue

    # 2. HALLUCINATION CHECK
    # Substring match against the lowercase trigger phrases.
    if any(h in text_lower for h in hallucination_triggers):
        continue

    # 3. GARBAGE CHECK
    clean_text = re.sub(r'[^\w\s]', '', text_lower) # Remove punctuation
    words = clean_text.split()

    # Nothing but punctuation/whitespace.
    if not words:
        continue

    # Every token is a known filler sound.
    if all(w in garbage_exact_matches for w in words):
        continue

    # 4. REPETITION CHECK
    # Drop segments that are one unit repeated five or more times back-to-back.
    # The match runs on the text with whitespace removed because Japanese output
    # is normally unspaced, so a whitespace-token comparison alone would never
    # fire on e.g. "あああああ". It still covers the whole-word case
    # ("hi hi hi hi hi" -> "hihihihihi").
    if re.fullmatch(r'(.+?)\1{4,}', "".join(words)):
        continue

    # Add to subtitles
    final_subs.append(
        srt.Subtitle(
            index=sub_index,
            start=datetime.timedelta(seconds=segment["start"]),
            end=datetime.timedelta(seconds=segment["end"]),
            content=text
        )
    )
    sub_index += 1

# --- 4. SAVE FILE ---
# Serialise the kept subtitles to SRT text and write them out as UTF-8.
with open(output_srt, mode="w", encoding="utf-8") as srt_file:
    srt_text = srt.compose(final_subs)
    srt_file.write(srt_text)

# Cleanup: delete the intermediate audio file if it is still on disk.
temp_audio_present = os.path.exists(processed_audio)
if temp_audio_present:
    os.remove(processed_audio)

# Final status summary, one line per item.
print(
    "------------------------------------------------",
    "✅ Optimization Complete.",
    f"Saved: {output_srt}",
    sep="\n",
)

1. Optimizing audio for Voice Frequencies (1234.mp3)...
2. Transcribing with Forced Alignment (DTW)...


100%|██████████| 187317/187317 [02:52<00:00, 1088.41frames/s]


3. Applying Japanese Garbage & Duration Filters...
------------------------------------------------
✅ Optimization Complete.
Saved: output.srt
