<a href="https://colab.research.google.com/github/jasial2/JapaneseTranscription/blob/main/Demucs_%2C_BS_Roformer_%2B_DWT_Transcription.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title üõ†Ô∏è Step 1: Smart Installation (Select Engine)

import subprocess
import sys
import os

# ==========================================
# üéõÔ∏è INSTALL CONFIGURATION
# ==========================================

# Choose which engine you intend to use in Step 2.
# "Demucs Only": Fast installation. Best for quiet/conversational JAV.
# "BS-Roformer Only": Slower install, heavier. Best for music/noise removal.
# "Install Both": Installs everything (Takes longest).
install_mode = "BS-Roformer Only" # @param ["Demucs Only", "BS-Roformer Only", "Install Both"]

# ==========================================
# üöÄ INSTALLATION SCRIPT
# ==========================================

def install(package):
    """Install ``package`` with pip, using the interpreter that runs this notebook.

    Args:
        package: A pip requirement string (name, optionally with extras/version).

    Raises:
        subprocess.CalledProcessError: If the pip process exits with a non-zero status.
    """
    # Going through ``sys.executable -m pip`` targets the kernel's own environment.
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

# Announce the selected engine set before the (slow) pip work starts.
print(f"‚è≥ Initializing Setup for: {install_mode}...")

try:
    # 1. ALWAYS INSTALL: Common Base Tools (Required for Step 3 & FFmpeg)
    print("   -> Installing Core Tools (FFmpeg, Whisper, SRT)...")
    for _core_package in ("ffmpeg-python", "soundfile", "yt-dlp",
                          "whisper-timestamped", "srt", "tqdm"):
        install(_core_package)

    # 2. CONDITIONAL INSTALL: Demucs
    if "Demucs" in install_mode or "Both" in install_mode:
        print("   -> Installing Demucs...")
        install("demucs")

    # 3. CONDITIONAL INSTALL: BS-Roformer (UVR5)
    if "Roformer" in install_mode or "Both" in install_mode:
        print("   -> Installing BS-Roformer (Audio Separator)...")
        # The [gpu] extra requests the GPU-accelerated variant of the package.
        install("audio-separator[gpu]")

    print("‚úÖ Installation Complete.")
    print(f"   ‚ÑπÔ∏è  Ready to use {install_mode} in Step 2.")

except Exception as e:
    print(f"‚ùå Installation Failed: {e}")
    # Re-raise so "Run all" stops here instead of continuing into cells whose
    # imports cannot succeed (same print-then-raise pattern as Step 2).
    raise

‚è≥ Initializing Setup for: BS-Roformer Only...
   -> Installing Core Tools (FFmpeg, Whisper, SRT)...
   -> Installing BS-Roformer (Audio Separator)...
‚úÖ Installation Complete.
   ‚ÑπÔ∏è  Ready to use BS-Roformer Only in Step 2.


In [None]:
# @title üßπ Step 2: Dual-Engine Audio Mastering (Demucs / BS-Roformer)

import os
import torch
import gc
import ffmpeg
import logging
import subprocess
import sys
import soundfile as sf
import numpy as np

# ==========================================
# üéõÔ∏è MASTER CONFIGURATION
# ==========================================

input_filename = "1234.mp3" # @param {type:"string"}
final_output_filename = "1234_mastered.wav" # @param {type:"string"}

# --- ENGINE SELECTION ---
# "BS-Roformer": Best for removing music/loud noise. Surgical precision.
# "Demucs (HT)": Best for quiet rooms/conversations. Warmer, more natural.
processing_engine = "BS-Roformer (ViperX)" # @param ["BS-Roformer (ViperX)", "Demucs (HT)"]

# --- [DEMUCS ONLY] SETTINGS ---
# "Fast": Good for quick checks. "High Precision": Best for complex overlap.
demucs_quality = "High Precision (4 Shifts)" # @param ["Fast (1 Shifts)", "Standard (2 Shifts)", "High Precision (4 Shifts)"]

# --- [ROFORMER ONLY] SETTINGS ---
# "ViperX-1297": The SOTA model. "MDX23C": Faster fallback.
roformer_model = "ViperX-1297 (Best Quality)" # @param ["ViperX-1297 (Best Quality)", "MDX23C (Fast)"]

# --- MASTERING (APPLIES TO BOTH) ---
# "Conversation (LRA 7)": Flattens volume so quiet actress = loud cameraman. (BEST FOR AI)
# "Balanced (LRA 9)": A middle ground.
# "Natural (LRA 11)": Broadcast standard. Keeps dynamics.
audio_profile = "Conversation (LRA 7) - Best for AI" # @param ["Conversation (LRA 7) - Best for AI", "Balanced (LRA 9)", "Natural (LRA 11) - Broadcast Standard"]

# ==========================================
# ‚öôÔ∏è LOGIC PARSER (FIXED)
# ==========================================

# 1. Output Cleanup
# Compare the extension case-insensitively so "take.WAV" is not turned into
# "take.WAV.wav".
if not final_output_filename.lower().endswith(".wav"):
    final_output_filename += ".wav"
# The intermediate vocal stem is named after the input file (extension dropped).
base_name = os.path.splitext(os.path.basename(input_filename))[0]
temp_vocal_path = f"temp_{base_name}_vocals.wav"

# 2. Parse LRA
# The profile label is matched by keyword; anything that is neither
# "Conversation" nor "Balanced" falls through to 11.
if "Conversation" in audio_profile:
    target_lra = 7
elif "Balanced" in audio_profile:
    target_lra = 9
else:
    target_lra = 11

# ==========================================
# üöÄ MAIN EXECUTION
# ==========================================

def run_demucs(in_file, out_file, quality_mode):
    """Isolate the vocal stem of ``in_file`` with the ``htdemucs`` model.

    The audio is read with soundfile, matched to the model's channel count and
    sample rate, normalised, separated, scaled back to the input's level and
    written to ``out_file``.

    Args:
        in_file: Path of the audio file to separate.
        out_file: Path the isolated vocals are written to.
        quality_mode: Label containing "Fast" (1 shift), "Standard" (2 shifts);
            anything else selects 4 shifts.

    Raises:
        Exception: Any failure is printed and re-raised unchanged.
    """
    print(f"\nüîπ ENGINE: Running Demucs (HTDemucs)...")
    from demucs.pretrained import get_model
    from demucs.apply import apply_model
    import torchaudio

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Parse Quality
    if "Fast" in quality_mode:
        shifts = 1
    elif "Standard" in quality_mode:
        shifts = 2
    else:
        shifts = 4 # High Precision

    # Pre-bind so the ``finally`` clause can release them on any exit path.
    model = wav = sources = None
    try:
        model = get_model("htdemucs")
        model.to(device)
        target_sr = model.samplerate
        target_channels = model.audio_channels

        print("   -> Loading audio...")
        wav_np, sr = sf.read(in_file)
        wav = torch.from_numpy(wav_np).float()

        # soundfile yields (frames,) or (frames, channels); the model wants
        # (channels, frames).
        if wav.dim() == 1:
            wav = wav.unsqueeze(0)
        else:
            wav = wav.t()

        # Match the model's channel count (a mono file would otherwise be fed
        # as a single channel).
        if wav.shape[0] != target_channels:
            if wav.shape[0] == 1:
                wav = wav.repeat(target_channels, 1)
            elif target_channels == 1:
                wav = wav.mean(0, keepdim=True)
            elif wav.shape[0] > target_channels:
                wav = wav[:target_channels]
            else:
                raise ValueError(
                    f"Cannot map {wav.shape[0]} input channels to {target_channels} model channels."
                )

        if sr != target_sr:
            resampler = torchaudio.transforms.Resample(sr, target_sr)
            wav = resampler(wav)

        # Normalise for the model, remembering the statistics so the output
        # can be scaled back afterwards.
        ref = wav.mean(0)
        ref_mean = ref.mean().item()
        ref_std = max(ref.std().item(), 1e-8)  # avoid dividing by zero on silent input
        wav = ((wav - ref_mean) / ref_std).unsqueeze(0).to(device)

        print(f"   -> Separating (Shifts={shifts})...")
        sources = apply_model(model, wav, shifts=shifts, split=True, overlap=0.25, progress=True)

        # Look the stem up by name rather than by a hard-coded position.
        vocal_index = model.sources.index("vocals")
        # Undo the normalisation: without this the written signal has unit
        # variance and lies far outside the [-1, 1] range of float audio.
        vocals = (sources[0, vocal_index] * ref_std + ref_mean).cpu().numpy()

        sf.write(out_file, vocals.T, target_sr)

    except Exception as e:
        print(f"‚ùå Demucs Error: {e}")
        raise

    finally:
        # Release the model and tensors on success *and* failure, then return
        # cached blocks to the driver.
        del model, wav, sources
        gc.collect()
        torch.cuda.empty_cache()

def run_roformer(in_file, out_file, model_mode):
    """Isolate vocals from ``in_file`` with audio-separator and move them to ``out_file``.

    Args:
        in_file: Path of the audio file to separate.
        out_file: Destination path for the vocal stem (replaced if it exists).
        model_mode: Label; one containing "ViperX" selects the BS-Roformer
            checkpoint, anything else selects the ONNX model below.

    Raises:
        FileNotFoundError: If the separator reports no output file, or the
            reported file cannot be found.
        Exception: Any other failure is printed and re-raised unchanged.
    """
    print(f"\nüîπ ENGINE: Running BS-Roformer (Audio Separator)...")
    from audio_separator.separator import Separator

    # Select Model File
    if "ViperX" in model_mode:
        model_filename = "model_bs_roformer_ep_317_sdr_12.9755.ckpt"
    else:
        # NOTE(review): the form labels this option "MDX23C (Fast)" but the file
        # name is a UVR-MDX-NET "Inst_HQ_3" model — confirm this is intended.
        model_filename = "UVR-MDX-NET-Inst_HQ_3.onnx"

    separator_output_dir = "/content/"
    separator = None
    try:
        # Initialize
        separator = Separator(
            log_level=logging.ERROR,
            model_file_dir="/content/audio-separator-models/",
            output_dir=separator_output_dir,
            output_single_stem="vocals"
        )

        print(f"   -> Loading Model: {model_filename}")
        separator.load_model(model_filename=model_filename)

        print(f"   -> Inference...")
        output_files = separator.separate(in_file)

        # An empty result would otherwise surface as a bare IndexError.
        if not output_files:
            raise FileNotFoundError("Roformer did not output a file.")

        generated_file = output_files[0]
        if not os.path.exists(generated_file):
            # The reported path may be relative to the separator's output
            # directory rather than to the current working directory.
            # (os.path.join leaves an absolute path untouched.)
            generated_file = os.path.join(separator_output_dir, generated_file)
        if not os.path.exists(generated_file):
            raise FileNotFoundError("Roformer did not output a file.")

        # os.replace overwrites an existing destination in one step.
        os.replace(generated_file, out_file)

    except Exception as e:
        print(f"‚ùå Roformer Error: {e}")
        raise

    finally:
        # Mirror run_demucs: drop the separator and flush cached GPU memory
        # whether or not separation succeeded.
        del separator
        gc.collect()
        torch.cuda.empty_cache()

# --- MEMORY SAFETY CHECK ---
# Start from an empty CUDA cache so the separation model gets as much VRAM as possible.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    gc.collect()

if not os.path.exists(input_filename):
    print(f"‚ùå Error: File '{input_filename}' not found!")
else:
    print(f"üöÄ Phase 1: Processing {input_filename}")
    print(f"   ‚öôÔ∏è Engine: {processing_engine}")
    print(f"   ‚öôÔ∏è Profile: {audio_profile} (LRA {target_lra})")

    # --- 1. RUN SEPARATION ---
    # Either engine writes the isolated vocals to temp_vocal_path.
    try:
        if "Demucs" in processing_engine:
            run_demucs(input_filename, temp_vocal_path, demucs_quality)
        else:
            run_roformer(input_filename, temp_vocal_path, roformer_model)

        print(f"   ‚úÖ Vocal Isolation Complete.")

    except Exception as e:
        print(f"‚ùå Critical Separation Error: {e}")
        raise

    # --- 2. RUN MASTERING (LOUDNORM) ---
    print(f"\nüîπ [Phase 2] FFmpeg Broadcast Mastering...")
    try:
        # FILTER CHAIN:
        # highpass=f=90 / lowpass=f=8000: band-limit the vocal stem.
        # loudnorm: loudness normalisation; LRA comes from the profile parsed above.
        # Output is mono, 16 kHz, 16-bit PCM.
        (
            ffmpeg.input(temp_vocal_path)
            .output(
                final_output_filename,
                acodec="pcm_s16le", ac=1, ar="16000",
                af=f"highpass=f=90,lowpass=f=8000,loudnorm=I=-16:TP=-1.5:LRA={target_lra}"
            )
            .overwrite_output()
            .run(quiet=True)
        )
        print(f"   ‚úÖ Mastered File Ready: {final_output_filename}")

        # Cleanup happens only after a successful master, so a failed FFmpeg
        # run leaves the separated vocals available for a retry.
        if os.path.exists(temp_vocal_path):
            os.remove(temp_vocal_path)

    except ffmpeg.Error as e:
        # Captured stderr arrives as bytes; decode it so the log is readable
        # instead of a single b'...' repr with escaped newlines.
        print(
            "   ‚ùå FFmpeg Error:",
            e.stderr.decode("utf-8", errors="replace") if isinstance(e.stderr, bytes) else e.stderr,
        )
        raise

üöÄ Phase 1: Processing 1234.mp3
   ‚öôÔ∏è Engine: BS-Roformer (ViperX)
   ‚öôÔ∏è Profile: Conversation (LRA 7) - Best for AI (LRA 7)

üîπ ENGINE: Running BS-Roformer (Audio Separator)...
   -> Loading Model: model_bs_roformer_ep_317_sdr_12.9755.ckpt


28.3kiB [00:00, 29.3MiB/s]                  
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 639M/639M [00:06<00:00, 105MiB/s] 
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2.27k/2.27k [00:00<00:00, 3.97MiB/s]


   -> Inference...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 240/240 [12:13<00:00,  3.06s/it]


   ‚úÖ Vocal Isolation Complete.

üîπ [Phase 2] FFmpeg Broadcast Mastering...
   ‚úÖ Mastered File Ready: 1234_mastered.wav


In [None]:
# @title üßπ Step 2.5: Aggressive GPU Memory Nuke (Essential for BS-Roformer)

import torch
import gc
import sys

def get_gpu_memory():
    """Report PyTorch's CUDA memory usage as ``(allocated_mb, reserved_mb)``.

    Returns:
        A 2-tuple in mebibytes; ``(0, 0)`` when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return 0, 0

    bytes_per_mb = 1024**2
    return (
        torch.cuda.memory_allocated() / bytes_per_mb,
        torch.cuda.memory_reserved() / bytes_per_mb,
    )

print("üõë INITIATING DEEP GPU CLEANING CYCLE...")

# --- 1. REMOVE LEFTOVER GLOBALS ---
# Names that Step 2 might have left in the notebook's global namespace.
target_vars = [
    'separator',      # BS-Roformer Object
    'model',          # Demucs/Roformer Model
    'wav',            # Audio Tensor
    'sources',        # Demucs Output
    'vocals',         # Raw Vocals
    'resampler',      # Torchaudio resampler
    'demucs',         # Demucs module alias
    'wav_np'          # Numpy Audio
]

deleted_count = 0
for var in target_vars:
    if var in globals():
        print(f"   üî´ Killing variable: {var}")
        del globals()[var]
        deleted_count += 1

if deleted_count == 0:
    print("   ‚ÑπÔ∏è  No heavy variables found in global scope (Clean slate).")

# --- 2. THE THREE-STAGE FLUSH LOOP ---
# Repeat up to max_retries times; stop as soon as reserved memory is below the threshold.
max_retries = 3
clean_success = False

for i in range(1, max_retries + 1):
    print(f"\n   üîÑ [Cycle {i}/{max_retries}] Flushing Cache...")

    # 1. Force Python Garbage Collection (All Generations)
    gc.collect()

    # The CUDA calls are guarded: ipc_collect() initialises CUDA and therefore
    # raises on a CPU-only runtime, which the other cells tolerate.
    if torch.cuda.is_available():
        # 2. Clear PyTorch CUDA Cache
        torch.cuda.empty_cache()

        # 3. Clear IPC (Inter-Process Communication) handles
        torch.cuda.ipc_collect()

    # Verification
    allocated, reserved = get_gpu_memory()
    print(f"      üìâ Current Status -> Allocated: {allocated:.2f}MB | Reserved: {reserved:.2f}MB")

    # Threshold: under 1000MB reserved is treated as effectively empty
    if reserved < 1000:
        clean_success = True
        break

# --- 3. FINAL VERDICT ---
print("\n" + "="*30)
final_alloc, final_reserved = get_gpu_memory()

if clean_success or final_reserved < 1000:
    print(f"‚úÖ GPU CLEAN SUCCESSFUL")
    print(f"   üß† VRAM Available for Whisper: ~14GB (on T4)")
    print(f"   üìä Final 'Junk' Usage: {final_reserved:.2f}MB (Negligible)")
else:
    print(f"‚ö†Ô∏è GPU CLEAN WARNING: PERSISTENT MEMORY DETECTED")
    print(f"   üìä VRAM still holding: {final_reserved:.2f}MB")
    print("   üõë SUGGESTION: If Step 3 crashes, go to 'Runtime' -> 'Restart Session' and skip Step 2.")
print("="*30)

üõë INITIATING DEEP GPU CLEANING CYCLE...
   ‚ÑπÔ∏è  No heavy variables found in global scope (Clean slate).

   üîÑ [Cycle 1/3] Flushing Cache...
      üìâ Current Status -> Allocated: 9.12MB | Reserved: 22.00MB

‚úÖ GPU CLEAN SUCCESSFUL
   üß† VRAM Available for Whisper: ~14GB (on T4)
   üìä Final 'Junk' Usage: 22.00MB (Negligible)


In [None]:
# @title üß† Step 3.1: Load Whisper Model (Smart Selection)

import whisper_timestamped as whisper
import torch
import warnings
import gc

# ==========================================
# üéõÔ∏è MODEL SELECTION GUIDE
# ==========================================

# Choose based on your video content:
# 1. v3: Has the best vocabulary for Story/Plot. BUT it often hallucinates text during moaning scenes.
# 2. v2: OLDER BUT SAFER. It ignores breathing/moaning better. Use this if v3 gives you garbage.
# 3. Turbo: Very fast, but lower accuracy. Good for checking sync.
model_variant = "large-v3 (Best for Story/Plot - High Vocab)" # @param ["large-v3 (Best for Story/Plot - High Vocab)", "large-v2 (Stable - Best for Heavy Breathing/Moans)", "large-v3-turbo (Fastest - Lower Accuracy)"]

# ==========================================
# üöÄ LOAD LOGIC
# ==========================================

# The model name is the first whitespace-delimited token of the form label.
model_size = model_variant.split(maxsplit=1)[0]

warnings.filterwarnings("ignore", category=UserWarning, module="torch.hub")
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"üöÄ Initializing Whisper...")
print(f"   ‚ÑπÔ∏è  Selected Profile: {model_variant}")

try:
    if 'loaded_model' not in globals():
        # Report free VRAM before committing to the load (GPU runtimes only).
        if torch.cuda.is_available():
            free_mem = torch.cuda.mem_get_info()[0] / 1024**3
            print(f"   üß† VRAM Available: {free_mem:.2f} GB")
            if free_mem < 4.0:
                print("   ‚ö†Ô∏è WARNING: VRAM is critically low. Did you run Step 2.5?")

        # Load the model; it stays in the global `loaded_model` for Step 3.2.
        print(f"   ‚è≥ Downloading & Loading '{model_size}'... (This happens once)")
        loaded_model = whisper.load_model(model_size, device=device)

        print(f"‚úÖ Model Loaded Successfully!")

        # Advice keyed on the selected model name (substring match).
        if "large-v3" in model_size:
            print("   üí° TIP: You chose v3. If you see 'Thank you for watching' loops,")
            print("      increase the 'logprob_threshold' in Step 3.2.")
        elif "large-v2" in model_size:
            print("   üí° TIP: You chose v2. It is very stable for JAV.")
            print("      It might miss some whispery dialects, but it won't hallucinate as much.")

        print("‚¨áÔ∏è  Proceed to Step 3.2.")

    else:
        # A model from an earlier run of this cell is kept as-is.
        print(f"‚ö†Ô∏è A model is already loaded in memory.")
        print(f"   To switch models, you must Restart Runtime (Runtime -> Restart Session).")
        print(f"   Otherwise, proceed to Step 3.2.")

except Exception as e:
    print(f"‚ùå Failed to load model: {e}")
    print("   üí° Tip: If OutOfMemory error, restart runtime and skip Step 2 (assuming audio is saved).")

Importing the dtw module. When using in academic works please cite:
  T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.
  J. Stat. Soft., doi:10.18637/jss.v031.i07.

üöÄ Initializing Whisper...
   ‚ÑπÔ∏è  Selected Profile: large-v3 (Best for Story/Plot - High Vocab)
   üß† VRAM Available: 14.59 GB
   ‚è≥ Downloading & Loading 'large-v3'... (This happens once)


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2.88G/2.88G [00:57<00:00, 54.1MiB/s]


‚úÖ Model Loaded Successfully!
   üí° TIP: You chose v3. If you see 'Thank you for watching' loops,
      increase the 'logprob_threshold' in Step 3.2.
‚¨áÔ∏è  Proceed to Step 3.2.


In [None]:
# @title üìù Step 3.2: Transcribe & Download (Fixed & Robust)

import srt
import datetime
import re
import os
import torch
import gc
from google.colab import files

# ==========================================
# üéõÔ∏è USER CONFIGURATION
# ==========================================

# --- FILES ---
audio_file_to_transcribe = "1234_mastered.wav" # @param {type:"string"}
output_srt = "1234.srt" # @param {type:"string"}

# --- EXTERNAL VAD (Silero) ---
# [TRUE] = "Hard Mode". Cuts non-speech audio before Whisper hears it.
#          Fixes hallucinations but might cut soft "Ya..." sounds.
# [FALSE] = "Soft Mode". Relies on Whisper's internal logic. Safer for JAV.
enable_silero_vad = False # @param {type:"boolean"}

vad="False" # @param ["True", "False"]

# --- WHISPER SENSITIVITY (Internal Logic) ---
# "High Sensitivity": Captures whispers/breathing (Effectively VAD=OFF).
# "Balanced": Standard behavior.
# "Strict": Ignores ambiguous sounds (Effectively VAD=ON).
internal_sensitivity = "High Sensitivity (Plot/Whispers)" # @param ["High Sensitivity (Plot/Whispers)", "Balanced (Standard)", "Strict (Action/No Moans)"]

# --- HALLUCINATION GUARD ---
# "Strict": Deletes text if confidence is low. Prevents loops.
hallucination_guard = "Strict (Anti-Loop)" # @param ["Strict (Anti-Loop)", "Permissive (Allow Mumbling)"]

# --- GENRE CONTEXT ---
genre_context = "Standard JAV (Casual/Conversational)" # @param ["Standard JAV (Casual/Conversational)", "Hardcore (More yelling/moaning)", "Dialect/Kansai (Regional Speech)"]

# ==========================================
# ‚öôÔ∏è LOGIC PARSER & SAFETY CHECKS
# ==========================================

# Silero VAD is enabled when EITHER form control asks for it: the
# `enable_silero_vad` checkbox or the `vad` dropdown ("True"/"False").
# Previously only the dropdown was read, so ticking the checkbox had no effect.
use_silero_vad = bool(enable_silero_vad) or vad == "True"

# 1. Parse Internal Sensitivity (no_speech_threshold)
# Matched by keyword; any label without "High" or "Strict" gets 0.4.
if "High" in internal_sensitivity:
    speech_threshold = 0.25
elif "Strict" in internal_sensitivity:
    speech_threshold = 0.6
else:
    speech_threshold = 0.4

# 2. Parse Hallucination Guard (logprob_threshold)
if "Strict" in hallucination_guard:
    logprob_cutoff = -0.7
else:
    logprob_cutoff = -1.0

# 3. Build Prompt
# `final_prompt` is the shared `base_prompt` followed by one genre-specific
# suffix. It is passed to whisper.transcribe() as `initial_prompt` further down.
# The genre label is matched by keyword: "Standard", then "Hardcore"; any other
# label (the remaining form option is the dialect one) takes the last branch.
base_prompt = "‰ºöË©±„ÅÆ„Åø„ÇíÊõ∏„ÅçËµ∑„Åì„Åó„Å¶„Åè„Å†„Åï„ÅÑ„ÄÇÂëºÂê∏Èü≥„ÄÅ„ÅÇ„Åà„ÅéÂ£∞„ÅØÁÑ°Ë¶ñ„ÄÇ"
if "Standard" in genre_context:
    final_prompt = base_prompt + "„Çø„É°Âè£„ÄÅÊó•Â∏∏‰ºöË©±„ÄÇ„ÇÑ„ÄÅ„ÅÑ„ÇÑ„ÄÅ„ÅÜ„Çì„ÄÅ„Åô„Åî„ÅÑ„ÄÅÊ∞óÊåÅ„Å°„ÅÑ„ÅÑ„ÄÅ„ÅØ„ÅÑ„ÄÇ"
elif "Hardcore" in genre_context:
    final_prompt = base_prompt + "ÂëΩ‰ª§ÂΩ¢„ÄÅÊøÄ„Åó„ÅÑË®ÄËëâÈÅ£„ÅÑ„ÄÇ„ÇÑ„ÇÅ„Å¶„ÄÅ„Å†„ÇÅ„ÄÅË®±„Åó„Å¶„ÄÅ„Ç§„Åè„ÄÇ"
else:
    final_prompt = base_prompt + "Èñ¢Ë•øÂºÅ„ÄÅÂ§ßÈò™ÂºÅ„ÄÅÊñπË®Ä„ÄÇ„Åª„Çì„Åæ„ÄÅ„Å™„Çì„Åß„ÄÅ„Å°„ÇÉ„ÅÜ„ÄÅ„Åõ„ÇÑ„Å™„ÄÇ"

# ==========================================
# üöÄ MAIN SCRIPT
# ==========================================

def format_timedelta(seconds):
    """Convert a number of seconds into a ``datetime.timedelta``.

    Args:
        seconds: Offset in seconds (int or float).

    Returns:
        The equivalent ``datetime.timedelta``, as used for SRT start/end times.
    """
    # timedelta's positional order is (days, seconds, ...).
    return datetime.timedelta(0, seconds)

def cleanup_text(text):
    """Lower-case ``text`` and reduce it to space-separated word tokens.

    Every run of non-word characters (punctuation, whitespace, underscores) is
    replaced by a single space and the ends are trimmed. The separators used to
    be deleted outright, which glued all tokens together; the caller's
    ``.split()`` then saw at most one "word", so its per-word garbage filter
    and repeated-word filter could never trigger on multi-token segments.

    Args:
        text: Raw segment text.

    Returns:
        The normalised string; ``""`` when ``text`` has no word characters.
    """
    return re.sub(r'[\W_]+', ' ', text.lower()).strip()

# Preconditions: a model from Step 3.1 and an existing audio file.
if 'loaded_model' not in globals():
    print("‚ùå Error: Model not loaded! Please run Step 3.1 first.")
elif not os.path.exists(audio_file_to_transcribe):
    print(f"‚ùå Error: File '{audio_file_to_transcribe}' not found!")
    print("   ‚ÑπÔ∏è  Please check the filename you set in Step 2.")
else:
    print(f"\nüîπ Transcribing: {audio_file_to_transcribe}")
    print(f"   ‚öôÔ∏è  Silero VAD: {use_silero_vad}")
    print(f"   ‚öôÔ∏è  Internal Logic: {internal_sensitivity} (Thresh: {speech_threshold})")

    # 1. RUN WHISPER
    # All decoding options in one place; the thresholds and prompt come from
    # the parser section above.
    transcribe_options = dict(
        language="ja",
        # Accuracy
        beam_size=5,
        best_of=5,
        temperature=0.0,
        # Sync
        trust_whisper_timestamps=False,
        # VAD settings
        vad=use_silero_vad,
        no_speech_threshold=speech_threshold,
        # Guard rails
        logprob_threshold=logprob_cutoff,
        compression_ratio_threshold=2.2,
        # Context
        detect_disfluencies=True,
        condition_on_previous_text=False,
        initial_prompt=final_prompt,
    )

    try:
        result = whisper.transcribe(
            loaded_model,
            audio_file_to_transcribe,
            **transcribe_options
        )
        print("   ‚úÖ Transcription Complete. Filtering...")

    except Exception as e:
        print(f"   ‚ùå Transcription Error: {e}")
        if "out of memory" in str(e).lower():
            print("   ‚ö†Ô∏è OOM: Restart Runtime -> Step 2.5 -> Step 3.1 (Use v2)")
        raise

    # 2. FILTERS
    # A segment is dropped when its lower-cased text contains any of these substrings.
    hallucination_triggers = [
        "thank you for watching", "thanks for watching", "please subscribe",
        "subscribe", "sub by", "translated by", "amara", "viewing",
        "see you next", "bye", "the end", "like and", "follow me",
        "Â≠óÂπï", "Ë¶ñËÅ¥", "„ÉÅ„É£„É≥„Éç„É´", "ÁôªÈå≤", "È´òË©ï‰æ°"
    ]

    # A segment is dropped when every one of its words is in this set.
    garbage_exact_matches = {
        "aa", "ah", "ahh", "haa", "hah", "haha", "mm", "mmm", "hmm",
        "oh", "huh", "o", "m", "h", "eh", "uh", "uhh",
        "„ÅÇ", "„ÅÇ„ÅÇ", "„ÅÇ„Å£", "„ÅÇ„Éº", "„Çì„Çì", "„ÅÜ", "„ÅÜ„Å£",
        "„ÅØ„ÅÅ", "„ÅØ„ÅÇ", "„Åµ", "„Åµ„ÅÖ", "„Åè", "„Åè„Å£"
    }

    final_subs = []
    sub_index = 1

    for segment in result["segments"]:
        text = segment["text"].strip()
        text_lower = text.lower()

        # Segments shorter than 0.2 s are skipped.
        if segment["end"] - segment["start"] < 0.2:
            continue

        # Known hallucination phrases.
        if any(trigger in text_lower for trigger in hallucination_triggers):
            continue

        words = cleanup_text(text_lower).split()

        # Nothing left after cleanup, or nothing but filler sounds.
        if not words or all(word in garbage_exact_matches for word in words):
            continue

        # More than four words that are all the same word.
        if len(words) > 4 and len(set(words)) == 1:
            continue

        final_subs.append(
            srt.Subtitle(
                index=sub_index,
                start=format_timedelta(segment["start"]),
                end=format_timedelta(segment["end"]),
                content=text
            )
        )
        sub_index += 1

    # 3. SAVE & DOWNLOAD
    with open(output_srt, "w", encoding="utf-8") as srt_file:
        srt_file.write(srt.compose(final_subs))

    print(f"   üíæ Saved: {output_srt}")
    try:
        files.download(output_srt)
    except Exception as e:
        # The file is already on disk; only the browser download failed.
        print(f"   ‚ö†Ô∏è Manual Download Required: {e}")


üîπ Transcribing: 1234_mastered.wav
   ‚öôÔ∏è  Silero VAD: False
   ‚öôÔ∏è  Internal Logic: High Sensitivity (Plot/Whispers) (Thresh: 0.25)


 91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 173690/191690 [08:34<00:53, 337.77frames/s]


   ‚úÖ Transcription Complete. Filtering...
   üíæ Saved: 1234.srt


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# @title üßπ Step 3.3: Clear GPU Memory
import torch
import gc

print("üßπ Cleaning up GPU memory...")

# Drop the global reference to the Whisper model, if Step 3.1 created one.
if 'loaded_model' not in globals():
    print("   ‚ÑπÔ∏è No model was loaded.")
else:
    del globals()['loaded_model']
    print("   ‚úÖ Model deleted from memory.")

# Collect first so freed tensors can be returned by the cache flush that follows.
gc.collect()
torch.cuda.empty_cache()

print("‚úÖ Memory Cleared!")
print("üîÑ You can now run Step 2 again for a new file.")

üßπ Cleaning up GPU memory...
   ‚úÖ Model deleted from memory.
‚úÖ Memory Cleared!
üîÑ You can now run Step 2 again for a new file.
