In [None]:
!pip install coqui-tts

In [None]:
!pip install srt

In [None]:
!pip install deep_translator

In [None]:
from google.colab import files
files.upload()

In [None]:
!ls

In [6]:
# Scratch directory for Wav2Lip/inference.py (temp/temp.wav, temp/result.avi); -p keeps this cell re-runnable.
!mkdir -p temp

In [7]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [8]:
!git clone https://github.com/Rudrabha/Wav2Lip.git

Cloning into 'Wav2Lip'...
remote: Enumerating objects: 409, done.[K
remote: Counting objects: 100% (4/4), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 409 (delta 2), reused 0 (delta 0), pack-reused 405 (from 2)[K
Receiving objects: 100% (409/409), 549.28 KiB | 21.97 MiB/s, done.
Resolving deltas: 100% (227/227), done.


In [9]:
!cp drive/MyDrive/wav2lip_gan.pth Wav2Lip/checkpoints/

In [None]:
!python translate.py --video_file "Tanzania-2.mp4" --transcript_srt "input.srt" --output_dir "output" # --do_lipsync

In [None]:
from IPython.display import Video

video_path = "output/translated_video.mp4"
Video(video_path, embed=True, width=720, height=480)

In [None]:
!cd output && cat translated.srt

## Fix Wav2Lip

In [None]:
%%writefile Wav2Lip/inference.py
from os import listdir, path
import numpy as np
import scipy, cv2, os, sys, argparse, audio
import json, subprocess, random, string
from tqdm import tqdm
from glob import glob
import torch, face_detection
from models import Wav2Lip
import platform

def _parse_bool(value):
	"""Convert a command-line string to a bool.

	``type=bool`` treats every non-empty string (including 'False') as True.
	This converter maps the usual "false" spellings to False and keeps every
	other value behaving exactly like ``bool(value)`` did before.
	"""
	if isinstance(value, bool):
		return value
	text = str(value).strip()
	if text.lower() in ('false', 'f', '0', 'no', 'n', 'off'):
		return False
	return bool(text)

parser = argparse.ArgumentParser(description='Inference code to lip-sync videos in the wild using Wav2Lip models')

parser.add_argument('--checkpoint_path', type=str,
					help='Name of saved checkpoint to load weights from', required=True)

parser.add_argument('--face', type=str,
					help='Filepath of video/image that contains faces to use', required=True)
parser.add_argument('--audio', type=str,
					help='Filepath of video/audio file to use as raw audio source', required=True)
parser.add_argument('--outfile', type=str, help='Video path to save result. See default for an e.g.',
								default='results/result_voice.mp4')

parser.add_argument('--static', type=_parse_bool,
					help='If True, then use only first video frame for inference', default=False)
parser.add_argument('--fps', type=float, help='Can be specified only if input is a static image (default: 25)',
					default=25., required=False)

parser.add_argument('--pads', nargs='+', type=int, default=[0, 10, 0, 0],
					help='Padding (top, bottom, left, right). Please adjust to include chin at least')

parser.add_argument('--face_det_batch_size', type=int,
					help='Batch size for face detection', default=16)
parser.add_argument('--wav2lip_batch_size', type=int, help='Batch size for Wav2Lip model(s)', default=128)

parser.add_argument('--resize_factor', default=1, type=int,
			help='Reduce the resolution by this factor. Sometimes, best results are obtained at 480p or 720p')

parser.add_argument('--crop', nargs='+', type=int, default=[0, -1, 0, -1],
					help='Crop video to a smaller region (top, bottom, left, right). Applied after resize_factor and rotate arg. '
					'Useful if multiple face present. -1 implies the value will be auto-inferred based on height, width')

parser.add_argument('--box', nargs='+', type=int, default=[-1, -1, -1, -1],
					help='Specify a constant bounding box for the face. Use only as a last resort if the face is not detected.'
					'Also, might work only if the face is not moving around much. Syntax: (top, bottom, left, right).')

parser.add_argument('--rotate', default=False, action='store_true',
					help='Sometimes videos taken from a phone can be flipped 90deg. If true, will flip video right by 90deg.'
					'Use if you get a flipped result, despite feeding a normal looking video')

parser.add_argument('--nosmooth', default=False, action='store_true',
					help='Prevent smoothing face detections over a short temporal window')

args = parser.parse_args()
# Side length (pixels) that face crops are resized to before batching (see datagen).
args.img_size = 96

# A still image as --face implies static mode. splitext() inspects only the
# final extension, so dots elsewhere in the path (or no dot at all) are handled.
if os.path.isfile(args.face) and os.path.splitext(args.face)[1].lstrip('.') in ['jpg', 'png', 'jpeg']:
	args.static = True

def get_smoothened_boxes(boxes, T):
	"""Smooth ``boxes`` in place with a forward-looking mean over ``T`` entries.

	Entry ``i`` becomes the mean of ``boxes[i:i + T]``; once that window would
	run past the end, the last ``T`` entries are averaged instead. Because the
	array is updated in place, later windows see already-smoothed rows.
	Returns the same ``boxes`` object.
	"""
	count = len(boxes)
	for idx in range(count):
		runs_past_end = idx + T > count
		window = boxes[count - T:] if runs_past_end else boxes[idx:idx + T]
		boxes[idx] = np.mean(window, axis=0)
	return boxes

def face_detect(images):
	"""Detect one face per image and return padded crops with their coordinates.

	Detection runs in batches of ``args.face_det_batch_size``; on a RuntimeError
	the batch size is halved and detection restarts from the first image.
	Returns a list of ``[face_crop, (y1, y2, x1, x2)]`` entries, one per image.
	Raises ValueError (after saving the frame to temp/faulty_frame.jpg) when an
	image has no detection.
	"""
	detector = face_detection.FaceAlignment(face_detection.LandmarksType._2D,
											flip_input=False, device=device)

	batch_size = args.face_det_batch_size

	while True:
		predictions = []
		try:
			for start in tqdm(range(0, len(images), batch_size)):
				chunk = np.array(images[start:start + batch_size])
				predictions.extend(detector.get_detections_for_batch(chunk))
		except RuntimeError:
			if batch_size == 1:
				raise RuntimeError('Image too big to run face detection on GPU. Please use the --resize_factor argument')
			batch_size //= 2
			print('Recovering from OOM error; New batch size: {}'.format(batch_size))
		else:
			# Every batch went through without error.
			break

	# --pads is given as (top, bottom, left, right).
	pad_top, pad_bottom, pad_left, pad_right = args.pads
	padded = []
	for rect, image in zip(predictions, images):
		if rect is None:
			cv2.imwrite('temp/faulty_frame.jpg', image) # check this frame where the face was not detected.
			raise ValueError('Face not detected! Ensure the video contains a face in all the frames.')

		# rect is indexed as (x1, y1, x2, y2); clamp the padded box to the image bounds.
		top = max(0, rect[1] - pad_top)
		bottom = min(image.shape[0], rect[3] + pad_bottom)
		left = max(0, rect[0] - pad_left)
		right = min(image.shape[1], rect[2] + pad_right)

		padded.append([left, top, right, bottom])

	boxes = np.array(padded)
	if not args.nosmooth:
		boxes = get_smoothened_boxes(boxes, T=5)

	results = []
	for image, (x1, y1, x2, y2) in zip(images, boxes):
		results.append([image[y1: y2, x1:x2], (y1, y2, x1, x2)])

	del detector
	return results

def datagen(frames, mels):
	"""Yield model-ready batches pairing face crops with mel chunks.

	Each yielded item is ``(img_batch, mel_batch, frame_batch, coords_batch)``:
	``img_batch`` is the masked and unmasked face crops concatenated on the last
	axis and scaled by 1/255, ``mel_batch`` is the mel chunks with a trailing
	axis of size 1, ``frame_batch`` holds copies of the source frames and
	``coords_batch`` the ``(y1, y2, x1, x2)`` crop coordinates.
	Face boxes come from ``face_detect`` unless ``--box`` was given.
	"""
	def _finalize(faces, mel_chunks):
		# Turn the accumulated Python lists into the arrays the model consumes.
		faces, mel_chunks = np.asarray(faces), np.asarray(mel_chunks)

		# Zero axis 1 from img_size//2 onward in a copy of the faces
		# (presumably the lower half of each crop — assumes (N, H, W, C) layout).
		masked = faces.copy()
		masked[:, args.img_size//2:] = 0

		faces = np.concatenate((masked, faces), axis=3) / 255.
		mel_chunks = np.reshape(mel_chunks, [len(mel_chunks), mel_chunks.shape[1], mel_chunks.shape[2], 1])
		return faces, mel_chunks

	if args.box[0] != -1:
		print('Using the specified bounding box instead of face detection...')
		y1, y2, x1, x2 = args.box
		face_det_results = [[f[y1: y2, x1:x2], (y1, y2, x1, x2)] for f in frames]
	elif args.static:
		face_det_results = face_detect([frames[0]])
	else:
		face_det_results = face_detect(frames) # BGR2RGB for CNN face detection

	img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []

	for i, m in enumerate(mels):
		# Static mode always reuses frame 0; otherwise frames wrap around.
		idx = 0 if args.static else i%len(frames)
		face, coords = face_det_results[idx].copy()

		img_batch.append(cv2.resize(face, (args.img_size, args.img_size)))
		mel_batch.append(m)
		frame_batch.append(frames[idx].copy())
		coords_batch.append(coords)

		if len(img_batch) >= args.wav2lip_batch_size:
			img_arr, mel_arr = _finalize(img_batch, mel_batch)
			yield img_arr, mel_arr, frame_batch, coords_batch
			img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []

	# Flush the final, possibly smaller, batch.
	if len(img_batch) > 0:
		img_arr, mel_arr = _finalize(img_batch, mel_batch)
		yield img_arr, mel_arr, frame_batch, coords_batch

# Number of mel-spectrogram frames in each chunk that main() slices out.
mel_step_size = 16
# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using {} for inference.'.format(device))

def _load(checkpoint_path):
	"""Load a checkpoint file with ``torch.load`` for the active ``device``.

	``weights_only=False`` is passed on both the CUDA and CPU paths; without it
	the CPU path fails on PyTorch releases whose default is ``weights_only=True``.
	On CPU, ``map_location`` keeps every storage on the CPU.

	NOTE(security): ``weights_only=False`` unpickles arbitrary Python objects.
	Only load checkpoints from a source you trust.
	"""
	if device == 'cuda':
		return torch.load(checkpoint_path, weights_only=False)
	return torch.load(checkpoint_path,
						map_location=lambda storage, loc: storage,
						weights_only=False)

def load_model(path):
	"""Build a Wav2Lip model, load weights from ``path`` and return it in eval mode.

	The checkpoint's ``'state_dict'`` keys have every ``'module.'`` substring
	removed before loading (presumably a DataParallel prefix — TODO confirm).
	The model is moved to the module-level ``device``.
	"""
	model = Wav2Lip()
	print("Load checkpoint from: {}".format(path))
	state = _load(path)['state_dict']
	cleaned = {key.replace('module.', ''): value for key, value in state.items()}
	model.load_state_dict(cleaned)
	return model.to(device).eval()

def _run_ffmpeg(cmd, what):
	"""Run an ffmpeg argument list without a shell and raise if it fails."""
	status = subprocess.call(cmd)
	if status != 0:
		raise RuntimeError('ffmpeg failed while {} (exit code {})'.format(what, status))

def main():
	"""Lip-sync ``args.face`` to ``args.audio`` and write the result to ``args.outfile``.

	Steps: read the image/video frames, extract a wav from the audio source if
	needed, cut the mel spectrogram into per-frame chunks, run the Wav2Lip model
	batch by batch, paste each prediction back into its frame, write the frames
	to temp/result.avi and finally mux that file with the audio using ffmpeg.

	Raises:
		ValueError: bad --face path, unreadable input, invalid FPS, or NaNs in the mel.
		RuntimeError: an ffmpeg invocation exited with a non-zero status.
	"""
	if not os.path.isfile(args.face):
		raise ValueError('--face argument must be a valid path to video/image file')

	# Scratch outputs (temp/temp.wav, temp/result.avi) are written under ./temp.
	os.makedirs('temp', exist_ok=True)

	# splitext() looks only at the final extension, so other dots in the path are harmless.
	if os.path.splitext(args.face)[1].lstrip('.') in ['jpg', 'png', 'jpeg']:
		full_frames = [cv2.imread(args.face)]
		fps = args.fps

	else:
		video_stream = cv2.VideoCapture(args.face)
		fps = video_stream.get(cv2.CAP_PROP_FPS)

		print('Reading video frames...')

		full_frames = []
		while 1:
			still_reading, frame = video_stream.read()
			if not still_reading:
				video_stream.release()
				break
			if args.resize_factor > 1:
				frame = cv2.resize(frame, (frame.shape[1]//args.resize_factor, frame.shape[0]//args.resize_factor))

			if args.rotate:
				# cv2.cv2 is not available in current OpenCV builds; use the top-level constant.
				frame = cv2.rotate(frame, cv2.ROTATE_90_CLOCKWISE)

			y1, y2, x1, x2 = args.crop
			if x2 == -1: x2 = frame.shape[1]
			if y2 == -1: y2 = frame.shape[0]

			frame = frame[y1:y2, x1:x2]

			full_frames.append(frame)

	print ("Number of frames available for inference: "+str(len(full_frames)))

	# Fail early with a clear message instead of a ZeroDivisionError / NameError later on.
	if not full_frames or full_frames[0] is None:
		raise ValueError('No frames could be read from --face input: {}'.format(args.face))
	if not fps or fps <= 0:
		raise ValueError('Could not determine a valid FPS for --face input: {}'.format(args.face))

	if not args.audio.endswith('.wav'):
		print('Extracting raw audio...')
		# Argument list (no shell) so paths containing spaces or shell characters work.
		_run_ffmpeg(['ffmpeg', '-y', '-i', args.audio, '-strict', '-2', 'temp/temp.wav'],
					'extracting raw audio')
		args.audio = 'temp/temp.wav'

	wav = audio.load_wav(args.audio, 16000)
	mel = audio.melspectrogram(wav)
	print(mel.shape)

	if np.isnan(mel.reshape(-1)).sum() > 0:
		raise ValueError('Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again')

	# One mel chunk of mel_step_size frames per video frame; the multiplier
	# converts a frame index into a mel-frame offset.
	mel_chunks = []
	mel_idx_multiplier = 80./fps
	i = 0
	while 1:
		start_idx = int(i * mel_idx_multiplier)
		if start_idx + mel_step_size > len(mel[0]):
			# Final chunk: take the last mel_step_size frames and stop.
			mel_chunks.append(mel[:, len(mel[0]) - mel_step_size:])
			break
		mel_chunks.append(mel[:, start_idx : start_idx + mel_step_size])
		i += 1

	print("Length of mel chunks: {}".format(len(mel_chunks)))

	full_frames = full_frames[:len(mel_chunks)]

	batch_size = args.wav2lip_batch_size
	gen = datagen(full_frames.copy(), mel_chunks)

	out = None
	for i, (img_batch, mel_batch, frames, coords) in enumerate(tqdm(gen,
											total=int(np.ceil(float(len(mel_chunks))/batch_size)))):
		if i == 0:
			# Loaded lazily, after the generator has finished face detection.
			model = load_model(args.checkpoint_path)
			print ("Model loaded")

			frame_h, frame_w = full_frames[0].shape[:-1]
			out = cv2.VideoWriter('temp/result.avi',
									cv2.VideoWriter_fourcc(*'DIVX'), fps, (frame_w, frame_h))

		img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device)
		mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device)

		with torch.no_grad():
			pred = model(mel_batch, img_batch)

		pred = pred.cpu().numpy().transpose(0, 2, 3, 1) * 255.

		for p, f, c in zip(pred, frames, coords):
			y1, y2, x1, x2 = c
			p = cv2.resize(p.astype(np.uint8), (x2 - x1, y2 - y1))

			# Paste the predicted face region back into the full frame.
			f[y1:y2, x1:x2] = p
			out.write(f)

	if out is not None:
		out.release()

	out_dir = os.path.dirname(args.outfile)
	if out_dir:
		os.makedirs(out_dir, exist_ok=True)

	_run_ffmpeg(['ffmpeg', '-y', '-i', args.audio, '-i', 'temp/result.avi',
				 '-strict', '-2', '-q:v', '1', args.outfile],
				'muxing audio and video')

if __name__ == '__main__':
	main()

In [None]:
%%writefile Wav2Lip/audio.py
import librosa
import librosa.filters
import numpy as np
# import tensorflow as tf
from scipy import signal
from scipy.io import wavfile
from hparams import hparams as hp

def load_wav(path, sr):
    """Load the audio file at ``path`` via librosa at sample rate ``sr``; return only the samples."""
    samples, _ = librosa.core.load(path, sr=sr)
    return samples

def save_wav(wav, path, sr):
    """Peak-normalize ``wav`` to the int16 range and write it to ``path`` at rate ``sr``.

    The scale factor is ``32767 / max(0.01, peak)``, so near-silent input is not
    amplified without bound. The caller's array is left untouched (scaling is
    done on a new array), and empty input is written as an empty file.
    """
    samples = np.asarray(wav)
    peak = np.max(np.abs(samples)) if samples.size else 0.0
    scaled = samples * (32767 / max(0.01, peak))
    #proposed by @dsmiller
    wavfile.write(path, sr, scaled.astype(np.int16))

def save_wavenet_wav(wav, path, sr):
    """Write ``wav`` to ``path`` at sample rate ``sr`` without rescaling.

    ``librosa.output.write_wav`` was removed in librosa 0.8, so the samples are
    written directly with ``scipy.io.wavfile``. A ``(2, n)`` array is transposed
    to the ``(n, 2)`` layout that ``wavfile.write`` expects for stereo.
    """
    samples = np.asarray(wav)
    if samples.ndim > 1 and samples.shape[0] == 2:
        samples = samples.T
    wavfile.write(path, sr, samples)

def preemphasis(wav, k, preemphasize=True):
    """Apply the FIR filter ``y[n] = x[n] - k * x[n-1]``; return ``wav`` unchanged when disabled."""
    if not preemphasize:
        return wav
    return signal.lfilter([1, -k], [1], wav)

def inv_preemphasis(wav, k, inv_preemphasize=True):
    """Apply the IIR filter ``y[n] = x[n] + k * y[n-1]``; return ``wav`` unchanged when disabled."""
    if not inv_preemphasize:
        return wav
    return signal.lfilter([1], [1, -k], wav)

def get_hop_size():
    """Return ``hp.hop_size``, or derive it from ``hp.frame_shift_ms`` and ``hp.sample_rate`` when unset."""
    if hp.hop_size is not None:
        return hp.hop_size
    assert hp.frame_shift_ms is not None
    return int(hp.frame_shift_ms / 1000 * hp.sample_rate)

def linearspectrogram(wav):
    """Compute the dB-scaled linear-frequency spectrogram of ``wav``.

    Pipeline: pre-emphasis, STFT magnitude, amplitude-to-dB, subtract
    ``hp.ref_level_db``, then normalize when ``hp.signal_normalization`` is set.
    """
    emphasized = preemphasis(wav, hp.preemphasis, hp.preemphasize)
    magnitudes = np.abs(_stft(emphasized))
    S = _amp_to_db(magnitudes) - hp.ref_level_db
    return _normalize(S) if hp.signal_normalization else S

def melspectrogram(wav):
    """Compute the dB-scaled mel spectrogram of ``wav``.

    Pipeline: pre-emphasis, STFT magnitude, projection onto the mel basis,
    amplitude-to-dB, subtract ``hp.ref_level_db``, then normalize when
    ``hp.signal_normalization`` is set.
    """
    emphasized = preemphasis(wav, hp.preemphasis, hp.preemphasize)
    mel_magnitudes = _linear_to_mel(np.abs(_stft(emphasized)))
    S = _amp_to_db(mel_magnitudes) - hp.ref_level_db
    return _normalize(S) if hp.signal_normalization else S

def _lws_processor():
    """Build an ``lws`` processor from the hparams; ``lws`` is imported only when this is called."""
    import lws
    hop = get_hop_size()
    return lws.lws(hp.n_fft, hop, fftsize=hp.win_size, mode="speech")

def _stft(y):
    """Return the STFT of ``y`` using lws when ``hp.use_lws`` is set, otherwise librosa.

    The lws result is transposed before it is returned.
    """
    if hp.use_lws:
        # _lws_processor() takes no arguments; passing ``hp`` raised TypeError.
        return _lws_processor().stft(y).T
    return librosa.stft(y=y, n_fft=hp.n_fft, hop_length=get_hop_size(), win_length=hp.win_size)

##########################################################
#Those are only correct when using lws!!! (This was messing with Wavenet quality for a long time!)
def num_frames(length, fsize, fshift):
    """Compute number of time frames of spectrogram

    ``length`` is padded by ``fsize - fshift`` on both sides; one extra frame is
    counted when ``length`` is not a multiple of ``fshift``.
    """
    pad = fsize - fshift
    whole_steps = (length + pad * 2 - fsize) // fshift
    extra = 1 if length % fshift == 0 else 2
    return whole_steps + extra


def pad_lr(x, fsize, fshift):
    """Compute left and right padding

    Returns ``(left, right)``: the left pad is ``fsize - fshift`` and the right
    pad adds whatever is needed for ``num_frames`` frames to cover the signal.
    """
    n = len(x)
    pad = fsize - fshift
    frames = num_frames(n, fsize, fshift)
    right_extra = (frames - 1) * fshift + fsize - (n + 2 * pad)
    return pad, pad + right_extra
##########################################################
#Librosa correct padding
def librosa_pad_lr(x, fsize, fshift):
    """Return ``(0, right)`` where ``right`` pads ``x`` up to the next multiple of ``fshift`` past its length.

    ``fsize`` is accepted but not used.
    """
    n = x.shape[0]
    padded_len = (n // fshift + 1) * fshift
    return 0, padded_len - n

# Conversions
# Mel filterbank, built on first use by _linear_to_mel and reused afterwards.
_mel_basis = None

def _linear_to_mel(spectogram):
    """Project a linear-frequency spectrogram onto the mel basis via ``np.dot``."""
    global _mel_basis
    basis = _mel_basis
    if basis is None:
        basis = _mel_basis = _build_mel_basis()
    return np.dot(basis, spectogram)

def _build_mel_basis():
    """Build the mel filterbank from the hparams; ``hp.fmax`` must not exceed half the sample rate."""
    nyquist = hp.sample_rate // 2
    assert hp.fmax <= nyquist
    return librosa.filters.mel(
        sr=hp.sample_rate,
        n_fft=hp.n_fft,
        n_mels=hp.num_mels,
        fmin=hp.fmin,
        fmax=hp.fmax,
    )

def _amp_to_db(x):
    """Convert amplitudes to decibels, flooring them at the amplitude equivalent of ``hp.min_level_db``."""
    floor = np.exp(hp.min_level_db / 20 * np.log(10))
    floored = np.maximum(floor, x)
    return 20 * np.log10(floored)

def _db_to_amp(x):
    return np.power(10.0, (x) * 0.05)

def _normalize(S):
    """Rescale a dB spectrogram using ``hp.min_level_db`` and ``hp.max_abs_value``.

    ``(S - min_level_db) / (-min_level_db)`` is scaled to
    ``[-max_abs_value, max_abs_value]`` when ``hp.symmetric_mels`` is set, and to
    ``[0, max_abs_value]`` otherwise. With ``hp.allow_clipping_in_normalization``
    the result is clipped to that range; without it, the input is asserted to lie
    within ``[min_level_db, 0]`` first.
    """
    def _rescale():
        ratio = (S - hp.min_level_db) / (-hp.min_level_db)
        if hp.symmetric_mels:
            return (2 * hp.max_abs_value) * ratio - hp.max_abs_value
        return hp.max_abs_value * ratio

    if hp.allow_clipping_in_normalization:
        lower = -hp.max_abs_value if hp.symmetric_mels else 0
        return np.clip(_rescale(), lower, hp.max_abs_value)

    assert S.max() <= 0 and S.min() - hp.min_level_db >= 0
    return _rescale()

def _denormalize(D):
    """Invert ``_normalize``: map normalized values back to the dB scale.

    With ``hp.allow_clipping_in_normalization`` the input is first clipped to the
    normalized range (symmetric or ``[0, max_abs_value]``, per ``hp.symmetric_mels``).
    """
    max_abs = hp.max_abs_value
    floor_db = hp.min_level_db

    if hp.allow_clipping_in_normalization:
        lower = -max_abs if hp.symmetric_mels else 0
        D = np.clip(D, lower, max_abs)

    if hp.symmetric_mels:
        return (((D + max_abs) * -floor_db / (2 * max_abs)) + floor_db)
    return ((D * -floor_db / max_abs) + floor_db)

## Test SRT

In [None]:
%%writefile input.srt
1
00:00:00,000 --> 00:00:04,436
Tanzania—home to some of the most breathtaking wildlife on Earth.

2
00:00:04,436 --> 00:00:13,308
Here, in the heart of East Africa, the great Serengeti National Park
hosts one of nature’s greatest spectacles: the Great Migration.

3
00:00:13,308 --> 00:00:22,417
Over a million wildebeest, zebras, and gazelles travel vast distances
in search of fresh grass, braving rivers filled with crocodiles.

4
00:00:22,417 --> 00:00:24,728
But predators are never far behind.

5
00:00:24,728 --> 00:00:29,813
Lions, the kings of the savanna, stalk their prey
with patience and precision.

6
00:00:29,813 --> 00:00:35,900
Cheetahs, the fastest land animals, chase down their targets in a
thrilling display of speed.

7
00:00:35,900 --> 00:00:39,824
In the lush Tarangire National Park, giant elephants roam freely.

8
00:00:39,824 --> 00:00:45,360
These intelligent creatures form strong family bonds, protecting
their young from threats.

9
00:00:45,360 --> 00:00:52,260
And in the ancient Ngorongoro Crater, the endangered black rhino
finds refuge, a rare sight in the wild.

10
00:00:52,260 --> 00:01:00,727
Tanzania’s wildlife is a treasure like no other—a delicate balance of
nature that reminds us of the beauty and power of the wild.

## Main Script

In [None]:
%%writefile translate.py
import os
# Environment configuration. These assignments sit above the third-party imports
# below — presumably so the libraries read them at import time (TODO confirm).
# setdefault() keeps any value the caller has already exported.
os.environ.setdefault("XDG_RUNTIME_DIR", "/tmp")
# Logging knobs for TensorFlow / absl / glog / gRPC and Python warnings,
# apparently set to reduce console noise.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
os.environ["ABSL_LOGGING_MIN_LOG_LEVEL"] = "3"
os.environ["GLOG_minloglevel"] = "3"
os.environ["GRPC_VERBOSITY"] = "ERROR"
os.environ["GRPC_TRACE"] = ""
os.environ["PYTHONWARNINGS"] = "ignore"
# Audio device settings — presumably for headless machines without a sound card.
os.environ.setdefault("SDL_AUDIODRIVER", "dummy")
os.environ.setdefault("AUDIODEV", "null")

# NOTE(review): COQUI_TOS_AGREED looks like it pre-accepts the Coqui model
# terms-of-service prompt — confirm this is intended before sharing the notebook.
os.environ["COQUI_TOS_AGREED"] = "1"
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
os.environ["TOKENIZERS_PARALLELISM"] = "false"


import argparse
import subprocess
import srt
from datetime import timedelta

import numpy as np
import librosa
import soundfile as sf
import moviepy.editor as mpy
import torch
from deep_translator import GoogleTranslator
from TTS.api import TTS
import tempfile



def parse_arguments():
    """Parse the command-line options for the translation pipeline.

    ``--do_lipsync`` works as a plain flag (``--do_lipsync``) and still accepts
    an explicit value (``--do_lipsync 1`` / ``--do_lipsync false``). Previously
    the bare flag failed with "expected one argument".

    Returns:
        argparse.Namespace with ``video_file``, ``transcript_srt``,
        ``output_dir``, ``do_lipsync``, ``min_gap`` and ``trim_db``.
    """
    def _flag(value):
        # Explicit "false" spellings disable the option; any other value enables it.
        return str(value).strip().lower() not in ("", "0", "false", "f", "no", "n", "off")

    p = argparse.ArgumentParser()
    p.add_argument("--video_file", required=True,
                   help="Path to the source video")
    p.add_argument("--transcript_srt", required=True,
                   help="Path to the English SRT transcript")
    p.add_argument("--output_dir", default="output",
                   help="Directory for all generated files")
    p.add_argument("--do_lipsync", nargs="?", const=True, default=False, type=_flag,
                   help="Run Wav2Lip on the translated video")
    p.add_argument("--min_gap", type=float, default=0.12,
                   help="Minimum silence in seconds between consecutive lines")
    p.add_argument("--trim_db", type=float, default=35.0,
                   help="top_db threshold used when trimming silence from each line")
    return p.parse_args()


def translate_en_to_de(text: str) -> str:
    """Translate English ``text`` to German with deep_translator's GoogleTranslator."""
    translator = GoogleTranslator(source="en", target="de")
    return translator.translate(text)


def trim_silence(y: np.ndarray, sr: int, top_db: float = 35.0) -> np.ndarray:
    """Trim leading and trailing silence from ``y`` with ``librosa.effects.trim``.

    Empty input is returned as-is, and so is the original signal when trimming
    would leave nothing. ``sr`` is accepted but not used.
    """
    if y.size == 0:
        return y
    trimmed = librosa.effects.trim(y, top_db=top_db)[0]
    if trimmed.size > 0:
        return trimmed
    return y


def synthesize_line(tts, text_de: str, speaker_wav: str, language: str = "de") -> tuple[np.ndarray, int]:
    """
    Synthesize one line of speech and return (mono float32 waveform, sample_rate).

    The audio is rendered by ``tts.tts_to_file`` into a temporary wav file, read
    back, and downmixed to mono if needed. Blank text is never sent to the TTS
    engine. If synthesis or reading fails, a warning is printed and 0.25 s of
    silence at 24 kHz is returned so the remaining lines can still be processed.
    """
    fallback_sr = 24000

    def _silence():
        # Short silence at 24 kHz
        return np.zeros(int(fallback_sr * 0.25), dtype=np.float32), fallback_sr

    if not text_de or not text_de.strip():
        return _silence()

    # Create temp wav path; the handle is closed so the TTS engine can write to it.
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp_path = tmp.name
    tmp.close()
    try:
        # Write TTS directly to file
        tts.tts_to_file(text=text_de, speaker_wav=speaker_wav, language=language, file_path=tmp_path)
        y, sr = sf.read(tmp_path, dtype="float32")
        # Downmix to mono if needed
        if y.ndim == 2:
            y = y.mean(axis=1)
        return y.astype(np.float32), int(sr)
    except Exception as exc:
        # Best-effort fallback, but report the failure instead of hiding it.
        print(f"Warning: TTS failed for line {text_de!r} ({exc}); substituting silence.")
        return _silence()
    finally:
        try:
            os.remove(tmp_path)
        except OSError:
            pass

def ensure_length(y: np.ndarray, sr: int, min_seconds: float = 0.25) -> np.ndarray:
    """Zero-pad ``y`` at the end so it lasts at least ``min_seconds`` at rate ``sr``.

    Input that is already long enough is returned unchanged.
    """
    shortfall = int(sr * min_seconds) - y.size
    if shortfall <= 0:
        return y
    return np.concatenate([y, np.zeros(shortfall, dtype=y.dtype)])


def build_german_audio_and_timeline(subs, tts, speaker_wav: str, min_gap: float, trim_db: float):
    """
    Translate every subtitle to German, synthesize it, and lay the lines out
    back to back with silences between them.

    For each subtitle:
      - Translate to German
      - TTS to get German speech audio, trimmed and padded to at least 0.25 s
    The silence after each line is the original gap to the next subtitle, but
    never shorter than ``min_gap``; no silence follows the last line.

    Returns:
      - y_full: concatenated German audio (float32 np.ndarray)
      - sr: sample rate of y_full (that of the first synthesized line;
        22050 when there are no subtitles)
      - timeline: list of (start_sec, end_sec, text_de) before time-stretch
      - total_dur: duration of y_full in seconds
    """
    # Precompute original gaps
    last_index = len(subs) - 1
    gaps = []
    for i, sub in enumerate(subs):
        if i < last_index:
            gap = (subs[i + 1].start - sub.end).total_seconds()
            gaps.append(max(min_gap, gap, 0.0))
        else:
            gaps.append(0.0)  # no gap after last line

    # Translate and synthesize each line
    german_lines = []
    segments = []
    segments_len = []
    target_sr = None
    for sub in subs:
        en = sub.content.strip()
        # Guard against a translator that returns None for some inputs.
        de = (translate_en_to_de(en) if en else "") or ""
        audio, sr = synthesize_line(tts, de, speaker_wav, language="de")
        if target_sr is None:
            # The first line fixes the sample rate for the whole track.
            target_sr = sr
        elif sr != target_sr:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
            sr = target_sr
        audio = trim_silence(audio, sr, top_db=trim_db)
        audio = ensure_length(audio, sr, min_seconds=0.25)

        segments.append(audio)
        segments_len.append(librosa.get_duration(y=audio, sr=sr))
        german_lines.append(de)

    # Concatenate with original gaps
    sr = target_sr or 22050
    concat_audio = []
    timeline = []  # (start_sec, end_sec, text_de) before stretch
    cursor = 0.0

    for seg, seg_len, text_de, gap_sec in zip(segments, segments_len, german_lines, gaps):
        # Add speech
        concat_audio.append(seg)
        end = cursor + seg_len
        timeline.append((cursor, end, text_de))
        cursor = end

        # Add the gap after the line. Advance the cursor by the number of samples
        # actually written so the timeline cannot drift away from the audio.
        gap_samples = int(round(sr * gap_sec))
        if gap_samples > 0:
            concat_audio.append(np.zeros(gap_samples, dtype=np.float32))
            cursor += gap_samples / sr

    y_full = np.concatenate(concat_audio) if len(concat_audio) else np.zeros(0, dtype=np.float32)
    total_dur = librosa.get_duration(y=y_full, sr=sr) if y_full.size > 0 else 0.0
    return y_full, sr, timeline, total_dur


def scale_timeline(timeline, rate: float):
    """Divide every start/end time in ``timeline`` by ``rate``; a rate of 0 leaves times unscaled.

    Returns a new list of ``(start, end, text)`` tuples.
    """
    factor = 1.0 if rate == 0 else 1.0 / rate
    return [(start * factor, end * factor, text) for start, end, text in timeline]


def write_srt_from_timeline(timeline_scaled, out_path: str):
    """Write ``(start_s, end_s, text)`` entries to ``out_path`` as a UTF-8 SRT file.

    Entries are numbered from 1. Start times are clamped to be non-negative and
    every subtitle stays visible for at least 0.3 seconds.
    """
    def _to_subtitle(idx, start_s, end_s, text):
        # Ensure non-negative and minimum 0.3s visibility
        start_s = max(0.0, start_s)
        earliest_end = start_s + 0.3
        if end_s <= earliest_end:
            end_s = earliest_end
        return srt.Subtitle(
            index=idx,
            start=timedelta(seconds=start_s),
            end=timedelta(seconds=end_s),
            content=text or ""
        )

    items = [
        _to_subtitle(idx, start_s, end_s, text)
        for idx, (start_s, end_s, text) in enumerate(timeline_scaled, start=1)
    ]
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(srt.compose(items))


def main():
    """Run the full pipeline: translate, synthesize, retime, mux, and optionally lip-sync.

    Files written to ``--output_dir``: original_audio.wav,
    translated_audio_raw.wav, translated_audio.wav, translated.srt,
    translated_video.mp4 and, with ``--do_lipsync``, lipsync_video.mp4.

    Raises:
        ValueError: the video has no audio track, or the SRT has no subtitles.
    """
    import sys  # local import: only needed to launch Wav2Lip with the running interpreter

    args = parse_arguments()
    os.makedirs(args.output_dir, exist_ok=True)

    # Load video and audio
    video = mpy.VideoFileClip(args.video_file)
    new_audio = None
    try:
        if video.audio is None:
            raise ValueError(f"Video has no audio track to use as the speaker reference: {args.video_file}")
        original_duration = video.duration
        original_audio_path = os.path.join(args.output_dir, "original_audio.wav")
        video.audio.write_audiofile(original_audio_path, verbose=False, logger=None)

        # Read English SRT
        with open(args.transcript_srt, "r", encoding="utf-8") as f:
            subs = list(srt.parse(f.read()))
        if not subs:
            raise ValueError(f"No subtitles found in: {args.transcript_srt}")

        device = "cuda" if torch.cuda.is_available() else "cpu"
        tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

        # German audio and timeline; the original audio track is the speaker reference.
        y_de, sr, timeline, dur_de = build_german_audio_and_timeline(
            subs=subs,
            tts=tts,
            speaker_wav=original_audio_path,
            min_gap=args.min_gap,
            trim_db=args.trim_db
        )

        translated_audio_path_raw = os.path.join(args.output_dir, "translated_audio_raw.wav")
        sf.write(translated_audio_path_raw, y_de, sr)
        print(f"Raw German audio: {translated_audio_path_raw} (duration: {dur_de:.2f}s)")

        # Uniformly adjust audio speed to match original video duration.
        # rate > 1 shortens the audio, rate < 1 lengthens it.
        if dur_de > 0 and original_duration > 0:
            rate = dur_de / float(original_duration)
        else:
            rate = 1.0

        if rate != 1.0:
            y_adj = librosa.effects.time_stretch(y_de, rate=rate)
        else:
            y_adj = y_de

        adjusted_audio_path = os.path.join(args.output_dir, "translated_audio.wav")
        sf.write(adjusted_audio_path, y_adj, sr)
        print(f"Adjusted German audio: {adjusted_audio_path}")

        # Scale German timeline so SRT matches altered audio
        timeline_scaled = scale_timeline(timeline, rate=rate)

        # Write German SRT
        translated_srt_path = os.path.join(args.output_dir, "translated.srt")
        write_srt_from_timeline(timeline_scaled, translated_srt_path)
        print(f"German-aligned SRT written: {translated_srt_path}")

        # Final video with the adjusted German audio
        new_audio = mpy.AudioFileClip(adjusted_audio_path)
        new_video = video.set_audio(new_audio)
        translated_video_path = os.path.join(args.output_dir, "translated_video.mp4")
        new_video.write_videofile(
            translated_video_path,
            codec="libx264",
            audio_codec="aac",
            temp_audiofile=os.path.join(args.output_dir, "temp-audio.m4a"),
            remove_temp=True,
            verbose=False,
            logger=None
        )
        print(f"Final translated video: {translated_video_path}")
    finally:
        # Release the readers held by the clips, even on failure.
        if new_audio is not None:
            new_audio.close()
        video.close()

    # Wav2Lip lipsync
    if args.do_lipsync:
        wav2lip_dir = "Wav2Lip"
        checkpoint_path = os.path.join(wav2lip_dir, "checkpoints", "wav2lip_gan.pth")
        lipsync_video_path = os.path.join(args.output_dir, "lipsync_video.mp4")
        cmd = [
            sys.executable, os.path.join(wav2lip_dir, "inference.py"),
            "--checkpoint_path", checkpoint_path,
            "--face", translated_video_path,
            "--audio", adjusted_audio_path,
            "--outfile", lipsync_video_path
        ]
        result = subprocess.run(cmd, check=False)
        # Only claim success when Wav2Lip actually exited cleanly.
        if result.returncode == 0:
            print(f"Lip-synced video: {lipsync_video_path}")
        else:
            print(f"Lip-sync failed (exit code {result.returncode}); see the Wav2Lip output above.")


if __name__ == "__main__":
    main()