<a href="https://colab.research.google.com/github/karthik18-lgtm/Lip_sync_models/blob/main/Wav2Lip.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Cell 1: FINAL DEPENDENCY PINNING FIX (Aggressive Downgrade)

print("--- 1. SETTING UP ENVIRONMENT ---")
# Pinning Gradio and associated dependencies to a known stable older combination
!pip install gradio==4.19.1 fastapi uvicorn==0.27.0 anyio==3.7.1 python-multipart requests numpy Pillow

# 2. Install system tool: ffmpeg
!apt-get install -y ffmpeg

# 3. Clone the Wav2Lip repository
print("Cloning Wav2Lip repository...")
!git clone https://github.com/Rudrabha/Wav2Lip.git

# 4. FIX THE OPENCV DEPENDENCY AND INSTALL REQUIREMENTS
print("Fixing Wav2Lip requirements.txt for modern Python compatibility...")
!sed -i 's/opencv-python==4.1.0.25/opencv-python==4.7.0.72/' Wav2Lip/requirements.txt
!sed -i '/dlib/d' Wav2Lip/requirements.txt
!cd Wav2Lip && pip install -r requirements.txt

--- 1. SETTING UP ENVIRONMENT ---
Collecting gradio==4.19.1
  Downloading gradio-4.19.1-py3-none-any.whl.metadata (15 kB)
Collecting uvicorn==0.27.0
  Downloading uvicorn-0.27.0-py3-none-any.whl.metadata (6.4 kB)
Collecting anyio==3.7.1
  Downloading anyio-3.7.1-py3-none-any.whl.metadata (4.7 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio==4.19.1)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting gradio-client==0.10.0 (from gradio==4.19.1)
  Downloading gradio_client-0.10.0-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio==4.19.1)
  Downloading MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting numpy
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting Pillow
  Downloading pillow-10.4.0-cp312-cp312-man

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
^C
Cloning Wav2Lip repository...
Cloning into 'Wav2Lip'...
^C
Fixing Wav2Lip requirements.txt for modern Python compatibility...
sed: can't read Wav2Lip/requirements.txt: No such file or directory
sed: can't read Wav2Lip/requirements.txt: No such file or directory
/bin/bash: line 1: cd: Wav2Lip: No such file or directory


In [1]:
# Download Model Weights from known stable sources

# Create necessary directories
!mkdir -p Wav2Lip/checkpoints
!mkdir -p Wav2Lip/face_detection/detection/sfd
!mkdir -p output_videos

# 1. Download s3fd.pth (Face Detection Model) from stable server
print("Downloading s3fd.pth (Face Detection Model)...")
!wget 'https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth' -O Wav2Lip/face_detection/detection/sfd/s3fd.pth

# 2. Download wav2lip_gan.pth (Main Lip-Sync Model) from a working Google Drive ID
# This ID has been verified to be stable in many community notebooks.
print("Downloading wav2lip_gan.pth (Main Lip-Sync Model)...")
!gdown --id 1fQtBSYEyuai9MjBOF8j7zZ4oQ9W2N64q --output Wav2Lip/checkpoints/wav2lip_gan.pth

print("All checkpoints downloaded successfully. Proceed to Cell 3.")

Downloading s3fd.pth (Face Detection Model)...
--2025-10-20 09:43:08--  https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth
Resolving www.adrianbulat.com (www.adrianbulat.com)... 45.136.29.207
Connecting to www.adrianbulat.com (www.adrianbulat.com)|45.136.29.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 89843225 (86M) [application/octet-stream]
Saving to: ‘Wav2Lip/face_detection/detection/sfd/s3fd.pth’


2025-10-20 09:43:16 (12.4 MB/s) - ‘Wav2Lip/face_detection/detection/sfd/s3fd.pth’ saved [89843225/89843225]

Downloading wav2lip_gan.pth (Main Lip-Sync Model)...
Downloading...
From (original): https://drive.google.com/uc?id=1fQtBSYEyuai9MjBOF8j7zZ4oQ9W2N64q
From (redirected): https://drive.google.com/uc?id=1fQtBSYEyuai9MjBOF8j7zZ4oQ9W2N64q&confirm=t&uuid=0c1456ca-9a75-41e2-9d7a-1af6a2530ba4
To: /content/Wav2Lip/checkpoints/wav2lip_gan.pth
100% 436M/436M [00:08<00:00, 49.0MB/s]
All checkpoints downloaded successfully. Proceed to Cell 3.


In [2]:
# FINAL SIMPLIFIED WAV2LIP LOGIC

import gradio as gr
import os
import shutil
import subprocess
import sys
import torch
from PIL import Image

# 1. Add Wav2Lip directory to Python path for its internal imports.
#    Guarded so that re-running this cell does not append duplicate entries.
if 'Wav2Lip' not in sys.path:
    sys.path.append('Wav2Lip')


def strip_module_prefix(state_dict):
    """Return a copy of `state_dict` with a leading 'module.' removed from each key.

    Only a prefix at the very start of a key is stripped; a 'module.' that
    occurs later inside a key is left alone (a blanket `str.replace` would
    remove those occurrences too and silently rename the parameter).

    Args:
        state_dict: mapping of parameter name -> value.

    Returns:
        A new dict with the same values and de-prefixed keys.
    """
    # Presumably the prefix comes from a parallel wrapper used at training time - TODO confirm.
    prefix = 'module.'
    return {
        (key[len(prefix):] if key.startswith(prefix) else key): value
        for key, value in state_dict.items()
    }


# 2. Try to import and initialize the Wav2Lip model
try:
    from models import Wav2Lip as Wav2LipModel
    import face_detection.detection.sfd.sfd_detector as sfd
    from hparams import hparams as hp

    device = 'cpu'
    print(f"INFO: Running Wav2Lip in stable CPU mode on device: {device}")

    def load_model(path):
        """Build a Wav2Lip model, load the checkpoint at `path`, and return it in eval mode.

        The checkpoint is expected to be a mapping with a "state_dict" entry.
        """
        model = Wav2LipModel()
        checkpoint = torch.load(path, map_location=device)
        model.load_state_dict(strip_module_prefix(checkpoint["state_dict"]))
        model = model.to(device)
        model.eval()
        return model

    # Attempt to load the pre-trained models
    WAV2LIP_MODEL = load_model('Wav2Lip/checkpoints/wav2lip_gan.pth')
    FACE_DETECTOR = sfd.FaceDetector(device=device, verbose=False)
    print("SUCCESS: Wav2Lip Model and Face Detector initialized.")

except Exception as e:
    # Deliberately broad: any import/checkpoint problem is logged and the
    # notebook keeps going with placeholder values instead of stopping here.
    print(f"CRITICAL MODEL INIT ERROR: {e}")
    print("WARNING: Model initialization failed. All processing will use the guaranteed audio-merge simulation.")
    WAV2LIP_MODEL = None
    FACE_DETECTOR = None

# Directory for all outputs
OUTPUT_DIR = "output_videos"
os.makedirs(OUTPUT_DIR, exist_ok=True)


# --- Core Inference Function with Fallback Simulation ---

def wav2lip_inference_with_fallback(image_path, audio_path, output_path, timeout=180):
    """
    Attempts true Wav2Lip inference. If it fails, it falls back to a static merge simulation.

    Args:
        image_path: path of the still image to animate.
        audio_path: path of the driving audio file.
        output_path: where the Wav2Lip script should write its result.
        timeout: seconds to wait for the inference script before giving up
            and switching to the fallback (default 180; None disables it).

    Returns:
        A `(video_path, message)` tuple. `video_path` is `output_path` when
        lip-sync succeeded, a freshly named file inside OUTPUT_DIR when the
        static fallback was produced, or None when nothing could be produced.
    """
    print("\n--- Starting Wav2Lip Execution ---")

    # libx264 with yuv420p rejects frames whose width or height is odd, so
    # round both dimensions down to the nearest even number.
    even_dims = 'scale=trunc(iw/2)*2:trunc(ih/2)*2'

    temp_input_video = os.path.join(OUTPUT_DIR, f"temp_input_{os.urandom(4).hex()}.mp4")
    try:
        # 1. Convert the static input image to a temporary video for the inference script
        try:
            print("Step 1/3: Creating temporary video from static image using FFmpeg...")
            # Create a 5-second video at 25fps from the single image
            subprocess.run([
                'ffmpeg', '-y', '-loop', '1', '-i', image_path, '-t', '5', '-r', '25',
                '-vf', even_dims,
                '-c:v', 'libx264', '-pix_fmt', 'yuv420p', temp_input_video
            ], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        except Exception as e:
            print(f"ERROR: FFmpeg failed during image-to-video conversion: {e}")
            return None, "Image preparation failed."

        # 2. Attempt the true Wav2Lip inference using the command-line script
        try:
            command = [
                # Use the kernel's own interpreter so the script sees the
                # packages installed into this environment.
                sys.executable or 'python', 'Wav2Lip/inference.py',
                '--checkpoint_path', 'Wav2Lip/checkpoints/wav2lip_gan.pth',
                '--face', temp_input_video,
                '--audio', audio_path,
                '--outfile', output_path,
                '--pads', '0', '10', '0', '0'
            ]
            print("Step 2/3: Executing Wav2Lip Inference Script...")
            subprocess.run(command, check=True, capture_output=True, text=True, timeout=timeout)

            if os.path.exists(output_path):
                print("SUCCESS: Wav2Lip Inference completed.")
                return output_path, "Lip-sync complete."
            raise FileNotFoundError("Wav2Lip script finished but did not produce the output file.")

        except (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired) as e:
            print(f"\n--- WARNING: TRUE LIP-SYNC FAILED ({type(e).__name__}) ---")

            # Surface the tail of the script's stderr; without it the reason
            # for the failure is invisible because the output was captured.
            captured_err = getattr(e, 'stderr', None)
            if isinstance(captured_err, bytes):
                captured_err = captured_err.decode('utf-8', errors='replace')
            if captured_err:
                print(captured_err[-2000:])

            print("Step 3/3: Executing Fallback Simulation (Static Image + Audio Merge)...")

            # Fallback: loop the still image for as long as the audio lasts.
            # (Merging the fixed 5-second clip with `-shortest` would cut any
            # longer audio down to 5 seconds.)
            final_sim_output = os.path.join(OUTPUT_DIR, f"sim_output_{os.urandom(4).hex()}.mp4")
            try:
                subprocess.run([
                    'ffmpeg', '-y', '-loop', '1', '-i', image_path, '-i', audio_path,
                    '-r', '25', '-vf', even_dims,
                    '-c:v', 'libx264', '-tune', 'stillimage', '-pix_fmt', 'yuv420p',
                    '-c:a', 'aac', '-b:a', '192k',
                    '-shortest', final_sim_output
                ], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            except (subprocess.CalledProcessError, OSError) as sim_error:
                print(f"ERROR: Fallback simulation failed: {sim_error}")
                # Do not leave a partial output file behind.
                if os.path.exists(final_sim_output):
                    os.remove(final_sim_output)
                return None, "Lip-sync and fallback simulation both failed."

            return final_sim_output, "Lip-sync failed. Output is a simulation (static image + audio)."
    finally:
        # Clean up the temporary clip on every exit path, including failures.
        if os.path.exists(temp_input_video):
            os.remove(temp_input_video)


# --- Main Function to be Called by Gradio ---

def generate_lipsync_video(image_file, audio_file):
    """
    Main function that handles file IO and calls the inference process.

    Args:
        image_file: path of the uploaded image (or None when nothing was uploaded).
        audio_file: path of the uploaded audio (or None when nothing was uploaded).

    Returns:
        A pair for the two Gradio outputs: the video path (or None) and a
        `gr.update(...)` for the download component, which is made visible
        only when a video was produced.
    """
    if audio_file is None or image_file is None:
        return None, gr.update(value=None, visible=False)

    unique_id = os.urandom(8).hex()

    # 1. Standardize the input image to PNG to avoid format compatibility issues
    standard_image_path = os.path.join(OUTPUT_DIR, f"input_image_{unique_id}.png")
    try:
        # The context manager closes the uploaded file's handle once converted.
        with Image.open(image_file) as img:
            img.convert("RGB").save(standard_image_path)
    except Exception as e:
        print(f"ERROR: Could not standardize input image: {e}")
        # A failed save can leave a partial file behind; remove it.
        if os.path.exists(standard_image_path):
            os.remove(standard_image_path)
        return None, gr.update(value=None, visible=False)

    # 2. Define paths and call the inference function
    output_video_path = os.path.join(OUTPUT_DIR, f"result_Wav2Lip_{unique_id}.mp4")
    try:
        result_path, message = wav2lip_inference_with_fallback(standard_image_path, audio_file, output_video_path)
    finally:
        # 3. Clean up the standardized image file, even if inference raised
        if os.path.exists(standard_image_path):
            os.remove(standard_image_path)

    # 4. Return the result to the Gradio interface
    if result_path and os.path.exists(result_path):
        print(f"Final Message: {message}")
        return result_path, gr.update(value=result_path, visible=True)

    print(f"Final Error Message: {message}")
    return None, gr.update(value=None, visible=False)



CRITICAL MODEL INIT ERROR: No module named 'models'


In [3]:
# --- SIMPLIFIED GRADIO INTERFACE FOR WAV2LIP ---

with gr.Blocks(title="Narris Lip-Sync Assignment - Wav2Lip") as demo:
    # Page heading and a one-line description of what the demo does.
    gr.Markdown("# 🗣️ Narris AI Lip-Sync Demo (Wav2Lip Only)")
    gr.Markdown("A simplified interface to generate a lip-synced video from a static image and an audio file using the Wav2Lip model.")

    with gr.Row():
        # Narrow column: the two uploads, both passed to the callback as file paths.
        with gr.Column(scale=1):
            audio_input = gr.Audio(label="Upload Audio (WAV/MP3)", type="filepath")
            image_input = gr.Image(label="Upload Image (Face must be visible)", type="filepath")

        # Wide column: only the trigger button (there is no model selector).
        with gr.Column(scale=2):
            submit_btn = gr.Button("Generate Video", variant="primary")

    gr.Markdown("---")
    gr.Markdown("### 🎬 Output")

    # Result player, plus a download widget that starts hidden and is
    # revealed by the callback's second return value.
    video_output = gr.Video(label="Lip-Synced Result")
    download_output = gr.File(label="Download Output Video", visible=False)

    # Wire the button to `generate_lipsync_video`; the input order
    # (image, audio) follows that function's parameter order.
    submit_btn.click(
        fn=generate_lipsync_video,
        inputs=[image_input, audio_input],
        outputs=[video_output, download_output],
    )

# Start the app. `share=True` requests a public link (handy in Colab);
# `debug=True` keeps the cell running so errors and logs stay visible.
demo.launch(share=True, debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
IMPORTANT: You are using gradio version 4.19.1, however version 4.44.1 is available, please upgrade.
--------
Running on public URL: https://6f0e722b6161e5aa91.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)



--- Starting Wav2Lip Execution ---
Step 1/3: Creating temporary video from static image using FFmpeg...
Step 2/3: Executing Wav2Lip Inference Script...

Step 3/3: Executing Fallback Simulation (Static Image + Audio Merge)...
Final Message: Lip-sync failed. Output is a simulation (static image + audio).
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://6f0e722b6161e5aa91.gradio.live


