In [1]:
# Install Wav2Lip and dependencies
!pip install tensorflow transformers gTTS moviepy SpeechRecognition pocketsphinx git+https://github.com/openai/whisper.git torchaudio

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-3zspwojh
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-3zspwojh
  Resolved https://github.com/openai/whisper.git to commit 5979f03701209bb035a0a466f14131aeb1116cbb
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting gTTS
  Downloading gTTS-2.5.3-py3-none-any.whl.metadata (4.1 kB)
Collecting SpeechRecognition
  Downloading SpeechRecognition-3.11.0-py2.py3-none-any.whl.metadata (28 kB)
Collecting pocketsphinx
  Downloading pocketsphinx-5.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting sounddevice (from pocketsphinx)
  Downloading sounddevice-0.5.1-py3-none-any.whl.metadata (1.4 kB)
Collecting tiktoken (from openai-whisper==20240930

In [2]:
# Clone Wav2Lip repository
!git clone https://github.com/Rudrabha/Wav2Lip.git

# Modify the requirements.txt file to remove specific versions for opencv-python, torch, and torchvision
!sed -i '/opencv-python==4.1.0.25/d' Wav2Lip/requirements.txt
!sed -i '/torch==1.1.0/d' Wav2Lip/requirements.txt
!sed -i '/torchvision==0.3.0/d' Wav2Lip/requirements.txt

# Now install the rest of the requirements
!pip install -r Wav2Lip/requirements.txt

# Clear pip cache to prevent using any cached versions of opencv-contrib-python
!pip cache purge

# Manually install compatible versions of torch, torchvision, torchaudio, and opencv-contrib-python
!pip install torch torchvision torchaudio --no-cache-dir
!pip install opencv-contrib-python --no-cache-dir

Cloning into 'Wav2Lip'...
remote: Enumerating objects: 393, done.[K
remote: Counting objects: 100% (15/15), done.[K
remote: Compressing objects: 100% (14/14), done.[K
remote: Total 393 (delta 8), reused 5 (delta 1), pack-reused 378 (from 1)[K
Receiving objects: 100% (393/393), 538.59 KiB | 2.68 MiB/s, done.
Resolving deltas: 100% (218/218), done.
Collecting librosa==0.7.0 (from -r Wav2Lip/requirements.txt (line 1))
  Downloading librosa-0.7.0.tar.gz (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting numpy==1.17.1 (from -r Wav2Lip/requirements.txt (line 2))
  Downloading numpy-1.17.1.zip (6.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tqdm==4.45.0 (from -r Wav2Lip/requirements.txt (line 4))
 

In [3]:
# Download the pretrained Wav2Lip model file
!gdown "https://drive.google.com/uc?export=download&id=1-DXo5_dUP5oJWG3XBj7dyI4KfLGzhQ4d"

Failed to retrieve file url:

	Cannot retrieve the public link of the file. You may need to change
	the permission to 'Anyone with the link', or have had many accesses.
	Check FAQ in https://github.com/wkentaro/gdown?tab=readme-ov-file#faq.

You may still be able to access the file from the browser:

	https://drive.google.com/uc?export=download&id=1-DXo5_dUP5oJWG3XBj7dyI4KfLGzhQ4d

but Gdown can't. Please check connections and permissions.


In [2]:
# Import necessary libraries
import moviepy.editor as mp
import whisper
from transformers import MarianMTModel, MarianTokenizer
from transformers import pipeline
from gtts import gTTS
from google.colab import files
import os

In [3]:
# Function to load translation models once
def load_translation_models(target_languages):
    translation_models = {}
    for lang in target_languages:
        model_name = f"Helsinki-NLP/opus-mt-en-{lang}"
        model = MarianMTModel.from_pretrained(model_name)
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        translation_models[lang] = (model, tokenizer)
    return translation_models

In [4]:
# Function to translate text using preloaded models
def translate_text(text, target_language, translation_models):
    model, tokenizer = translation_models[target_language]
    input_ids = tokenizer.encode(text, return_tensors="pt", add_special_tokens=True)
    output = model.generate(input_ids)
    translated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return translated_text

In [5]:
# Step 1: Upload the video file
uploaded = files.upload()
video_path = list(uploaded.keys())[0]  # Get the uploaded video file name
video = mp.VideoFileClip(video_path)
# Step 2: Extract and save audio from video
audio = video.audio
audio.write_audiofile("audio.wav")

Saving test_video.mp4 to test_video.mp4
MoviePy - Writing audio in audio.wav


                                                        

MoviePy - Done.




In [6]:
# Step 3: Load and transcribe the audio file using a larger model
model = whisper.load_model("large")  # Use a larger model for better accuracy
audio_path = "audio.wav"
result = model.transcribe(audio_path)
transcribed_text = result["text"]
print("Transcribed Text: ", transcribed_text)

# Step 4: Emotion Detection (initialize once)
emotion_classifier = pipeline("text-classification", model="bhadresh-savani/bert-base-uncased-emotion")
emotion_result = emotion_classifier(transcribed_text)
detected_emotion = emotion_result[0]['label']
print(f"Detected Emotion: {detected_emotion}")

  checkpoint = torch.load(fp, map_location=device)




Transcribed Text:   Hello, my name is Ritika Shethe. My highest qualification is BMS graduate.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.




Detected Emotion: joy


In [7]:
# Step 5: Define target languages for translation
target_languages = ["fr", "es", "de", "hi"]  # French, Spanish, German, Hindi
translation_models = load_translation_models(target_languages)  # Load models once
translations = {}

# Step 6: Translate to each language
for lang in target_languages:
    translations[lang] = translate_text(transcribed_text, lang, translation_models)
    print(f"Translation in {lang}: {translations[lang]}")

# Step 7: Combine translated audio with original video using Wav2Lip
for lang, translation in translations.items():
    # Adjust TTS parameters based on detected emotion
    if detected_emotion == "joy":
        tts = gTTS(text=translation, lang=lang, slow=False)  # Normal speed
    elif detected_emotion == "sadness":
        tts = gTTS(text=translation, lang=lang, slow=True)  # Slower for sadness
    elif detected_emotion == "anger":
        tts = gTTS(text=translation, lang=lang, slow=True)  # Slower for anger
    else:
        tts = gTTS(text=translation, lang=lang, slow=False)  # Default speed




Translation in fr: Bonjour, je m'appelle Ritika Shethe. Ma plus haute qualification est diplômée de BMS.
Translation in es: Hola, mi nombre es Ritika Shethe.
Translation in de: Hallo, mein Name ist Ritika Shethe. Meine höchste Qualifikation ist BMS-Absolventen.
Translation in hi: हैलो, मेरा नाम रिटिका शेर है. मेरी सबसे उच्च विशेषता BMMS स्नातक है.


In [8]:
# Save the translated audio
tts.save(f"translated_audio_{lang}.mp3")

# Use Wav2Lip for lip-syncing
!python Wav2Lip/inference.py --checkpoint_path wav2lip_gan.pth --face {video_path} --audio translated_audio_{lang}.mp3 --outfile dubbed_video_{lang}.mp4

print("Lip-synced videos created for each target language.")

Using cpu for inference.
Reading video frames...
Number of frames available for inference: 157
Extracting raw audio...
ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libs