# Video translation and lip-sync pipeline


A pipeline for voice cloning and lip-syncing with open-source tools and models.

Credits to:

*   https://github.com/coqui-ai/TTS
*   https://github.com/openai/whisper
*   https://github.com/justinjohn0306/Wav2Lip/

Notes:

- Make sure to check the licensing of each project before using it for commercial purposes.

## Part 1: extract, translate and generate voice

In [None]:
# @title Dependencies
import locale
# Make locale.getpreferredencoding() report UTF-8 for the rest of the session.
# NOTE(review): presumably a workaround so the `!` shell commands below run in
# Colab when the preferred encoding is not UTF-8 — confirm before removing.
locale.getpreferredencoding = lambda: "UTF-8"

# Coqui TTS: imported as TTS.api in the voice synthesis cells.
!pip install TTS
# Whisper is installed straight from its GitHub repository.
!pip install git+https://github.com/openai/whisper.git
# !pip install jiwer
# googletrans is pinned to the 4.0.0-rc1 release candidate.
!pip install googletrans==4.0.0-rc1

In [None]:
#@title Upload Video

from google.colab import files
import os
import subprocess

# Result of the most recent files.upload() call; assigned by upload_video().
uploaded = None
# When True, upload_video() passes the uploaded file through resize_video().
# Overwritten from the checkbox value in on_button_clicked().
resize_to_720p = False

def upload_video():
  """Prompt for an upload and return the name of the first uploaded file.

  Stores the upload result in the global ``uploaded`` and the chosen file
  name in the global ``video_path``. When ``resize_to_720p`` is set, the
  file is first passed through ``resize_video`` and the resized name is
  used instead. Returns ``None`` when nothing was uploaded.
  """
  global uploaded
  global video_path
  uploaded = files.upload()
  for name in uploaded:
    print(f'Uploaded {name}')
    chosen = resize_video(name) if resize_to_720p else name
    video_path = chosen
    # Only the first uploaded file is used.
    return chosen


def resize_video(filename):
    """Re-encode a video to a height of 720 pixels with ffmpeg.

    The output is written next to the input, with ``resized_`` prefixed to
    the file name only (not to the whole path), so an input inside a
    directory such as ``uploads/clip.mp4`` yields
    ``uploads/resized_clip.mp4``.

    Args:
        filename: Path of the video to resize.

    Returns:
        The output path. It is returned even when ffmpeg is missing or
        fails, so callers should check that the file exists before using it.
    """
    folder, base_name = os.path.split(filename)
    output_filename = os.path.join(folder, f"resized_{base_name}")
    # An argument list (no shell) keeps spaces or quotes in the file name
    # from breaking, or injecting into, the command line.
    cmd = [
        "ffmpeg",
        "-y",  # overwrite a previous result instead of refusing to run
        "-i", filename,
        # -2 keeps the aspect ratio like -1 but rounds the width to an even
        # number, which common H.264 encoders require.
        "-vf", "scale=-2:720",
        output_filename,
    ]
    try:
        completed = subprocess.run(cmd)
    except FileNotFoundError:
        print("ffmpeg was not found; the video was not resized")
        return output_filename
    if completed.returncode == 0:
        print(f'Resized video saved as {output_filename}')
    else:
        print(f'ffmpeg failed to resize {filename} (exit code {completed.returncode})')
    return output_filename

# Create a form button that calls upload_video when clicked and a checkbox for resizing
import ipywidgets as widgets
from IPython.display import display

# Widgets: a resize option, the upload trigger, and an area for its output.
checkbox = widgets.Checkbox(value=False, description='Resize to 720p (better results)')
button = widgets.Button(description="Upload Video")
output = widgets.Output()


def on_button_clicked(b):
  """Click handler: read the resize option, then run the upload.

  Anything printed during the upload is captured by the ``output`` widget.
  The resulting file name is stored in the global ``video_path``.
  """
  global video_path, resize_to_720p
  with output:
    resize_to_720p = checkbox.value
    video_path = upload_video()


button.on_click(on_button_clicked)
display(checkbox, button, output)


In [None]:
# @title Audio extraction (24 bit) and whisper conversion
import subprocess
import whisper

# Only extract and transcribe when a video has actually been uploaded;
# otherwise Whisper would run on a missing or stale output_audio_1.wav.
if globals().get('video_path') is not None:
    # Argument list (no shell), so quotes or spaces in the file name are safe.
    ffmpeg_command = [
        "ffmpeg", "-i", video_path,
        "-acodec", "pcm_s24le", "-ar", "48000", "-q:a", "0",
        "-map", "a", "-y", "output_audio_1.wav",
    ]
    # check=True: a failed extraction raises here instead of letting the
    # transcription below pick up an old output_audio_1.wav.
    subprocess.run(ffmpeg_command, check=True)

    model = whisper.load_model("base")
    result = model.transcribe("output_audio_1.wav")

    # Transcript and detected language are consumed by the translation cells.
    whisper_text = result["text"]
    whisper_language = result['language']

    print("Whisper text:", whisper_text)
else:
    print("No video uploaded. Please upload a video first.")

In [None]:
#@title Translation with Google Translate
from googletrans import Translator

# Display name -> language code handed to the translator
language_mapping = {
    'English': 'en',
    'Spanish': 'es',
    'French': 'fr',
    'German': 'de',
    'Italian': 'it',
    'Portuguese': 'pt',
    'Polish': 'pl',
    'Turkish': 'tr',
    'Russian': 'ru',
    'Dutch': 'nl',
    'Czech': 'cs',
    'Arabic': 'ar',
    'Chinese (Simplified)': 'zh-cn',
    'Hindi': 'hi',
}

# Colab form field: language to translate into (full name)
target_language = "Hindi" #@param ["English", "Spanish", "French", "German", "Italian", "Portuguese", "Polish", "Turkish", "Russian", "Dutch", "Czech", "Arabic", "Chinese (Simplified)", "Hindi"]

# Code for the selected language
target_language_code = language_mapping[target_language]

# whisper_text and whisper_language are expected from the transcription cell
translator = Translator()
translation = translator.translate(
    whisper_text,
    src=whisper_language,
    dest=target_language_code,
)
translated_text = translation.text

# Show the result
print("Translated text:", translated_text)


In [None]:
#@title Translate with chatgpt
import openai

# "api_key" is a placeholder value; the request needs a real key to succeed.
client = openai.OpenAI(api_key = "api_key")
completion = client.chat.completions.create(
  model="gpt-3.5-turbo",
  messages=[
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": f"translate the texts to Hindi {whisper_text}"}
  ]
)
# choices[0].message is a message object; the reply text is its .content.
# The chunking cell calls .split() on translated_text, so it must be a str.
translated_text = completion.choices[0].message.content
print(translated_text)


In [None]:
# @title split texts into 250 character chunks (Hindi)
def split_into_chunks(text, max_chars=250, delimiter="।"):
  """Group delimiter-terminated sentences into chunks shorter than max_chars.

  Sentences are stripped, re-terminated with ``delimiter`` and appended to
  the current chunk until adding one more would reach ``max_chars``. Empty
  pieces (such as the one after a trailing delimiter) are skipped, so no
  stray delimiter is produced. A single sentence longer than ``max_chars``
  is kept whole in its own chunk.

  Args:
    text: The text to split.
    max_chars: Exclusive upper bound on chunk length, delimiter included.
    delimiter: Sentence terminator to split on (the Hindi danda by default).

  Returns:
    List of chunk strings; empty when ``text`` has no sentences.
  """
  chunks = []
  current = ""
  for sentence in text.split(delimiter):
    sentence = sentence.strip()
    if not sentence:
      continue
    sentence += delimiter
    if current and len(current) + len(sentence) >= max_chars:
      chunks.append(current)
      current = sentence
    else:
      current += sentence
  if current:
    chunks.append(current)
  return chunks

final_chunks = split_into_chunks(translated_text)
final_chunks

In [None]:
# @title Voice synthesis
from TTS.api import TTS
import torch
from IPython.display import Audio, display  # Import the Audio and display modules

device = "cuda" if torch.cuda.is_available() else "cpu"
# Initialize TTS
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

# Generate audio file from the translated text, in the language chosen in the
# translation cell. (This previously read an undefined `speech_fr` variable
# with a fixed "fr" language, which raised NameError.)
# The output file name is unchanged because the Wav2Lip "Generate video" cell
# reads /content/output_synth_fr.wav.
tts.tts_to_file(translated_text,
    speaker_wav='output_audio_1.wav',
    file_path="output_synth_fr.wav",
    language=target_language_code
)
# Display audio widget to play the generated audio
audio_widget = Audio(filename="output_synth_fr.wav", autoplay=False)
display(audio_widget)

In [None]:
# @title Voice synthesis for text chunks
def audio_synthesis(text, file_name, language="hi", speaker_wav='output_audio_1.wav'):
  """Synthesize ``text`` into ``file_name`` with the global ``tts`` model.

  Args:
    text: Text to speak.
    file_name: Path of the wav file to write.
    language: Language code passed to the TTS model (Hindi by default, as
      before).
    speaker_wav: Reference recording whose voice is cloned; defaults to the
      audio extracted from the uploaded video.

  Returns:
    ``file_name``, for convenient collection by the caller.
  """
  tts.tts_to_file(
      text,
      speaker_wav=speaker_wav,
      file_path=file_name,
      language=language
  )
  return file_name
# Synthesize one wav file per text chunk.
file_names = [
    audio_synthesis(chunk, f"output_synth_audio_{i}.wav")
    for i, chunk in enumerate(final_chunks)
]

# Write the list file for ffmpeg's concat demuxer: one "file '<path>'" line
# per input. (The list used to be created empty, so nothing was merged.)
with open("my_files.txt", "w", encoding="utf-8") as list_file:
    list_file.writelines(f"file '{name}'\n" for name in file_names)

#concat audios (-y so re-running the cell overwrites the previous result)
cmd = [
    "ffmpeg", "-y", "-f", "concat", "-safe", "0",
    "-i", "my_files.txt", "-c", "copy", "output_synth_audio_final.wav",
]
subprocess.run(cmd)

In [19]:
# Play back the first two synthesized chunks to spot-check the cloned voice.
from IPython.display import Audio, display
audio_widget = Audio(filename="output_synth_audio_0.wav", autoplay=False)
display(audio_widget)
# NOTE(review): output_synth_audio_1.wav only exists when the text was split
# into at least two chunks.
display(Audio(filename="output_synth_audio_1.wav", autoplay=False))

In [23]:
# Play back the concatenated audio written by the chunk synthesis cell.
display(Audio(filename="output_synth_audio_final.wav", autoplay=False))

In [None]:
#@title voice synthesis with custom text
# Free-form text to speak in the cloned voice instead of the translation.
mutual_fund_speech = """
Listen up, folks, because I'm gonna tell you something important about mutual funds. They're amazing, truly incredible investments, the best way to grow your money, but let me be clear: there are risks, folks. I'm not gonna sugarcoat it.

The market goes up, it goes down, that's just the way it is. Interest rates fluctuate, things change – sometimes they're good, sometimes they're bad. Just like a beautiful rollercoaster, it's a thrill ride, but you gotta hold on tight.

Look, here's the deal: before you invest a dime, you gotta do your homework. Read all the fine print, talk to the experts, make sure you understand what you're getting into. This isn't a game, folks, it's your hard-earned money we're talking about.

And don't trust everything you read online. This website is great, terrific even, but if you need something official, get the printed version. Those are the real deal.

Remember, folks, I'm here to help you make America great again, and that means helping you make smart financial decisions. With the right approach, you can achieve financial success, believe me."""
# The speaker reference is output_audio_1.wav, the file written by the audio
# extraction cell (this cell previously pointed at 'output_audio.wav', which
# no cell creates).
tts.tts_to_file(mutual_fund_speech,
    speaker_wav='output_audio_1.wav',
    file_path="mutual_fund_speech.wav",
    language="en"
)

# Display audio widget to play the generated audio
audio_widget = Audio(filename="mutual_fund_speech.wav", autoplay=False)
display(audio_widget)

In [40]:
# @title Delete tts and whisper models before lip sync if you are on T4
import torch

# (global variable holding a model, message printed when it is already gone)
_models_to_drop = (
    ("tts", "Voice model already deleted"),
    ("model", "Whisper model already deleted"),
)

for _var_name, _missing_message in _models_to_drop:
    try:
        del globals()[_var_name]
    except KeyError:
        print(_missing_message)

# Release cached GPU memory held by the deleted models.
torch.cuda.empty_cache()

Whisper model already deleted


## Part 2: generate lip synched video

### High Quality (very slow: approx. 15 min to install dependencies and 12 more for the video on a T4)

In [None]:
# @title Dependencies
%cd /content/

!git clone https://github.com/vinthony/video-retalking.git &> /dev/null

!sudo apt-get install -y libblas-dev liblapack-dev libx11-dev libopenblas-dev

!git clone https://github.com/davisking/dlib.git

!pip install basicsr==1.4.2 face-alignment==1.3.4 kornia==0.5.1 ninja==1.10.2.3 einops==0.4.1 facexlib==0.2.5 librosa==0.9.2 build

!cd dlib && python setup.py install

%cd /content/video-retalking

!mkdir ./checkpoints
!wget https://github.com/vinthony/video-retalking/releases/download/v0.0.1/30_net_gen.pth -O ./checkpoints/30_net_gen.pth
!wget https://github.com/vinthony/video-retalking/releases/download/v0.0.1/BFM.zip -O ./checkpoints/BFM.zip
!wget https://github.com/vinthony/video-retalking/releases/download/v0.0.1/DNet.pt -O ./checkpoints/DNet.pt
!wget https://github.com/vinthony/video-retalking/releases/download/v0.0.1/ENet.pth -O ./checkpoints/ENet.pth
!wget https://github.com/vinthony/video-retalking/releases/download/v0.0.1/expression.mat -O ./checkpoints/expression.mat
!wget https://github.com/vinthony/video-retalking/releases/download/v0.0.1/face3d_pretrain_epoch_20.pth -O ./checkpoints/face3d_pretrain_epoch_20.pth
!wget https://github.com/vinthony/video-retalking/releases/download/v0.0.1/GFPGANv1.3.pth -O ./checkpoints/GFPGANv1.3.pth
!wget https://github.com/vinthony/video-retalking/releases/download/v0.0.1/GPEN-BFR-512.pth -O ./checkpoints/GPEN-BFR-512.pth
!wget https://github.com/vinthony/video-retalking/releases/download/v0.0.1/LNet.pth -O ./checkpoints/LNet.pth
!wget https://github.com/vinthony/video-retalking/releases/download/v0.0.1/ParseNet-latest.pth -O ./checkpoints/ParseNet-latest.pth
!wget https://github.com/vinthony/video-retalking/releases/download/v0.0.1/RetinaFace-R50.pth -O ./checkpoints/RetinaFace-R50.pth
!wget https://github.com/vinthony/video-retalking/releases/download/v0.0.1/shape_predictor_68_face_landmarks.dat -O ./checkpoints/shape_predictor_68_face_landmarks.dat
!unzip -d ./checkpoints/BFM ./checkpoints/BFM.zip

In [None]:
# Return to /content and clone video-retalking again; the clone's output is
# discarded (&> /dev/null). These two commands also appear at the start of
# the Dependencies cell above.
%cd /content/

!git clone https://github.com/vinthony/video-retalking.git &> /dev/null

/content


In [9]:
# Manual override of the input video read by the "Generate video" cells;
# replaces whatever the upload cell stored in video_path.
video_path = "/content/Trump_speech.mp4"

In [None]:
# @title Generate video

%cd /content/video-retalking

video_path_fix = f"'../{video_path}'"

!python inference.py \
  --face $video_path \
  --audio "/content/output_synth_audio_final.wav" \
  --outfile '/content/output_high_qual_hi.mp4'

### Normal quality (around 5 min on T4)

In [None]:
# @title Dependencies
%cd /content/

# Clone the Wav2Lip fork and install its Colab requirements.
!git clone https://github.com/justinjohn0306/Wav2Lip
!cd Wav2Lip && pip install -r requirements_colab.txt

%cd /content/Wav2Lip

# Only the two checkpoints downloaded below are fetched: wav2lip_gan.pth
# (passed as --checkpoint_path in the Wav2Lip "Generate video" cell) and
# mobilenet.pth. The other downloads are left disabled.
# !wget "https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth" -O "face_detection/detection/sfd/s3fd.pth"
# !wget 'https://github.com/justinjohn0306/Wav2Lip/releases/download/models/wav2lip.pth' -O 'checkpoints/wav2lip.pth'
!wget 'https://github.com/justinjohn0306/Wav2Lip/releases/download/models/wav2lip_gan.pth' -O 'checkpoints/wav2lip_gan.pth'
# !wget 'https://github.com/justinjohn0306/Wav2Lip/releases/download/models/resnet50.pth' -O 'checkpoints/resnet50.pth'
!wget 'https://github.com/justinjohn0306/Wav2Lip/releases/download/models/mobilenet.pth' -O 'checkpoints/mobilenet.pth'

!pip install batch-face

In [None]:
# @title Generate video

%cd /content/Wav2Lip

#This is the detection box padding, if you see it doesnt sit quite right, just adjust the values a bit. Usually the bottom one is the biggest issue
pad_top =  0
pad_bottom =  15
pad_left =  0
pad_right =  0
rescaleFactor =  1

video_path_fix = f"'../{video_path}'"

!python inference.py --checkpoint_path 'checkpoints/wav2lip_gan.pth' --face $video_path_fix --audio "/content/output_synth_fr.wav" --pads $pad_top $pad_bottom $pad_left $pad_right --resize_factor $rescaleFactor --nosmooth --outfile '/content/output_video_fr.mp4'


In [None]:
from IPython.display import HTML

# Location of the video file to embed
video_url = '/content/Trump_speech.mp4'

# Assemble the <video> player markup piece by piece
video_html = (
    "\n"
    '<video width="640" height="480" controls>\n'
    f'  <source src="{video_url}" type="video/mp4">\n'
    "  Your browser does not support the video tag.\n"
    "</video>\n"
)

# Render the player as the cell's result
HTML(video_html)



# End

In [None]:
# @title Run this cell to get the video/s and download links
from google.colab import files
# display/HTML come from IPython.display; importing them from
# IPython.core.display is deprecated.
from IPython.display import display, HTML
import ipywidgets as widgets
import base64
import os

# Candidate outputs to look for: the files written by the two
# "Generate video" cells, plus the older generic names.
video_paths = [
    "/content/output_video.mp4",
    "/content/output_high_qual.mp4",
    "/content/output_video_fr.mp4",
    "/content/output_high_qual_hi.mp4",
]

# Download function
def download_video(b):
    """Click handler: download the file attached to the clicked button."""
    files.download(b.video_path)

# Buttons created below, kept so they can be inspected later
download_buttons = []

# Layout definition for button
button_layout = widgets.Layout(width='250px')

# Show a player and a download button for every output that exists.
# The loop variable is deliberately not named video_path, so the global
# video_path used by the generation cells is left untouched.
for output_path in video_paths:
    if not os.path.exists(output_path):
        continue

    # Embed the video as base64 so the player needs no file URL
    with open(output_path, "rb") as video_file:
        video_base64 = base64.b64encode(video_file.read()).decode()

    video_html = HTML(data=f"""
        <video width=400 controls>
            <source src="data:video/mp4;base64,{video_base64}" type="video/mp4" />
        </video>
        """)

    # Create button widget for download and link it to the video path
    download_button = widgets.Button(description=f"Download {os.path.basename(output_path)}",
                                     layout=button_layout)
    download_button.video_path = output_path
    download_button.on_click(download_video)
    download_buttons.append(download_button)

    # Display widgets
    display(video_html)
    display(download_button)


## Prepare App

In [None]:
# @title Prepare App
import locale
locale.getpreferredencoding = lambda: "UTF-8"

!pip install TTS
!pip install git+https://github.com/openai/whisper.git
# !pip install jiwer
!pip install googletrans==4.0.0-rc1
%cd /content/

!git clone https://github.com/vinthony/video-retalking.git &> /dev/null

!sudo apt-get install -y libblas-dev liblapack-dev libx11-dev libopenblas-dev

!git clone https://github.com/davisking/dlib.git

!pip install basicsr==1.4.2 face-alignment==1.3.4 kornia==0.5.1 ninja==1.10.2.3 einops==0.4.1 facexlib==0.2.5 librosa==0.9.2 build

!cd dlib && python setup.py install

%cd /content/video-retalking

!mkdir ./checkpoints
!wget https://github.com/vinthony/video-retalking/releases/download/v0.0.1/30_net_gen.pth -O ./checkpoints/30_net_gen.pth
!wget https://github.com/vinthony/video-retalking/releases/download/v0.0.1/BFM.zip -O ./checkpoints/BFM.zip
!wget https://github.com/vinthony/video-retalking/releases/download/v0.0.1/DNet.pt -O ./checkpoints/DNet.pt
!wget https://github.com/vinthony/video-retalking/releases/download/v0.0.1/ENet.pth -O ./checkpoints/ENet.pth
!wget https://github.com/vinthony/video-retalking/releases/download/v0.0.1/expression.mat -O ./checkpoints/expression.mat
!wget https://github.com/vinthony/video-retalking/releases/download/v0.0.1/face3d_pretrain_epoch_20.pth -O ./checkpoints/face3d_pretrain_epoch_20.pth
!wget https://github.com/vinthony/video-retalking/releases/download/v0.0.1/GFPGANv1.3.pth -O ./checkpoints/GFPGANv1.3.pth
!wget https://github.com/vinthony/video-retalking/releases/download/v0.0.1/GPEN-BFR-512.pth -O ./checkpoints/GPEN-BFR-512.pth
!wget https://github.com/vinthony/video-retalking/releases/download/v0.0.1/LNet.pth -O ./checkpoints/LNet.pth
!wget https://github.com/vinthony/video-retalking/releases/download/v0.0.1/ParseNet-latest.pth -O ./checkpoints/ParseNet-latest.pth
!wget https://github.com/vinthony/video-retalking/releases/download/v0.0.1/RetinaFace-R50.pth -O ./checkpoints/RetinaFace-R50.pth
!wget https://github.com/vinthony/video-retalking/releases/download/v0.0.1/shape_predictor_68_face_landmarks.dat -O ./checkpoints/shape_predictor_68_face_landmarks.dat
!unzip -d ./checkpoints/BFM ./checkpoints/BFM.zip
%cd /content/

!git clone https://github.com/vinthony/video-retalking.git &> /dev/null

## Main App

In [None]:
# @title Main App
from flask import Flask, request, jsonify,send_from_directory
from werkzeug.utils import secure_filename
import os
import subprocess
import whisper
from googletrans import Translator
from TTS.api import TTS
import torch
app = Flask(__name__)
import threading
import requests

# Local port the Flask app listens on; also the target of the SSH tunnel.
PORT = 5005

# Download the page template into ./templates.
response = requests.get(
    'https://raw.githubusercontent.com/iamthehimansh/Lip2Lip/main/templates/index.html',
    timeout=30,  # do not hang the cell forever on a stalled connection
)
# Fail loudly instead of saving an HTTP error page as the template.
response.raise_for_status()
os.makedirs('templates', exist_ok=True)
# Explicit encoding so non-ASCII characters in the page are written
# the same way regardless of the platform default.
with open('templates/index.html', 'w', encoding='utf-8') as f:
    f.write(response.text)
def func():
    # Open a reverse SSH tunnel through serveo.net that forwards its port 80
    # to localhost:PORT. Host key checking is disabled for this connection.
    !ssh -o "StrictHostKeyChecking=no" -R 80:localhost:{PORT} serveo.net
# Run the tunnel in a daemon thread so it does not block the rest of the cell.
threading.Thread(target=func,daemon=True).start()
# Set the upload folder for video and audio files
app.config['UPLOAD_FOLDER'] = 'uploads'
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)

@app.route('/process', methods=['POST'])
def generate_video():
    """Handle POST /process: save the uploads and run the generation pipeline.

    Expects multipart form data with a ``video`` file, an ``audio`` file
    (the voice reference) and a ``text`` field (the words to speak).

    Returns:
        400 with a JSON error when an input is missing or has an unusable
        file name; 500 with a JSON error when generation fails; otherwise
        the result of ``generate_video_function`` — passed through unchanged
        if it is already a response object, or wrapped as
        ``{"video_path": ...}`` if it is a path string.
    """
    # Check if the request contains video, audio, and text
    if 'video' not in request.files or 'audio' not in request.files or 'text' not in request.form:
        return jsonify({'error': 'Video, audio, and text files are required.'}), 400

    video_file = request.files['video']
    audio_file = request.files['audio']
    text = request.form['text']

    # secure_filename() returns an empty string for empty or unusable names;
    # saving under such a name would target the upload folder itself.
    video_filename = secure_filename(video_file.filename)
    audio_filename = secure_filename(audio_file.filename)
    if not video_filename or not audio_filename:
        return jsonify({'error': 'Video and audio files must have valid file names.'}), 400

    # Prefix the stored names so a video and an audio file uploaded under the
    # same name cannot overwrite each other.
    video_path = os.path.join(app.config['UPLOAD_FOLDER'], f"video_{video_filename}")
    audio_path = os.path.join(app.config['UPLOAD_FOLDER'], f"audio_{audio_filename}")
    video_file.save(video_path)
    audio_file.save(audio_path)

    # Generate the video based on the provided files and text
    generated_video_path = generate_video_function(video_path, audio_path, text)

    # Check if the video generation was successful
    if generated_video_path is None:
        return jsonify({'error': 'Failed to generate the video.'}), 500

    # A ready-made response (e.g. from send_from_directory) cannot be
    # serialized by jsonify, so hand it back as is.
    if not isinstance(generated_video_path, str):
        return generated_video_path

    # Return the path of the generated video file
    return jsonify({'video_path': generated_video_path}), 200

def generate_video_function(video_path, audio_path_, text_):
    """Speak ``text_`` in the uploaded voice and lip-sync the video to it.

    Steps: resize the video, extract its audio track, synthesize ``text_``
    with XTTS v2 using ``audio_path_`` as the voice reference, release the
    TTS model, then run video-retalking's ``inference.py``.

    The process working directory is never changed (inference runs with
    ``cwd=`` instead of ``%cd``), so relative paths such as the upload
    folder keep working on later requests.

    Args:
        video_path: Path of the uploaded video.
        audio_path_: Path of the uploaded voice reference recording.
        text_: English text to synthesize.

    Returns:
        Path of the generated video as a string, or ``None`` when inference
        failed or produced no file. The caller serializes this path, so a
        plain string is returned rather than a Flask response.
    """
    content_dir = "/content"
    retalking_dir = os.path.join(content_dir, "video-retalking")
    synth_audio_path = os.path.join(content_dir, "output_synth_audio_final.wav")
    output_video_path = os.path.join(content_dir, "output_high_qual_hi.mp4")

    # Absolute paths: inference.py runs from a different working directory.
    video_path = os.path.abspath(video_path)
    audio_path_ = os.path.abspath(audio_path_)

    # resize_video returns its target path even when ffmpeg failed, so fall
    # back to the original upload if the resized file is not there.
    resized_video_path = resize_video(video_path)
    if not (resized_video_path and os.path.exists(resized_video_path)):
        resized_video_path = video_path

    # Audio extraction (24 bit).
    # NOTE(review): nothing later in this function reads output_audio_1.wav
    # (the voice reference is audio_path_); kept for parity with the notebook.
    subprocess.run([
        "ffmpeg", "-i", resized_video_path,
        "-acodec", "pcm_s24le", "-ar", "48000", "-q:a", "0",
        "-map", "a", "-y", "output_audio_1.wav",
    ])

    # Remove results of a previous request so a failure cannot return them.
    for stale_path in (synth_audio_path, output_video_path):
        if os.path.exists(stale_path):
            os.remove(stale_path)

    # Voice synthesis
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
    try:
        tts.tts_to_file(
            text_,
            speaker_wav=audio_path_,
            file_path=synth_audio_path,
            language="en",
        )
    finally:
        # Drop the voice model and its cached GPU memory before the lip-sync
        # step starts, even when synthesis raised.
        del tts
        torch.cuda.empty_cache()

    # Lip sync: argument list (no shell) and an explicit working directory.
    completed = subprocess.run(
        [
            "python", "inference.py",
            "--face", resized_video_path,
            "--audio", synth_audio_path,
            "--outfile", output_video_path,
        ],
        cwd=retalking_dir,
    )
    if completed.returncode != 0 or not os.path.exists(output_video_path):
        return None
    return output_video_path

# Start the Flask server on all interfaces at PORT when this code is run
# as the main module.
if __name__ == '__main__':
    app.run(host="0.0.0.0",port=PORT)