<a href="https://colab.research.google.com/github/ktynski/Marketing_Automations_Notebooks_With_GPT/blob/main/Automatic_Video_Translation_with_LipSync_Public_Kristin_frac_tl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

To run this notebook, you will need to add your ElevenLabs API key and your OpenAI API key. Then run each cell in order and download the final video once everything has finished running. Long videos take a long time: roughly 5 minutes of processing per minute of video, depending on the machine Google assigns to you in Colab.

In [None]:
# Fetch the combined Wav2Lip + GFPGAN repository into the current directory.
!git clone https://github.com/ajay-sainy/Wav2Lip-GFPGAN.git
# Root of the cloned repository (assumes the clone landed under /content); later cells build their paths from it.
basePath = "/content/Wav2Lip-GFPGAN"
# Make the repository root the working directory for the following cells.
%cd {basePath}

In [None]:
wav2lipFolderName = 'Wav2Lip-master'
gfpganFolderName = 'GFPGAN-master'
wav2lipPath = basePath + '/' + wav2lipFolderName
gfpganPath = basePath + '/' + gfpganFolderName

!wget 'https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth' -O {wav2lipPath}'/face_detection/detection/sfd/s3fd.pth'
!gdown https://drive.google.com/uc?id=1fQtBSYEyuai9MjBOF8j7zZ4oQ9W2N64q --output {wav2lipPath}'/checkpoints/'

In [None]:
!pip install -r requirements.txt
!pip install openai
!pip install --upgrade pytube
!pip install pydub
!pip install transformers
!pip install -U transformers
!pip install transformers sentencepiece
!pip install --upgrade pytube
!pip install elevenlabslib
!sudo apt-get install libportaudio2

In [None]:
import openai
import pandas as pd
from pytube import YouTube
from transformers import T5Tokenizer
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import GPT2TokenizerFast
from transformers import pipeline
import textwrap
from concurrent.futures import ThreadPoolExecutor
import logging
import warnings



# Suppress warnings: only CRITICAL log records are emitted and Python warnings are ignored
logging.basicConfig(level=logging.CRITICAL)
warnings.filterwarnings("ignore")

# OpenAI API key -- replace the placeholder with your own key before running the cells below
openai.api_key = "Your OpenAI API Key"

def get_transcript(youtube_link, resolution='720p'):
    """Download a YouTube video's audio and video tracks and return its transcript.

    The audio-only stream is saved to ``inputs/j_audio.mp3`` and the video-only
    stream at ``resolution`` to ``inputs/j_video.mp4`` inside the cloned
    repository. The audio is then sent to the OpenAI ``whisper-1`` translation
    endpoint, which returns the speech as English text.

    Args:
        youtube_link: URL of the YouTube video.
        resolution: Resolution label of the video-only stream to download.

    Returns:
        The ``text`` field of the Whisper response.

    Raises:
        ValueError: If the video has no audio-only stream, or no video-only
            stream at the requested resolution.
    """
    yt = YouTube(youtube_link)

    # .first() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the later .download() call.
    audio = yt.streams.filter(only_audio=True).first()
    if audio is None:
        raise ValueError("No audio-only stream found for " + str(youtube_link))

    video = yt.streams.filter(res=resolution, only_video=True).first()
    if video is None:
        raise ValueError(
            "No video-only stream at resolution " + str(resolution)
            + " found for " + str(youtube_link)
        )

    audio_file = audio.download(filename='/content/Wav2Lip-GFPGAN/inputs/j_audio.mp3')
    # The video track is not needed here; a later cell reads it from this path.
    video.download(filename='/content/Wav2Lip-GFPGAN/inputs/j_video.mp4')

    # Transcribe/translate the audio using the OpenAI API
    with open(audio_file, 'rb') as file:
        response = openai.Audio.translate('whisper-1', file)

    return response['text']



def count_tokens(input_data, max_tokens=20000, input_type='text'):
    """Return the number of GPT-2 tokens in ``input_data``.

    Args:
        input_data: A string when ``input_type`` is ``'text'``, or an
            already-tokenized sequence when ``input_type`` is ``'tokens'``.
        max_tokens: Unused; kept so existing callers keep working.
        input_type: ``'text'`` or ``'tokens'``.

    Returns:
        The token count as an int.

    Raises:
        ValueError: If ``input_type`` is neither ``'text'`` nor ``'tokens'``.
    """
    if input_type == 'tokens':
        # Already tokenized: no need to load a tokenizer just to call len().
        return len(input_data)
    if input_type != 'text':
        raise ValueError("Invalid input_type. Must be 'text' or 'tokens'")

    # Only raw text needs the tokenizer, so it is loaded on this path alone.
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    return len(tokenizer.tokenize(input_data))



def truncate_text_by_tokens(text, max_tokens=3000):
    """Cut ``text`` down to at most ``max_tokens`` GPT-2 tokens.

    Args:
        text: The text to truncate.
        max_tokens: Maximum number of tokens to keep from the start of the text.

    Returns:
        The text rebuilt from the first ``max_tokens`` tokens.
    """
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

    # Tokenize, then keep only the leading max_tokens tokens
    truncated_tokens = tokenizer.tokenize(text)[:max_tokens]

    # The tokens are already a list, so len() is enough; going through
    # count_tokens would load the tokenizer a second time.
    print(f"Transcript Token Length:{len(truncated_tokens)}")

    # Convert the truncated tokens back to text
    return tokenizer.convert_tokens_to_string(truncated_tokens)





def gpt_translate_transcript(transcript_text, token_len, target_language="French"):
    """Translate a transcript with the OpenAI chat API and save the result.

    The transcript is first truncated to ``token_len`` tokens, then sent to
    ``gpt-4`` with a short scripted conversation asking for a translation.
    The translated text is written to ``inputs/transcript_summary.txt`` in the
    cloned repository (the file the voice-cloning cell reads) and returned.

    Args:
        transcript_text: The transcript to translate.
        token_len: Maximum number of transcript tokens sent to the model.
        target_language: Language to translate into; defaults to French.

    Returns:
        The translated text with surrounding whitespace stripped.
    """
    transcript_text_truncated = truncate_text_by_tokens(transcript_text, token_len)

    # NOTE(review): the prompt (up to token_len tokens) plus max_tokens must fit
    # in the model's context window -- confirm for the model in use.
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are an expert at Accurately translating text and preserving all nuance."},
            {"role": "user", "content": "I have a long transcript that I would like you to translate for me. Please think carefully and do the best job you possibly can."},
            # The scripted model reply uses the assistant role and promises a
            # translation, so it no longer contradicts the request by offering a summary.
            {"role": "assistant", "content": "Absolutely, I will provide an accurate and complete translation of the transcript."},
            {"role": "user", "content": "Excellent, here is the transcript. Please translate into " + target_language + ": " + transcript_text_truncated + "\nTranslation:\n"},
        ],
        max_tokens=4000,
        n=1,
        stop=None,
        temperature=0.5,
    )

    # Extract the generated translation from the response
    translation = response['choices'][0]['message']['content'].strip()
    print("translated by GPT-4")

    # Explicit UTF-8 so accented characters are written the same way everywhere
    with open("/content/Wav2Lip-GFPGAN/inputs/transcript_summary.txt", "w", encoding="utf-8") as file:
        file.write(translation)

    return translation




# Get the transcript from the video (also downloads the audio and video tracks into inputs/)
transcription = get_transcript("https://www.youtube.com/watch?v=m-UvMLEcJF8")
# Translate the transcript, truncated to at most 5000 tokens; the result is also saved to transcript_summary.txt
# NOTE(review): 5000 prompt tokens plus the 4000 completion tokens requested may exceed the model's context window -- confirm
translated = gpt_translate_transcript(transcription,5000)
# Get the token length of the full (untruncated) transcript
token_count = count_tokens(transcription)
print(token_count)








In [None]:
import os
from elevenlabslib import ElevenLabsUser
from IPython.display import Audio
from google.colab import files
from elevenlabslib.helpers import *

# Your ElevenLabs API key -- replace the placeholder before running; main() passes it to ElevenLabsUser
apiKey = "Your ElevenLabs API Key"

def main():
    """Clone the speaker's voice and speak the translated transcript with it.

    Steps:
      1. Log in to ElevenLabs with ``apiKey``.
      2. Delete any existing voice named "ClonedVoiceTest".
      3. Upload the downloaded audio track as the sample for a new cloned voice.
      4. Read the translated transcript and generate speech for it.
      5. Save the speech as ``inputs/output.wav`` and show an audio player.
    """
    voice_name = "ClonedVoiceTest"
    account = ElevenLabsUser(apiKey)

    # Remove a clone left over from an earlier run; an empty lookup result
    # raises IndexError on the [0] access, which means there is nothing to delete.
    try:
        stale_voice = account.get_voices_by_name(voice_name)[0]
        stale_voice.delete_voice()
        print("Voice found and deleted.")
    except IndexError:
        print("Voice not found, no need to delete it.")

    # Read the original audio track that serves as the voice sample
    sample_file = '/content/Wav2Lip-GFPGAN/inputs/j_audio.mp3'
    with open(sample_file, 'rb') as handle:
        sample_bytes = handle.read()
    sample_name = os.path.basename(sample_file)

    # Create the new cloned voice from the sample, keyed by its file name
    cloned_voice = account.clone_voice_bytes(voice_name, {sample_name: sample_bytes})

    # Load the translated text produced by the translation cell
    with open("/content/Wav2Lip-GFPGAN/inputs/transcript_summary.txt", "r") as handle:
        script = handle.read()

    # Generate the speech; the first element of the result is the audio data
    speech = cloned_voice.generate_audio_v2(script)[0]

    # Save the audio to a file and show a player for it
    wav_path = "/content/Wav2Lip-GFPGAN/inputs/output.wav"
    save_audio_bytes(speech, wav_path, "wav")
    display(Audio(wav_path))

# Run the voice-cloning step when this cell is executed as the main module
if __name__ == "__main__":
    main()


Voice not found, no need to delete it.


In [None]:
import os
outputPath = basePath+'/outputs'
inputAudioPath = basePath + '/inputs/output.mp3'
inputVideoPath = basePath + '/inputs/j_video.mp4'
lipSyncedOutputPath = basePath + '/outputs/result.mp4'

if not os.path.exists(outputPath):
  os.makedirs(outputPath)

!cd $wav2lipFolderName && python inference.py \
--checkpoint_path checkpoints/wav2lip.pth \
--face {inputVideoPath} \
--audio {inputAudioPath} \
--outfile {lipSyncedOutputPath}

In [None]:
# Install the GFPGAN package from the cloned sources in development mode.
!cd $gfpganFolderName && python setup.py develop
# Download the GFPGAN v1.3 weights into the folder given by -P (relative to the current directory).
!wget https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.3.pth -P {gfpganFolderName}'/experiments/pretrained_models'

In [None]:
import cv2
from tqdm import tqdm
from os import path

import os

# Read the lip-synced video and write one JPEG per frame into outputs/frames.
inputVideoPath = outputPath+'/result.mp4'
unProcessedFramesFolderPath = outputPath+'/frames'

os.makedirs(unProcessedFramesFolderPath, exist_ok=True)

vidcap = cv2.VideoCapture(inputVideoPath)
if not vidcap.isOpened():
    # Without this check the cell would report 0 frames and finish silently.
    raise RuntimeError("Could not open video: " + inputVideoPath)

numberOfFrames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
fps = vidcap.get(cv2.CAP_PROP_FPS)
print("FPS: ", fps, "Frames: ", numberOfFrames)

try:
    for frameNumber in tqdm(range(numberOfFrames)):
        success, image = vidcap.read()
        if not success:
            # The reported frame count can exceed the number of frames that
            # actually decode; stop instead of passing None to imwrite.
            break
        cv2.imwrite(path.join(unProcessedFramesFolderPath, str(frameNumber).zfill(4)+'.jpg'), image)
finally:
    # Free the capture handle even if writing a frame fails.
    vidcap.release()


In [None]:
# Run GFPGAN over the extracted frames (-i) and write the results under outputPath (-o);
# the next cell reads them from outputPath/restored_imgs/.
# NOTE(review): -v 1.3 presumably selects the v1.3 weights and -s 2 the upscaling factor -- confirm against inference_gfpgan.py.
!cd $gfpganFolderName && \
  python inference_gfpgan.py -i $unProcessedFramesFolderPath -o $outputPath -v 1.3 -s 2 --only_center_face --bg_upsampler None

In [None]:
import os
restoredFramesPath = outputPath + '/restored_imgs/'
processedVideoOutputPath = outputPath

# Order frames by number: shorter names first, then alphabetically, so that
# "10000.jpg" follows "9999.jpg" even though names are only padded to 4 digits.
dir_list = sorted(os.listdir(restoredFramesPath), key=lambda name: (len(name), name))

import cv2
import numpy as np

# Play the restored frames back at the frame rate of the lip-synced video so
# the picture stays aligned with the audio; fall back to 30 if it is unknown.
fpsProbe = cv2.VideoCapture(outputPath + '/result.mp4')
outputFps = fpsProbe.get(cv2.CAP_PROP_FPS)
fpsProbe.release()
if not outputFps or outputFps <= 0:
    outputFps = 30

# Frames are written in batches of batchSize so only one batch is held in memory.
batch = 0
batchSize = 300
from tqdm import tqdm
for start in tqdm(range(0, len(dir_list), batchSize)):
    end = start + batchSize
    print("processing ", start, end)

    # Load every readable image of this batch; unreadable files are skipped.
    img_array = []
    for filename in tqdm(dir_list[start:end]):
        img = cv2.imread(restoredFramesPath + filename)
        if img is None:
            continue
        img_array.append(img)

    if not img_array:
        # Nothing readable in this batch: write no file and keep `batch` equal
        # to the number of batch files that actually exist.
        continue

    height, width = img_array[0].shape[:2]
    out = cv2.VideoWriter(processedVideoOutputPath+'/batch_'+str(batch).zfill(4)+'.avi',cv2.VideoWriter_fourcc(*'DIVX'), outputFps, (width, height))
    batch = batch + 1

    try:
        for frame in img_array:
            out.write(frame)
    finally:
        # Always finalize the file, even if writing a frame fails.
        out.release()


In [None]:
concatTextFilePath = outputPath + "/concat.txt"
concatTextFile=open(concatTextFilePath,"w")
for ips in range(batch):
  concatTextFile.write("file batch_" + str(ips).zfill(4) + ".avi\n")
concatTextFile.close()

concatedVideoOutputPath = outputPath + "/concated_output.avi"
!ffmpeg -y -f concat -i {concatTextFilePath} -c copy {concatedVideoOutputPath}

finalProcessedOuputVideo = processedVideoOutputPath+'/final_with_audio.avi'
!ffmpeg -y -i {concatedVideoOutputPath} -i {inputAudioPath} -map 0 -map 1:a -c:v copy -shortest {finalProcessedOuputVideo}

from google.colab import files
files.download(finalProcessedOuputVideo)

To download your video, find and download this file from the folders on the left: /content/Wav2Lip-GFPGAN/outputs/final_with_audio.avi