# Set up

In [1]:
import os
from pathlib import Path

from natsort import natsorted
import torch

from src.alacen.alacen import ALACen
from src.alacen.asr.whisper import Whisper
from src.alacen.paraphrase.pegasus import PegasusAlacen
from src.alacen.tts.voicecraft.voicecraft import VoiceCraftTTS, VoiceCraftArgs
from src.alacen.lipsync.diff2lip.diff2lip import Diff2Lip, Diff2LipArgs

# Preferred GPU index for the device string that is later handed to
# `alacen.run(..., device=device)`.
PREFERRED_GPU_INDEX = 3

if torch.cuda.is_available():
    # Clamp to the highest visible GPU index so hosts with fewer than four GPUs
    # still get a valid device instead of failing on a non-existent "cuda:3".
    device = f"cuda:{min(PREFERRED_GPU_INDEX, torch.cuda.device_count() - 1)}"
else:
    device = "cpu"

# Pipeline stages: speech recognition, paraphrasing, speech synthesis, lip sync.
asr = Whisper()
paraphrase = PegasusAlacen()
tts = VoiceCraftTTS(model_name="330M_TTSEnhanced")
# NOTE(review): num_gpus=3 is hard-coded; presumably Diff2Lip needs that many
# GPUs to be visible -- confirm behaviour on smaller hosts.
lipsync = Diff2Lip(Diff2LipArgs(num_gpus=3))

# Facade object that chains the four stages; used by the run cell below.
alacen = ALACen(asr, paraphrase, tts, lipsync)

  from .autonotebook import tqdm as notebook_tqdm
Dora directory: /tmp/audiocraft_20200884


# Configure

In [2]:
# --- Input / output locations -------------------------------------------------
# Directory whose files are collected into `video_list`.
VIDEO_DIR = Path("demo")
# Output directory passed to `alacen.run`.
OUT_DIR = Path("output")

# --- Generation settings ------------------------------------------------------
# Forwarded to `alacen.run` as `num_paraphrases`.
NUM_PARAPHRASES = 5
# Forwarded to `VoiceCraftArgs.constructor` as `num_samples`.
NUM_SPEECHES = 5
# Forwarded to `alacen.run` as `mode`.
# NOTE(review): "semi" presumably means the user picks among candidates -- confirm.
MODE = "semi"

# --- Logging ------------------------------------------------------------------
# Forwarded to `alacen.run` as `verbose`.
VERBOSE = True

In [3]:
# Collect every regular file directly inside VIDEO_DIR as a Path, ordered
# naturally by file name (so "vid2" sorts before "vid10").
video_list = natsorted(
    (entry for entry in VIDEO_DIR.iterdir() if entry.is_file()),
    key=lambda entry: entry.name,
)
video_list

[PosixPath('demo/vid32.mp4')]

# Run ALACen

In [4]:
# Keyword options shared by every `alacen.run` call; all come from the
# configuration and set-up cells.
run_options = dict(
    num_paraphrases=NUM_PARAPHRASES,
    device=device,
    mode=MODE,
    verbose=VERBOSE,
    clean_up=True,
)

# Process the videos one at a time, numbering them from 1 in the printed header.
for i, video in enumerate(video_list, start=1):
    print(f"Video {i}: {video}")
    # Fresh TTS arguments are built for each video, as in the original call.
    tts_args = VoiceCraftArgs.constructor(padding="end", num_samples=NUM_SPEECHES)
    alacen.run(video, OUT_DIR, tts_args, **run_options)
    print()  # blank line between videos

[2024-06-17 22:15:56,935 | alacen.alacen.session | DEBUG] Extracting audio from video...


Video 1: demo/vid32.mp4


[2024-06-17 22:15:57,259 | alacen.alacen.session | DEBUG] Performing speech recognition...
[2024-06-17 22:16:01,823 | alacen.alacen.session | DEBUG] Transcript:  smoked so much weed he actually had to write a song called, hey, what's my motherfucking name?
[2024-06-17 22:16:01,825 | alacen.alacen.session | DEBUG] Generating paraphrase...


Please choose the best paraphrase among the following:
1. He used a lot of marijuana, and it made it difficult for him to write a song.
2. He was so high on marijuana that he had to write a song called "Hey, what's my mother's maiden name?"
3. He struggled with excessive use of marijuana, and it led to the creation of a song called "I'm Yours."
4. He used a lot of marijuana, and he actually had to write a song called "I'm not sure what my name is."
5. He used a lot of marijuana, and it led to the creation of a song called "Hey, I'm not sure what my name is."
Selected paraphrase: He was so high on marijuana that he had to write a song called "Hey, what's my mother's maiden name?"


[2024-06-17 22:16:25,111 | alacen.alacen.session | DEBUG] Generating new audio...


Generated audio files saved to:
  1. output/vid32_gen_1.wav
  2. output/vid32_gen_2.wav
  3. output/vid32_gen_3.wav
  4. output/vid32_gen_4.wav
  5. output/vid32_gen_5.wav


[2024-06-17 22:18:19,733 | alacen.alacen.session | DEBUG] Generating lip-synced video...
DEBUG:alacen.alacen.session:Generating lip-synced video...


MPI.COMM_WORLD.Get_rank() 0
os.environ["CUDA_VISIBLE_DEVICES"] 0
MPI.COMM_WORLD.Get_rank() 2
os.environ["CUDA_VISIBLE_DEVICES"] 2
MPI.COMM_WORLD.Get_rank() 1
os.environ["CUDA_VISIBLE_DEVICES"] 1
Recovering from OOM error; New batch size: 32
Recovering from OOM error; New batch size: 32
Recovering from OOM error; New batch size: 32
Recovering from OOM error; New batch size: 16
Recovering from OOM error; New batch size: 16
Recovering from OOM error; New batch size: 16
Recovering from OOM error; New batch size: 8
Recovering from OOM error; New batch size: 8
Recovering from OOM error; New batch size: 8
Time taken for sampling,  54.34663701057434 ,time without all  gather,  50.214728116989136 ,frames/gpu,  120 ,total frames,  120
(88000,) (120, 720, 1280, 3)
(76800,) (120, 720, 1280, 3)


[2024-06-17 22:22:08,574 | alacen.alacen.session | DEBUG] Merging generated audio and video...
DEBUG:alacen.alacen.session:Merging generated audio and video...



