# Set up

In [1]:
import os
from pathlib import Path

from natsort import natsorted
import torch

from src.alacen.alacen import ALACen
from src.alacen.asr.whisper import Whisper
from src.alacen.paraphrase.pegasus import PegasusAlacen
from src.alacen.tts.voicecraft.voicecraft import VoiceCraftTTS, VoiceCraftArgs
from src.alacen.lipsync.diff2lip.diff2lip import Diff2Lip, Diff2LipArgs

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Build one component per pipeline stage. They are created in the same order
# in which they are handed to ALACen below: speech recognition, paraphrasing,
# text-to-speech, and lip-sync.
asr = Whisper()
paraphrase = PegasusAlacen()
tts = VoiceCraftTTS(model_name="330M_TTSEnhanced")
lipsync_args = Diff2LipArgs()  # default Diff2Lip settings
lipsync = Diff2Lip(lipsync_args)

# Assemble the full pipeline from the four components.
alacen = ALACen(asr, paraphrase, tts, lipsync)

  from .autonotebook import tqdm as notebook_tqdm
Dora directory: /tmp/audiocraft_20200884


# Configure

In [2]:
# --- Input / output locations (relative to the notebook's working directory) ---
VIDEO_DIR = Path("videos")   # directory scanned for input videos
OUT_DIR = Path("output")     # directory handed to ALACen for generated files

# --- Pipeline options ---
NUM_PARAPHRASES = 5          # forwarded to alacen.run as num_paraphrases
VERBOSE = True               # forwarded to alacen.run as verbose

In [8]:
# Collect every regular file directly inside VIDEO_DIR (sub-directories are
# skipped) as a path prefixed with VIDEO_DIR, ordered naturally by file name.
video_list = natsorted(
    (entry for entry in VIDEO_DIR.iterdir() if entry.is_file()),
    key=lambda entry: entry.name,
)
video_list

[PosixPath('videos/vid2.mp4'), PosixPath('videos/vid2_1.mp4')]

# Run ALACen

In [9]:
# Keyword options shared by every alacen.run call in the loop below.
run_options = dict(
    num_paraphrases=NUM_PARAPHRASES,
    device=device,
    verbose=VERBOSE,
    clean_up=True,
)

# Process the videos one at a time, numbering them from 1 in the printed header.
# NOTE(review): VoiceCraftArgs is passed as the class itself rather than an
# instance (Diff2LipArgs is instantiated during setup) — confirm this is what
# ALACen.run expects.
for i, video in enumerate(video_list, start=1):
    print(f"Video {i}: {video}")
    alacen.run(video, OUT_DIR, VoiceCraftArgs, **run_options)
    print()

[2024-05-30 11:19:28,088 | alacen | DEBUG] Extracting audio from video...


Video 1: videos/vid2.mp4


[2024-05-30 11:19:28,490 | alacen | DEBUG] Performing speech recognition...
[2024-05-30 11:19:33,144 | alacen | DEBUG] Transcript:  If I ever find one of these lying around again, I swear to fucking God, I will stop being so polite.
[2024-05-30 11:19:33,147 | alacen | DEBUG] Generating paraphrase...


Please choose the best paraphrase among the following:
1. If I ever find one of these lying around again, I swear to myself, I will stop being so polite.
2. If I ever find one of these lying around again, I swear to God, I will stop being so polite.
3. If I ever encounter one of these individuals again, I will not hesitate to express my strong disapproval.
4. If I ever find one of these again, I swear to God, I will stop being so polite.
5. I'm so used to being so polite, I might as well stop.


[2024-05-30 11:19:42,659 | alacen | DEBUG] Generating new audio...


Selected paraphrase: If I ever find one of these again, I swear to God, I will stop being so polite.
Generated audio file saved to 'output/vid2_gen_seed-1.wav'


[2024-05-30 11:22:13,787 | alacen | DEBUG] Generating lip-synced video...
DEBUG:alacen:Generating lip-synced video...


MPI.COMM_WORLD.Get_rank() 0
os.environ["CUDA_VISIBLE_DEVICES"] 0
MPI.COMM_WORLD.Get_rank() 1
os.environ["CUDA_VISIBLE_DEVICES"] 1
MPI.COMM_WORLD.Get_rank() 2
os.environ["CUDA_VISIBLE_DEVICES"] 2
Recovering from OOM error; New batch size: 32
Recovering from OOM error; New batch size: 32
Recovering from OOM error; New batch size: 32
Recovering from OOM error; New batch size: 16
Recovering from OOM error; New batch size: 16
Recovering from OOM error; New batch size: 16
Recovering from OOM error; New batch size: 8
Time taken for sampling,  48.52311706542969 ,time without all  gather,  47.948182106018066 ,frames/gpu,  109 ,total frames,  109
(71680,) (109, 540, 960, 3)
(69760,) (109, 540, 960, 3)


[2024-05-30 11:26:57,051 | alacen | DEBUG] DONE
DEBUG:alacen:DONE
[2024-05-30 11:26:57,057 | alacen | DEBUG] Extracting audio from video...
DEBUG:alacen:Extracting audio from video...



Video 2: videos/vid2_1.mp4


[2024-05-30 11:26:57,444 | alacen | DEBUG] Performing speech recognition...
DEBUG:alacen:Performing speech recognition...
[2024-05-30 11:26:59,509 | alacen | DEBUG] Transcript:  If I ever find one of these lying around again, I swear to fucking God, I will stop being so polite.
DEBUG:alacen:Transcript:  If I ever find one of these lying around again, I swear to fucking God, I will stop being so polite.
[2024-05-30 11:26:59,512 | alacen | DEBUG] Generating paraphrase...
DEBUG:alacen:Generating paraphrase...


Please choose the best paraphrase among the following:
1. If I ever find one of these lying around again, I swear to God, I will stop being so polite.
2. If I ever find one of these lying around again, I swear to God, I will stop being so polite.
3. If I ever encounter one of these individuals again, I would respectfully decline to engage in such language.
4. If I ever find one of these lying around again, I vow to stop being so polite.
5. If I ever encounter one of these individuals again, I will be forced to reconsider my approach to this situation.
1. If I ever find one of these lying around again, I swear to God, I will stop being so polite.
2. If I ever find one of these lying around again, I swear to God, I will stop being so polite.
3. If I ever find one of these lying around again, I swear to God, I will stop being so polite.
4. If I ever encounter one of these individuals again, I will be forced to reconsider my approach to life.
5. If I ever find one of these again, I promise

[2024-05-30 11:32:07,996 | alacen | DEBUG] Generating new audio...
DEBUG:alacen:Generating new audio...


Selected paraphrase: If I ever find one of these lying around again, I swear to God, I will stop being so polite.




Generated audio file saved to 'output/vid2_1_gen_seed-1.wav'


[2024-05-30 11:35:23,982 | alacen | DEBUG] Generating lip-synced video...
DEBUG:alacen:Generating lip-synced video...


MPI.COMM_WORLD.Get_rank() 2
os.environ["CUDA_VISIBLE_DEVICES"] 2
MPI.COMM_WORLD.Get_rank() 0
os.environ["CUDA_VISIBLE_DEVICES"] 0
MPI.COMM_WORLD.Get_rank() 1
os.environ["CUDA_VISIBLE_DEVICES"] 1
Recovering from OOM error; New batch size: 32
Recovering from OOM error; New batch size: 32
Recovering from OOM error; New batch size: 32
Recovering from OOM error; New batch size: 16
Recovering from OOM error; New batch size: 16
Recovering from OOM error; New batch size: 16
Recovering from OOM error; New batch size: 8
Time taken for sampling,  34.01254940032959 ,time without all  gather,  33.17655611038208 ,frames/gpu,  78 ,total frames,  78
(52160,) (78, 540, 960, 3)
(49920,) (78, 540, 960, 3)


[2024-05-30 11:39:51,533 | alacen | DEBUG] DONE
DEBUG:alacen:DONE



