# Set up

In [1]:
import os
from pathlib import Path

from natsort import natsorted
import torch

from src.alacen.alacen import ALACen
from src.alacen.asr.whisper import Whisper
from src.alacen.paraphrase.pegasus import PegasusAlacen
from src.alacen.tts.voicecraft.voicecraft import VoiceCraftTTS, VoiceCraftArgs
from src.alacen.lipsync.diff2lip.diff2lip import Diff2Lip, Diff2LipArgs

# Choose the compute device once; later cells pass it to alacen.run(...).
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Build the pipeline stages in the same order they are handed to ALACen:
# speech recognition -> paraphrasing -> text-to-speech -> lip sync.
asr = Whisper()
paraphrase = PegasusAlacen()
tts = VoiceCraftTTS(model_name="330M_TTSEnhanced")

# NOTE(review): num_gpus is hard-coded to 3 (the recorded lip-sync output shows
# MPI ranks 0-2); adjust it for the machine that runs this notebook.
lipsync_args = Diff2LipArgs(num_gpus=3)
lipsync = Diff2Lip(lipsync_args)

alacen = ALACen(asr, paraphrase, tts, lipsync)

  from .autonotebook import tqdm as notebook_tqdm
Dora directory: /tmp/audiocraft_20200884


# Configure

In [2]:
# Forwarded as `verbose=` to every alacen.run(...) call in the mode cells below.
VERBOSE = True
# Directory whose files are listed and used as pipeline inputs.
VIDEO_DIR = Path("videos")
# Root of the results tree; OUT_DIR / mode is passed to alacen.run(...) and the
# recorded output shows generated files landing under e.g. output/auto/.
OUT_DIR = Path("output")
# Forwarded as `num_paraphrases=`; presumably the number of candidate
# paraphrases produced per transcript (the recorded semi-automatic run lists 5).
NUM_PARAPHRASES = 5
# VoiceCraft TTS settings, passed positionally to alacen.run(...).
# NOTE(review): the exact meaning of `padding` and `num_samples` is defined by
# VoiceCraftArgs — confirm there before changing these values.
TTS_ARGS = VoiceCraftArgs.constructor(padding="end", num_samples=5)

In [3]:
# Names of the regular files that sit directly inside VIDEO_DIR
# (sub-directories are skipped).
video_names = [
    name for name in os.listdir(VIDEO_DIR) if os.path.isfile(VIDEO_DIR / name)
]
# Natural-sort the names, then turn each one into a path rooted at VIDEO_DIR.
video_list = [VIDEO_DIR / name for name in natsorted(video_names)]
video_list

[PosixPath('videos/vid2.mp4'), PosixPath('videos/vid2_1.mp4')]

# Mode: fully automatic

In [4]:
# Run the pipeline over every input video in "auto" mode
# (the notebook's "fully automatic" section).
mode = "auto"

# Keyword options shared by every alacen.run(...) call in this cell.
run_options = dict(
    num_paraphrases=NUM_PARAPHRASES,
    merge_av=True,
    mode=mode,
    device=device,
    verbose=VERBOSE,
    clean_up=True,
)

for i, video in enumerate(video_list, start=1):
    print(f"Video {i}: {video}", flush=True)
    # Results for this mode go under OUT_DIR / mode.
    alacen.run(video, OUT_DIR / mode, TTS_ARGS, **run_options)
    print()  # blank line separating consecutive videos in the output

Video 1: videos/vid2.mp4


[2024-06-01 03:37:32,320 | alacen | DEBUG] Extracting audio from video...
[2024-06-01 03:37:32,742 | alacen | DEBUG] Performing speech recognition...
[2024-06-01 03:37:37,134 | alacen | DEBUG] Transcript:  If I ever find one of these lying around again, I swear to fucking God, I will stop being so polite.
[2024-06-01 03:37:37,137 | alacen | DEBUG] Generating paraphrase...


Selected paraphrase: If I ever encounter one of these again, I will be compelled to reconsider my approach to life.


[2024-06-01 03:37:40,179 | alacen | DEBUG] Generating new audio...
[2;36m [0m[36mDEBUG   [0m Beginning run for vid2                                                
[2;36m [0m[36mDEBUG   [0m Using [32m"global"[0m profile                                                
[2;36m [0m[36mDEBUG   [0m Using multiprocessing with [1;36m1[0m                                          
[2;36m [0m[36mDEBUG   [0m Set up logger for MFA version: [1;36m2.2[0m.[1;36m17[0m                                 
[2;36m [0m[36mDEBUG   [0m Cleaned previous run                                                  
[2;36m [0m[36mDEBUG   [0m There were some differences in the current run compared to the last   
[2;36m [0m         one. This may cause issues, run with --clean, if you hit an error.    
[2;36m [0m[36mDEBUG   [0m Using UNKNOWN                                                         
[2;36m [0m[36mDEBUG   [0m Loaded dictionary in [1;36m26.319[0m seconds                

[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/100 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[36mDEBUG   [0m Processing queue: [1;36m0.09864504800000162[0m                                 
[2;36m [0m[36mDEBUG   [0m Parsed corpus directory with [1;36m1[0m jobs in [1;36m0.14182822600000122[0m seconds    
[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[36mDEBUG   [0m Loaded corpus in [1;36m1.293[0m seconds                                        
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Initialized jobs in [1;36m0.150[0m seconds                                     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Wrote lexicon information in [1;36m0.263[0m seconds                            
[2;36m [0m[32mINFO    [0m Creating corpus split for feature generation[33m...[0m                       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[36mDEBUG   [0m Created corpus split directory in [1;36m1.069[0m seconds                       
[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Generating MFCCs took [1;36m2.233[0m seconds                                   
[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Generating final features took [1;36m1.306[0m seconds                          
[2;36m [0m[32mINFO    [0m Creating corpus split with features[33m...[0m                                


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Generated features in [1;36m4.966[0m seconds                                   
[2;36m [0m[36mDEBUG   [0m Setting up corpus took [1;36m38.376[0m seconds                                 
[2;36m [0m[36mDEBUG   [0m                                                                       
[2;36m [0m[36mDEBUG   [0m ====ACOUSTIC MODEL [33mINFO[0m====                                           
[2;36m [0m[36mDEBUG   [0m Acoustic model root directory:                                        
[2;36m [0m         [35m/mnt/home/20200884/Documents/MFA/extracted_models/[0m[95macoustic[0m            
[2;36m [0m[36mDEBUG   [0m Acoustic model dirname:                                               
[2;36m [0m         [35m/mnt/home/20200884/Documents/MFA/extracted_models/acoustic/[0m[95menglish_us_[0m
[2;36m [0m         [95marpa_acoustic[0m                                                         
[2;36m [0m[36mDEBUG   [0m Acoustic mod

[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[36mDEBUG   [0m Compiling training graphs took [1;36m1.383[0m seconds                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Alignment round took [1;36m1.818[0m seconds                                    
[2;36m [0m[32mINFO    [0m Calculating fMLLR for speaker adaptation[33m...[0m                           


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Fmllr calculation took [1;36m1.773[0m seconds                                  
[2;36m [0m[32mINFO    [0m Performing second-pass alignment[33m...[0m                                   
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[36mDEBUG   [0m Alignment round took [1;36m1.862[0m seconds                                    
[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m For job [1;36m0[0m:                                                            
[2;36m [0m[36mDEBUG   [0m [1;36m0[0m beam too narrow                                                     
[2;36m [0m[36mDEBUG   [0m [1;36m652[0m total frames                                                      
[2;36m [0m[36mDEBUG   [0m [1;36m-50.258[0m average log-likelihood                                        
[2;36m [0m[36mDEBUG   [0m Average per frame likelihood for alignment: [1;36m-50.258[0m                   
[2;36m [0m[36mDEBUG   [0m Compiling information took [1;36m1.074[0m seconds                              
[2;36m [0m[36mDEBUG   [0m Generated alignments in [1;36m12.398[0m seconds                                
[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to videos/vid2/mfa_alignments[33m...[0m        
[2;36m [0m[36mDEBUG   [0m Not using multiprocessing for TextGrid export                         
[2;3

[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h



Generated audio file saved to 'output/auto/vid2_gen_1.wav'


[2024-06-01 03:39:27,069 | alacen | DEBUG] Generating lip-synced video...
DEBUG:alacen:Generating lip-synced video...


MPI.COMM_WORLD.Get_rank() 2
os.environ["CUDA_VISIBLE_DEVICES"] 2
MPI.COMM_WORLD.Get_rank() 0
os.environ["CUDA_VISIBLE_DEVICES"] 0
MPI.COMM_WORLD.Get_rank() 1
os.environ["CUDA_VISIBLE_DEVICES"] 1
Recovering from OOM error; New batch size: 32
Recovering from OOM error; New batch size: 32
Recovering from OOM error; New batch size: 32
Recovering from OOM error; New batch size: 16
Recovering from OOM error; New batch size: 16
Recovering from OOM error; New batch size: 16
Recovering from OOM error; New batch size: 8
Time taken for sampling,  67.95308685302734 ,time without all  gather,  67.10107278823853 ,frames/gpu,  160 ,total frames,  160
(104397,) (160, 540, 960, 3)
(102400,) (160, 540, 960, 3)


[2024-06-01 03:44:32,459 | alacen | DEBUG] Merging generated audio and video...
DEBUG:alacen:Merging generated audio and video...
[2024-06-01 03:44:32,882 | alacen | DEBUG] DONE
DEBUG:alacen:DONE



Video 2: videos/vid2_1.mp4


[2024-06-01 03:44:32,890 | alacen | DEBUG] Extracting audio from video...
DEBUG:alacen:Extracting audio from video...
[2024-06-01 03:44:33,265 | alacen | DEBUG] Performing speech recognition...
DEBUG:alacen:Performing speech recognition...
[2024-06-01 03:44:35,449 | alacen | DEBUG] Transcript:  If I ever find one of these lying around again, I swear to fucking God, I will stop being so polite.
DEBUG:alacen:Transcript:  If I ever find one of these lying around again, I swear to fucking God, I will stop being so polite.
[2024-06-01 03:44:35,453 | alacen | DEBUG] Generating paraphrase...
DEBUG:alacen:Generating paraphrase...


Selected paraphrase: If I ever find one of these lying around again, I must respectfully decline to engage in such behavior.


[2024-06-01 03:44:38,326 | alacen | DEBUG] Generating new audio...
DEBUG:alacen:Generating new audio...
[2;36m [0m[36mDEBUG   [0m Beginning run for vid2_1                                              
[2;36m [0m[36mDEBUG   [0m Using [32m"global"[0m profile                                                
[2;36m [0m[36mDEBUG   [0m Using multiprocessing with [1;36m1[0m                                          
[2;36m [0m[36mDEBUG   [0m Set up logger for MFA version: [1;36m2.2[0m.[1;36m17[0m                                 
[2;36m [0m[36mDEBUG   [0m Cleaned previous run                                                  
[2;36m [0m[36mDEBUG   [0m There were some differences in the current run compared to the last   
[2;36m [0m         one. This may cause issues, run with --clean, if you hit an error.    
[2;36m [0m[36mDEBUG   [0m Using UNKNOWN                                                         
[2;36m [0m[36mDEBUG   [0m Loaded dictionary in [1;

[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/100 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[36mDEBUG   [0m Processing queue: [1;36m0.09287748599999901[0m                                 
[2;36m [0m[36mDEBUG   [0m Parsed corpus directory with [1;36m1[0m jobs in [1;36m0.13902382299999516[0m seconds    
[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[36mDEBUG   [0m Loaded corpus in [1;36m1.292[0m seconds                                        
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Initialized jobs in [1;36m0.129[0m seconds                                     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Wrote lexicon information in [1;36m0.265[0m seconds                            
[2;36m [0m[32mINFO    [0m Creating corpus split for feature generation[33m...[0m                       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[36mDEBUG   [0m Created corpus split directory in [1;36m1.078[0m seconds                       
[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Generating MFCCs took [1;36m2.197[0m seconds                                   
[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Generating final features took [1;36m1.332[0m seconds                          
[2;36m [0m[32mINFO    [0m Creating corpus split with features[33m...[0m                                


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Generated features in [1;36m4.930[0m seconds                                   
[2;36m [0m[36mDEBUG   [0m Setting up corpus took [1;36m38.276[0m seconds                                 
[2;36m [0m[36mDEBUG   [0m                                                                       
[2;36m [0m[36mDEBUG   [0m ====ACOUSTIC MODEL [33mINFO[0m====                                           
[2;36m [0m[36mDEBUG   [0m Acoustic model root directory:                                        
[2;36m [0m         [35m/mnt/home/20200884/Documents/MFA/extracted_models/[0m[95macoustic[0m            
[2;36m [0m[36mDEBUG   [0m Acoustic model dirname:                                               
[2;36m [0m         [35m/mnt/home/20200884/Documents/MFA/extracted_models/acoustic/[0m[95menglish_us_[0m
[2;36m [0m         [95marpa_acoustic[0m                                                         
[2;36m [0m[36mDEBUG   [0m Acoustic mod

[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[36mDEBUG   [0m Compiling training graphs took [1;36m1.369[0m seconds                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Alignment round took [1;36m1.848[0m seconds                                    
[2;36m [0m[32mINFO    [0m Calculating fMLLR for speaker adaptation[33m...[0m                           


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Fmllr calculation took [1;36m1.795[0m seconds                                  
[2;36m [0m[32mINFO    [0m Performing second-pass alignment[33m...[0m                                   
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Alignment round took [1;36m1.812[0m seconds                                    
[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m For job [1;36m0[0m:                                                            
[2;36m [0m[36mDEBUG   [0m [1;36m0[0m beam too narrow                                                     
[2;36m [0m[36mDEBUG   [0m [1;36m652[0m total frames                                                      
[2;36m [0m[36mDEBUG   [0m [1;36m-50.258[0m average log-likelihood                                        
[2;36m [0m[36mDEBUG   [0m Average per frame likelihood for alignment: [1;36m-50.258[0m                   
[2;36m [0m[36mDEBUG   [0m Compiling information took [1;36m1.084[0m seconds                              
[2;36m [0m[36mDEBUG   [0m Generated alignments in [1;36m12.340[0m seconds                                
[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to videos/vid2_1/mfa_alignments[33m...[0m      
[2;36m [0m[36mDEBUG   [0m Not using multiprocessing for TextGrid export                         
[2;3

[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h



Generated audio file saved to 'output/auto/vid2_1_gen_1.wav'


[2024-06-01 03:46:29,492 | alacen | DEBUG] Generating lip-synced video...
DEBUG:alacen:Generating lip-synced video...


MPI.COMM_WORLD.Get_rank() 0
os.environ["CUDA_VISIBLE_DEVICES"] 0
MPI.COMM_WORLD.Get_rank() 1
os.environ["CUDA_VISIBLE_DEVICES"] 1
MPI.COMM_WORLD.Get_rank() 2
os.environ["CUDA_VISIBLE_DEVICES"] 2
Recovering from OOM error; New batch size: 32
Recovering from OOM error; New batch size: 32
Recovering from OOM error; New batch size: 32
Recovering from OOM error; New batch size: 16
Recovering from OOM error; New batch size: 16
Recovering from OOM error; New batch size: 16
Recovering from OOM error; New batch size: 8
Time taken for sampling,  68.72399759292603 ,time without all  gather,  67.10439038276672 ,frames/gpu,  162 ,total frames,  162
(111040,) (162, 540, 960, 3)
(103680,) (162, 540, 960, 3)


[2024-06-01 03:51:34,588 | alacen | DEBUG] Merging generated audio and video...
DEBUG:alacen:Merging generated audio and video...
[2024-06-01 03:51:35,036 | alacen | DEBUG] DONE
DEBUG:alacen:DONE





# Mode: semi-automatic

In [5]:
# Run the pipeline over every input video in "semi" mode
# (the notebook's "semi-automatic" section; in the recorded output this mode
# asks the user to choose among the candidate paraphrases).
mode = "semi"

# Keyword options shared by every alacen.run(...) call in this cell.
run_options = dict(
    num_paraphrases=NUM_PARAPHRASES,
    merge_av=True,
    mode=mode,
    device=device,
    verbose=VERBOSE,
    clean_up=True,
)

for i, video in enumerate(video_list, start=1):
    print(f"Video {i}: {video}", flush=True)
    # Results for this mode go under OUT_DIR / mode.
    alacen.run(video, OUT_DIR / mode, TTS_ARGS, **run_options)
    print()  # blank line separating consecutive videos in the output

Video 1: videos/vid2.mp4


[2024-06-01 03:12:48,064 | alacen | DEBUG] Extracting audio from video...
DEBUG:alacen:Extracting audio from video...
[2024-06-01 03:12:48,415 | alacen | DEBUG] Performing speech recognition...
DEBUG:alacen:Performing speech recognition...
[2024-06-01 03:12:50,510 | alacen | DEBUG] Transcript:  If I ever find one of these lying around again, I swear to fucking God, I will stop being so polite.
DEBUG:alacen:Transcript:  If I ever find one of these lying around again, I swear to fucking God, I will stop being so polite.
[2024-06-01 03:12:50,512 | alacen | DEBUG] Generating paraphrase...
DEBUG:alacen:Generating paraphrase...


Please choose the best paraphrase among the following:
1. If I ever find one of these lying around again, I swear to myself that I will stop being so polite.
2. If I ever find one of these lying around again, I swear to God, I will stop being so polite.
3. If I ever locate one of these again, I will be compelled to reconsider my approach to this situation.
4. If I ever find one of these lying around again, I swear to God, I will stop being so polite.
5. If I ever encounter one of these individuals again, I would deeply regret to inform you that I am no longer able to display such gracious behavior.
Selected paraphrase: If I ever find one of these lying around again, I swear to God, I will stop being so polite.


[2024-06-01 03:13:04,277 | alacen | DEBUG] Generating new audio...
DEBUG:alacen:Generating new audio...
[2;36m [0m[36mDEBUG   [0m Beginning run for vid2                                                
[2;36m [0m[36mDEBUG   [0m Using [32m"global"[0m profile                                                
[2;36m [0m[36mDEBUG   [0m Using multiprocessing with [1;36m1[0m                                          
[2;36m [0m[36mDEBUG   [0m Set up logger for MFA version: [1;36m2.2[0m.[1;36m17[0m                                 
[2;36m [0m[36mDEBUG   [0m Cleaned previous run                                                  
[2;36m [0m[36mDEBUG   [0m There were some differences in the current run compared to the last   
[2;36m [0m         one. This may cause issues, run with --clean, if you hit an error.    
[2;36m [0m[36mDEBUG   [0m Using UNKNOWN                                                         
[2;36m [0m[36mDEBUG   [0m Loaded dictionary in [1;

[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/100 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[36mDEBUG   [0m Processing queue: [1;36m0.10242348399999912[0m                                 
[2;36m [0m[36mDEBUG   [0m Parsed corpus directory with [1;36m1[0m jobs in [1;36m0.14615926300000126[0m seconds    
[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[36mDEBUG   [0m Loaded corpus in [1;36m1.294[0m seconds                                        
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Initialized jobs in [1;36m0.138[0m seconds                                     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Wrote lexicon information in [1;36m0.301[0m seconds                            
[2;36m [0m[32mINFO    [0m Creating corpus split for feature generation[33m...[0m                       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[36mDEBUG   [0m Created corpus split directory in [1;36m1.080[0m seconds                       
[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Generating MFCCs took [1;36m2.251[0m seconds                                   
[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Generating final features took [1;36m1.290[0m seconds                          
[2;36m [0m[32mINFO    [0m Creating corpus split with features[33m...[0m                                


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Generated features in [1;36m4.970[0m seconds                                   
[2;36m [0m[36mDEBUG   [0m Setting up corpus took [1;36m39.201[0m seconds                                 
[2;36m [0m[36mDEBUG   [0m                                                                       
[2;36m [0m[36mDEBUG   [0m ====ACOUSTIC MODEL [33mINFO[0m====                                           
[2;36m [0m[36mDEBUG   [0m Acoustic model root directory:                                        
[2;36m [0m         [35m/mnt/home/20200884/Documents/MFA/extracted_models/[0m[95macoustic[0m            
[2;36m [0m[36mDEBUG   [0m Acoustic model dirname:                                               
[2;36m [0m         [35m/mnt/home/20200884/Documents/MFA/extracted_models/acoustic/[0m[95menglish_us_[0m
[2;36m [0m         [95marpa_acoustic[0m                                                         
[2;36m [0m[36mDEBUG   [0m Acoustic mod

[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[36mDEBUG   [0m Compiling training graphs took [1;36m1.399[0m seconds                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Alignment round took [1;36m1.845[0m seconds                                    
[2;36m [0m[32mINFO    [0m Calculating fMLLR for speaker adaptation[33m...[0m                           


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Fmllr calculation took [1;36m1.829[0m seconds                                  
[2;36m [0m[32mINFO    [0m Performing second-pass alignment[33m...[0m                                   
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[36mDEBUG   [0m Alignment round took [1;36m1.895[0m seconds                                    
[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m For job [1;36m0[0m:                                                            
[2;36m [0m[36mDEBUG   [0m [1;36m0[0m beam too narrow                                                     
[2;36m [0m[36mDEBUG   [0m [1;36m652[0m total frames                                                      
[2;36m [0m[36mDEBUG   [0m [1;36m-50.258[0m average log-likelihood                                        
[2;36m [0m[36mDEBUG   [0m Average per frame likelihood for alignment: [1;36m-50.258[0m                   
[2;36m [0m[36mDEBUG   [0m Compiling information took [1;36m1.093[0m seconds                              
[2;36m [0m[36mDEBUG   [0m Generated alignments in [1;36m12.540[0m seconds                                
[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to videos/vid2/mfa_alignments[33m...[0m        
[2;36m [0m[36mDEBUG   [0m Not using multiprocessing for TextGrid export                         
[2;3

[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h



Generated audio files saved to:
  1. output/semi/vid2_gen_1.wav
  2. output/semi/vid2_gen_2.wav
  3. output/semi/vid2_gen_3.wav
  4. output/semi/vid2_gen_4.wav
  5. output/semi/vid2_gen_5.wav


[2024-06-01 03:15:10,116 | alacen | DEBUG] Generating lip-synced video...
DEBUG:alacen:Generating lip-synced video...


MPI.COMM_WORLD.Get_rank() 0
os.environ["CUDA_VISIBLE_DEVICES"] 0
MPI.COMM_WORLD.Get_rank() 2
os.environ["CUDA_VISIBLE_DEVICES"] 2
MPI.COMM_WORLD.Get_rank() 1
os.environ["CUDA_VISIBLE_DEVICES"] 1
Recovering from OOM error; New batch size: 32
Recovering from OOM error; New batch size: 32
Recovering from OOM error; New batch size: 32
Recovering from OOM error; New batch size: 16
Recovering from OOM error; New batch size: 16
Recovering from OOM error; New batch size: 16
Recovering from OOM error; New batch size: 8
Time taken for sampling,  68.96771717071533 ,time without all  gather,  67.08974552154541 ,frames/gpu,  160 ,total frames,  160
(104397,) (160, 540, 960, 3)
(102400,) (160, 540, 960, 3)


[2024-06-01 03:20:15,560 | alacen | DEBUG] Merging generated audio and video...
DEBUG:alacen:Merging generated audio and video...
[2024-06-01 03:20:16,014 | alacen | DEBUG] DONE
DEBUG:alacen:DONE



Video 2: videos/vid2_1.mp4


[2024-06-01 03:20:16,020 | alacen | DEBUG] Extracting audio from video...
DEBUG:alacen:Extracting audio from video...
[2024-06-01 03:20:16,421 | alacen | DEBUG] Performing speech recognition...
DEBUG:alacen:Performing speech recognition...
[2024-06-01 03:20:18,435 | alacen | DEBUG] Transcript:  If I ever find one of these lying around again, I swear to fucking God, I will stop being so polite.
DEBUG:alacen:Transcript:  If I ever find one of these lying around again, I swear to fucking God, I will stop being so polite.
[2024-06-01 03:20:18,438 | alacen | DEBUG] Generating paraphrase...
DEBUG:alacen:Generating paraphrase...


Please choose the best paraphrase among the following:
1. If I ever find one of these lying around again, I swear to God, I will stop being so polite.
2. If I ever encounter one of these again, I'll be compelled to reconsider my approach.
3. If I ever encounter one of these individuals again, I will be compelled to reconsider my current approach.
4. If I ever find one of these lying around again, I would truly appreciate it if you would stop being so polite.
5. If I ever find one of these lying around again, I vow to stop being so polite.
Selected paraphrase: If I ever find one of these lying around again, I swear to God, I will stop being so polite.


[2024-06-01 03:21:58,199 | alacen | DEBUG] Generating new audio...
DEBUG:alacen:Generating new audio...
[2;36m [0m[36mDEBUG   [0m Beginning run for vid2_1                                              
[2;36m [0m[36mDEBUG   [0m Using [32m"global"[0m profile                                                
[2;36m [0m[36mDEBUG   [0m Using multiprocessing with [1;36m1[0m                                          
[2;36m [0m[36mDEBUG   [0m Set up logger for MFA version: [1;36m2.2[0m.[1;36m17[0m                                 
[2;36m [0m[36mDEBUG   [0m Cleaned previous run                                                  
[2;36m [0m[36mDEBUG   [0m There were some differences in the current run compared to the last   
[2;36m [0m         one. This may cause issues, run with --clean, if you hit an error.    
[2;36m [0m[36mDEBUG   [0m Using UNKNOWN                                                         
[2;36m [0m[36mDEBUG   [0m Loaded dictionary in [1;

[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/100 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[36mDEBUG   [0m Processing queue: [1;36m0.10017157100000418[0m                                 
[2;36m [0m[36mDEBUG   [0m Parsed corpus directory with [1;36m1[0m jobs in [1;36m0.13687058000000007[0m seconds    
[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[36mDEBUG   [0m Loaded corpus in [1;36m1.288[0m seconds                                        
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Initialized jobs in [1;36m0.156[0m seconds                                     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Wrote lexicon information in [1;36m0.263[0m seconds                            
[2;36m [0m[32mINFO    [0m Creating corpus split for feature generation[33m...[0m                       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[36mDEBUG   [0m Created corpus split directory in [1;36m1.082[0m seconds                       
[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Generating MFCCs took [1;36m2.166[0m seconds                                   
[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Generating final features took [1;36m1.273[0m seconds                          
[2;36m [0m[32mINFO    [0m Creating corpus split with features[33m...[0m                                


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Generated features in [1;36m4.846[0m seconds                                   
[2;36m [0m[36mDEBUG   [0m Setting up corpus took [1;36m38.204[0m seconds                                 
[2;36m [0m[36mDEBUG   [0m                                                                       
[2;36m [0m[36mDEBUG   [0m ====ACOUSTIC MODEL [33mINFO[0m====                                           
[2;36m [0m[36mDEBUG   [0m Acoustic model root directory:                                        
[2;36m [0m         [35m/mnt/home/20200884/Documents/MFA/extracted_models/[0m[95macoustic[0m            
[2;36m [0m[36mDEBUG   [0m Acoustic model dirname:                                               
[2;36m [0m         [35m/mnt/home/20200884/Documents/MFA/extracted_models/acoustic/[0m[95menglish_us_[0m
[2;36m [0m         [95marpa_acoustic[0m                                                         
[2;36m [0m[36mDEBUG   [0m Acoustic mod

[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[36mDEBUG   [0m Compiling training graphs took [1;36m1.388[0m seconds                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Alignment round took [1;36m1.851[0m seconds                                    
[2;36m [0m[32mINFO    [0m Calculating fMLLR for speaker adaptation[33m...[0m                           


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Fmllr calculation took [1;36m1.818[0m seconds                                  
[2;36m [0m[32mINFO    [0m Performing second-pass alignment[33m...[0m                                   
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[36mDEBUG   [0m Alignment round took [1;36m1.912[0m seconds                                    
[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m For job [1;36m0[0m:                                                            
[2;36m [0m[36mDEBUG   [0m [1;36m0[0m beam too narrow                                                     
[2;36m [0m[36mDEBUG   [0m [1;36m652[0m total frames                                                      
[2;36m [0m[36mDEBUG   [0m [1;36m-50.258[0m average log-likelihood                                        
[2;36m [0m[36mDEBUG   [0m Average per frame likelihood for alignment: [1;36m-50.258[0m                   
[2;36m [0m[36mDEBUG   [0m Compiling information took [1;36m1.085[0m seconds                              
[2;36m [0m[36mDEBUG   [0m Generated alignments in [1;36m12.756[0m seconds                                
[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to videos/vid2_1/mfa_alignments[33m...[0m      
[2;36m [0m[36mDEBUG   [0m Not using multiprocessing for TextGrid export                         
[2;3

[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h



Generated audio files saved to:
  1. output/semi/vid2_1_gen_1.wav
  2. output/semi/vid2_1_gen_2.wav
  3. output/semi/vid2_1_gen_3.wav
  4. output/semi/vid2_1_gen_4.wav
  5. output/semi/vid2_1_gen_5.wav


[2024-06-01 03:23:41,931 | alacen | DEBUG] Generating lip-synced video...
DEBUG:alacen:Generating lip-synced video...


MPI.COMM_WORLD.Get_rank() 0
os.environ["CUDA_VISIBLE_DEVICES"] 0
MPI.COMM_WORLD.Get_rank() 2
os.environ["CUDA_VISIBLE_DEVICES"] 2
MPI.COMM_WORLD.Get_rank() 1
os.environ["CUDA_VISIBLE_DEVICES"] 1
Recovering from OOM error; New batch size: 32
Recovering from OOM error; New batch size: 32
Recovering from OOM error; New batch size: 32
Recovering from OOM error; New batch size: 16
Recovering from OOM error; New batch size: 16
Recovering from OOM error; New batch size: 16
Recovering from OOM error; New batch size: 8
Time taken for sampling,  68.47502422332764 ,time without all  gather,  66.95443415641785 ,frames/gpu,  160 ,total frames,  160
(104397,) (160, 540, 960, 3)
(102400,) (160, 540, 960, 3)


[2024-06-01 03:28:45,622 | alacen | DEBUG] Merging generated audio and video...
DEBUG:alacen:Merging generated audio and video...
[2024-06-01 03:28:46,027 | alacen | DEBUG] DONE
DEBUG:alacen:DONE



