In [7]:
from bark import SAMPLE_RATE, generate_audio, preload_models
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
from PIL import Image
from scipy.io.wavfile import write as write_wav
import moviepy.editor as mp
import cv2
import os

# One checkpoint id is shared by the captioning model, its image processor
# and its tokenizer, so name it once instead of repeating the literal.
CAPTION_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the captioning model and move it to the selected device.
model = VisionEncoderDecoderModel.from_pretrained(CAPTION_CHECKPOINT).to(device)
feature_extractor = ViTImageProcessor.from_pretrained(CAPTION_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(CAPTION_CHECKPOINT)

# Preload the Bark models (imported from `bark`) before any audio is generated.
preload_models()

In [8]:

# Seconds of video between two captioned frames.
interval = 5

# Path to the input video file.
video_path = "sample.mp4"
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    # Fail loudly instead of silently producing zero captions.
    raise IOError(f"Could not open video file: {video_path}")

# Extract frames and caption them.
captions = []
try:
    # Keep the fractional frame rate (e.g. 29.97): truncating it to an int
    # makes the sampling interval drift away from `interval` seconds.
    frame_rate = cap.get(cv2.CAP_PROP_FPS)
    if not frame_rate > 0:  # also rejects NaN
        raise ValueError(f"Invalid frame rate reported for {video_path}: {frame_rate}")

    # Caption one frame every `interval` seconds; never let the step reach 0,
    # which would raise ZeroDivisionError in the modulo below.
    frame_interval = max(1, round(frame_rate * interval))

    frame_id = 0  # index of the frame about to be read
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        if frame_id % frame_interval == 0:
            # OpenCV decodes frames as BGR; convert to RGB before building the PIL image.
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
            pixel_values = pixel_values.to(device)

            generated_ids = model.generate(pixel_values)
            generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
            captions.append(generated_text)

        frame_id += 1
finally:
    # Release the capture handle even if captioning fails part-way through.
    cap.release()

print("caption length: " + str(len(captions)))

caption length: 7


In [9]:

# Convert each caption to speech and save it as its own WAV file.
# `audio_files` collects the written file names in caption order.
audio_files = []
for i, caption in enumerate(captions):
    audio_file = f"audio_{i}.wav"
    write_wav(
        audio_file,
        SAMPLE_RATE,
        generate_audio(caption, history_prompt="v2/en_speaker_6"),
    )
    audio_files += [audio_file]


100%|██████████| 194/194 [00:05<00:00, 36.32it/s]
100%|██████████| 10/10 [00:15<00:00,  1.58s/it]
100%|██████████| 300/300 [00:06<00:00, 44.94it/s]
100%|██████████| 15/15 [00:22<00:00,  1.50s/it]
100%|██████████| 168/168 [00:04<00:00, 38.65it/s]
100%|██████████| 9/9 [00:12<00:00,  1.36s/it]
100%|██████████| 336/336 [00:07<00:00, 44.39it/s]
100%|██████████| 17/17 [00:25<00:00,  1.50s/it]
100%|██████████| 209/209 [00:04<00:00, 43.27it/s]
100%|██████████| 11/11 [00:15<00:00,  1.42s/it]
100%|██████████| 205/205 [00:04<00:00, 41.06it/s]
100%|██████████| 11/11 [00:15<00:00,  1.38s/it]
100%|██████████| 133/133 [00:02<00:00, 45.05it/s]
100%|██████████| 7/7 [00:09<00:00,  1.37s/it]


In [10]:

# Load the source video.
video_clip = mp.VideoFileClip(video_path)

# Build one audio clip per caption, each starting at the timestamp of the
# frame it describes (caption i belongs to second i * interval).
audio_clips = []
try:
    for i, audio_file in enumerate(audio_files):
        audio_clip = mp.AudioFileClip(audio_file)
        audio_clips.append(audio_clip.set_start(i * interval, change_end=True))

    if audio_clips:
        # CompositeAudioClip honours every clip's start time.
        # concatenate_audioclips would re-time the clips back-to-back and
        # discard the offsets set above, so narration would drift off the frames.
        # The track is cut/padded to the video length so it never outlasts the video.
        final_audio = mp.CompositeAudioClip(audio_clips).set_duration(video_clip.duration)
        # Attach the narration track to the video.
        final_clip = video_clip.set_audio(final_audio)
    else:
        # No captions were produced: write the video out unchanged.
        final_clip = video_clip

    final_clip.write_videofile("final_output_video.mp4", codec="libx264")
finally:
    # Close the readers so the underlying files are released
    # (open handles can prevent deleting the temporary WAV files).
    for audio_clip in audio_clips:
        audio_clip.close()
    video_clip.close()


Moviepy - Building video final_output_video.mp4.
MoviePy - Writing audio in final_output_videoTEMP_MPY_wvf_snd.mp3


                                                                    

MoviePy - Done.
Moviepy - Writing video final_output_video.mp4



                                                               

Moviepy - Done !
Moviepy - video ready final_output_video.mp4


In [11]:

# Delete the temporary audio files.
for audio_file in audio_files:
    try:
        os.remove(audio_file)
    except FileNotFoundError:
        # Already deleted (e.g. this cell was re-run); nothing left to clean up.
        pass