## Generating Text Description

Author: Jaivin Jacob

In [None]:
!pip install torch torchvision transformers opencv-python pandas tqdm

In [3]:
import os
import cv2
import pandas as pd
import torch
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
from tqdm import tqdm

In [4]:
# # Load CSV file
# df = pd.read_csv('movie_description.csv')

# Function to extract frames from video
def extract_frames(video_path, num_frames=16):
    """Sample up to ``num_frames`` frames spread evenly over a video.

    Frame positions are chosen as ``k * total_frames // count`` for
    ``k = 0 .. count - 1`` (``count`` is ``num_frames`` capped at the number
    of frames the container reports), so the samples cover the whole clip
    rather than only its beginning. Each frame that decodes successfully is
    converted from BGR to RGB and resized to 224x224.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        num_frames: Maximum number of frames to return; must be >= 1.

    Returns:
        A list of frames in sampling order. The list is shorter than
        ``num_frames`` when the video has fewer frames or some reads fail,
        and empty when the reported frame count is 0.

    Raises:
        ValueError: If ``num_frames`` is less than 1.
    """
    if num_frames < 1:
        raise ValueError("num_frames must be a positive integer")

    frames = []
    cap = cv2.VideoCapture(video_path)
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames <= 0:
            # Nothing to sample; the finally clause still releases the capture.
            return frames

        count = min(num_frames, total_frames)
        # Integer arithmetic keeps indices strictly inside [0, total_frames).
        indices = [(k * total_frames) // count for k in range(count)]

        for index in indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, index)
            ret, frame = cap.read()
            if not ret:
                # Skip frames that fail to decode instead of aborting.
                continue
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(cv2.resize(frame, (224, 224)))
    finally:
        # Release the capture even if conversion or resizing raises.
        cap.release()
    return frames

In [5]:
# Load the pre-trained captioning model together with its matching feature
# extractor and tokenizer; all three come from the same checkpoint id.
CAPTION_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"
model = VisionEncoderDecoderModel.from_pretrained(CAPTION_CHECKPOINT)
feature_extractor = ViTFeatureExtractor.from_pretrained(CAPTION_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(CAPTION_CHECKPOINT)

In [6]:
# Function to generate captions
def generate_caption(model, feature_extractor, tokenizer, frames):
    """Produce one caption that represents a list of video frames.

    Every frame is captioned in a single batch. The caption that occurs most
    often across the frames is returned; when several captions are equally
    frequent, the one produced by the earliest frame wins. Surrounding
    whitespace is stripped from each decoded caption.

    Args:
        model: Object exposing ``generate(pixel_values)`` and returning one
            token-id sequence per frame.
        feature_extractor: Callable accepting ``images=`` and
            ``return_tensors=`` and returning an object with ``pixel_values``.
        tokenizer: Object exposing ``decode(ids, skip_special_tokens=...)``.
        frames: Non-empty list of frames to caption.

    Returns:
        The selected caption as a string.

    Raises:
        ValueError: If ``frames`` is empty.
    """
    from collections import Counter

    if len(frames) == 0:
        raise ValueError("No frames to caption; check that the video could be read.")

    inputs = feature_extractor(images=frames, return_tensors="pt")
    pixel_values = inputs.pixel_values
    output_ids = model.generate(pixel_values)

    # Decode every sequence, not just the first, so all sampled frames count.
    captions = [
        tokenizer.decode(ids, skip_special_tokens=True).strip()
        for ids in output_ids
    ]

    # most_common(1) keeps the first-seen caption among equally frequent ones.
    caption, _ = Counter(captions).most_common(1)[0]
    return caption

In [7]:
# Function to generate caption for a specific video
def generate_caption_for_video(video_path, model, feature_extractor, tokenizer):
    """Caption the video at ``video_path``.

    Frames are pulled with ``extract_frames`` (default settings) and passed,
    together with the supplied model objects, to ``generate_caption``; that
    function's result is returned unchanged.
    """
    sampled_frames = extract_frames(video_path)
    return generate_caption(model, feature_extractor, tokenizer, sampled_frames)

# Example usage: caption one sample clip with the model objects loaded earlier.
video_path = 'sample_video_1.mp4'  # Specify the path to the video
caption = generate_caption_for_video(
    video_path=video_path,
    model=model,
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)
print(f"Generated Caption: {caption}")

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Generated Caption: a dirt road with a few trees and a few cars 


## Generating Audio From Caption

In [None]:
!pip install gtts

In [None]:
!pip install moviepy pydub speechrecognition

In [8]:
# Display the caption string that the text-to-speech cell below reads aloud.
caption

'a dirt road with a few trees and a few cars '

In [9]:
from gtts import gTTS
import os

# Language code passed to gTTS as `lang`
language = 'en'

# Build the gTTS object for the caption text (slow=False is passed explicitly)
speech = gTTS(text=caption, lang=language, slow=False)

# Write the synthesized speech to "caption.mp3" in the current working
# directory; later cells load the file by this exact name.
speech.save("caption.mp3")

# # Playing the converted file 
# os.system("start caption.mp3")

## Placing Audio Description on Video

In [16]:
from moviepy.editor import VideoFileClip, concatenate_videoclips, AudioFileClip
from pydub import AudioSegment, silence
import moviepy.video.fx.all as vfx
import speech_recognition as sr

In [2]:
# Step 2: Open the same file twice - once as a moviepy clip for editing and
# once as a pydub segment for audio analysis.
video = VideoFileClip("sample_video_1.mp4")
audio = AudioSegment.from_file("sample_video_1.mp4", format="mp4")

In [10]:
# Step 3: Find silent parts in the audio. A stretch counts as silent when it
# lasts at least 1000 ms and stays 16 dB below the clip's average loudness.
# detect_silence reports milliseconds; the ranges are stored in seconds.
silent_ranges = [
    (start / 1000, stop / 1000)
    for start, stop in silence.detect_silence(
        audio,
        min_silence_len=1000,
        silence_thresh=audio.dBFS - 16,
    )
]

In [11]:
# Step 4: Insert the audio description into a silent part (if available)
if silent_ranges:
    # Step 5: Load the audio description first so its length can guide the
    # choice of silent range.
    caption_audio = AudioFileClip("caption.mp3")

    # Prefer the earliest silent range that can hold the whole caption, so the
    # spoken description is not cut off mid-sentence. If none is long enough,
    # fall back to the longest range and trim the caption to it.
    fitting_ranges = [
        (start, stop)
        for start, stop in silent_ranges
        if (stop - start) >= caption_audio.duration
    ]
    if fitting_ranges:
        start_silent, end_silent = fitting_ranges[0]
    else:
        start_silent, end_silent = max(silent_ranges, key=lambda r: r[1] - r[0])
    print(f"Silent part found from {start_silent} to {end_silent} seconds")

    # Never let the caption run past the end of the chosen silent part
    duration = min(caption_audio.duration, end_silent - start_silent)

    # Trim the caption audio to that duration (a no-op when it already fits)
    caption_audio = caption_audio.subclip(0, duration)

    # Step 6: Split the video into three parts: before, silent part, and after
    video_before = video.subclip(0, start_silent)
    video_silent = video.subclip(start_silent, start_silent + duration)
    video_after = video.subclip(start_silent + duration)

    # Replace the silent part's audio with the caption audio
    video_silent = video_silent.set_audio(caption_audio)

    # Concatenate the video parts back together
    final_video = concatenate_videoclips([video_before, video_silent, video_after])

    # Step 7: Save the final video
    final_video.write_videofile("final_video_with_caption.mp4", codec="libx264", audio_codec="aac")
else:
    print("No silent part found in the video.")

Silent part found from 0.0 to 3.377 seconds
Moviepy - Building video final_video_with_caption.mp4.
MoviePy - Writing audio in final_video_with_captionTEMP_MPY_wvf_snd.mp4


                                                                   

MoviePy - Done.
Moviepy - Writing video final_video_with_caption.mp4



                                                              

Moviepy - Done !
Moviepy - video ready final_video_with_caption.mp4


## Handling Videos With Background Music

In [19]:
from moviepy.editor import VideoFileClip, concatenate_videoclips, AudioFileClip
from pydub import AudioSegment
import speech_recognition as sr
import os
from gtts import gTTS

import shutil

# Ensure ffmpeg is installed and available in the system PATH.
# An explicit check (rather than `assert os.system(...)`) still runs when
# Python is started with -O, does not spawn a shell, and does not print the
# ffmpeg version banner into the cell output.
if shutil.which("ffmpeg") is None:
    raise RuntimeError("FFmpeg not found, please install and add it to the system PATH")

# Load the video and extract audio
video_path = "sample_video_1.mp4"
video = VideoFileClip(video_path)
if video.audio is None:
    # Without an audio track there is nothing to analyse for speech.
    raise ValueError(f"{video_path} has no audio track")
audio_path = "temp_audio.wav"
video.audio.write_audiofile(audio_path)

# Load the audio using pydub
audio = AudioSegment.from_file(audio_path, format="wav")

# Initialize the recognizer
recognizer = sr.Recognizer()

# Define the step (chunk size) in milliseconds
step = 1000  # 1 second

# List to store non-speech ranges as (start_seconds, end_seconds)
non_speech_ranges = []

# Total audio length in milliseconds; used to clamp the final, shorter chunk
total_ms = len(audio)

# Loop through the audio file in chunks
for i in range(0, total_ms, step):
    # Clamp the end so the last range never extends past the end of the audio
    chunk_end = min(i + step, total_ms)
    chunk = audio[i:chunk_end]
    chunk_path = f"chunk_{i // step}.wav"
    chunk.export(chunk_path, format="wav")

    try:
        # Recognize speech in the chunk
        with sr.AudioFile(chunk_path) as source:
            audio_data = recognizer.record(source)
            try:
                text = recognizer.recognize_google(audio_data)
                print(f"Speech detected: '{text}' at {i / 1000}s to {chunk_end / 1000}s")
            except sr.UnknownValueError:
                print(f"No speech detected at {i / 1000}s to {chunk_end / 1000}s")
                non_speech_ranges.append((i / 1000, chunk_end / 1000))
            except sr.RequestError as e:
                # The chunk is left unclassified when the request itself fails
                print(f"Could not request results; {e}")
    finally:
        # Clean up the chunk file even if recognition raised unexpectedly
        os.remove(chunk_path)

# Clean up the extracted audio file
os.remove(audio_path)

# Merge back-to-back non-speech ranges: a range whose start equals the end of
# the range being built extends it; any other range closes it and opens a new one.
combined_non_speech_ranges = []
current_range = None

for start, end in non_speech_ranges:
    if current_range is not None and start == current_range[1]:
        # Contiguous with the open range: just push its end forward
        current_range[1] = end
        continue
    if current_range is not None:
        # Gap found: the open range is complete
        combined_non_speech_ranges.append(tuple(current_range))
    current_range = [start, end]

# Flush the range still open after the loop, if any
if current_range is not None:
    combined_non_speech_ranges.append(tuple(current_range))

# Display non-speech ranges
print("Non-speech ranges detected:", combined_non_speech_ranges)

# # Convert text to speech and save it as an audio file
# caption = "Your text description about the video goes here."
# language = 'en'
# speech = gTTS(text=caption, lang=language, slow=False)
# speech.save("caption.mp3")

# Load the spoken description and note how long it runs
caption_audio = AudioFileClip("caption.mp3")
caption_duration = caption_audio.duration

# Pick the first non-speech range at least as long as the description;
# the for/else leaves None when no range qualifies.
for start, end in combined_non_speech_ranges:
    if (end - start) >= caption_duration:
        suitable_range = (start, end)
        break
else:
    suitable_range = None

if suitable_range is None:
    print("No suitable non-speech part found in the video.")
else:
    start_non_speech, end_non_speech = suitable_range
    print(f"Inserting audio description into non-speech part from {start_non_speech} to {end_non_speech} seconds")

    # Take the description from 0 to its own duration
    caption_audio = caption_audio.subclip(0, caption_duration)

    # Cut the video at the start of the range and again where the description ends
    insert_end = start_non_speech + caption_duration
    video_before = video.subclip(0, start_non_speech)
    video_non_speech = video.subclip(start_non_speech, insert_end)
    video_after = video.subclip(insert_end)

    # The middle segment plays the description instead of its original audio
    video_non_speech = video_non_speech.set_audio(caption_audio)

    # Stitch the three segments back together in order
    final_video = concatenate_videoclips([video_before, video_non_speech, video_after])

    # Write the result to disk
    final_video.write_videofile("final_video_with_caption.mp4", codec="libx264", audio_codec="aac")

                                                                
[A                                                             

chunk:  91%|█████████▏| 21/23 [14:21<01:22, 41.00s/it, now=None]
chunk:  83%|████████▎ | 19/23 [12:45<02:41, 40.28s/it, now=None][A

chunk:  78%|███████▊  | 18/23 [11:36<03:13, 38.68s/it, now=None][A[A

MoviePy - Writing audio in temp_audio.wav





chunk:   0%|          | 0/241 [00:00<?, ?it/s, now=None][A[A[A


                                                                A
[A                                                             

chunk:  91%|█████████▏| 21/23 [14:21<01:22, 41.01s/it, now=None]
chunk:  83%|████████▎ | 19/23 [12:45<02:41, 40.29s/it, now=None][A

chunk:  78%|███████▊  | 18/23 [11:36<03:13, 38.69s/it, now=None][A[A

MoviePy - Done.
No speech detected at 0.0s to 1.0s
No speech detected at 1.0s to 2.0s
No speech detected at 2.0s to 3.0s
No speech detected at 3.0s to 4.0s
Speech detected: 'it's awesome' at 4.0s to 5.0s
No speech detected at 5.0s to 6.0s
No speech detected at 6.0s to 7.0s
No speech detected at 7.0s to 8.0s
No speech detected at 8.0s to 9.0s
No speech detected at 9.0s to 10.0s
No speech detected at 10.0s to 11.0s
Non-speech ranges detected: [(0.0, 4.0), (5.0, 11.0)]
Inserting audio description into non-speech part from 0.0 to 4.0 seconds


                                                                
[A                                                             

chunk:  91%|█████████▏| 21/23 [14:27<01:22, 41.32s/it, now=None]
chunk:  83%|████████▎ | 19/23 [12:52<02:42, 40.64s/it, now=None][A

                                                                [A[A
[A                                                             

chunk:  91%|█████████▏| 21/23 [14:27<01:22, 41.32s/it, now=None]
chunk:  83%|████████▎ | 19/23 [12:52<02:42, 40.64s/it, now=None][A

chunk:  78%|███████▊  | 18/23 [11:43<03:15, 39.06s/it, now=None][A[A

Moviepy - Building video final_video_with_caption.mp4.
MoviePy - Writing audio in final_video_with_captionTEMP_MPY_wvf_snd.mp4





chunk:   0%|          | 0/241 [00:00<?, ?it/s, now=None][A[A[A


chunk:  12%|█▏        | 28/241 [00:00<00:00, 269.86it/s, now=None][A[A[A


chunk:  31%|███       | 75/241 [00:00<00:00, 357.50it/s, now=None][A[A[A


                                                                  [A[A[A
[A                                                             

chunk:  91%|█████████▏| 21/23 [14:28<01:22, 41.34s/it, now=None]
chunk:  83%|████████▎ | 19/23 [12:52<02:42, 40.65s/it, now=None][A

                                                                [A[A
[A                                                             

chunk:  91%|█████████▏| 21/23 [14:28<01:22, 41.34s/it, now=None]
chunk:  83%|████████▎ | 19/23 [12:52<02:42, 40.65s/it, now=None][A

chunk:  78%|███████▊  | 18/23 [11:43<03:15, 39.08s/it, now=None][A[A

MoviePy - Done.
Moviepy - Writing video final_video_with_caption.mp4






t:   0%|          | 0/328 [00:00<?, ?it/s, now=None][A[A[A


t:   2%|▏         | 7/328 [00:00<00:05, 63.81it/s, now=None][A[A[A


t:   5%|▍         | 15/328 [00:00<00:04, 72.26it/s, now=None][A[A[A


t:   7%|▋         | 24/328 [00:00<00:03, 77.49it/s, now=None][A[A[A


t:  10%|▉         | 32/328 [00:00<00:03, 77.06it/s, now=None][A[A[A


t:  12%|█▏        | 40/328 [00:00<00:04, 71.68it/s, now=None][A[A[A


t:  15%|█▍        | 48/328 [00:00<00:04, 65.90it/s, now=None][A[A[A


t:  17%|█▋        | 55/328 [00:00<00:04, 61.53it/s, now=None][A[A[A


t:  19%|█▉        | 62/328 [00:00<00:04, 54.06it/s, now=None][A[A[A


t:  21%|██        | 68/328 [00:01<00:05, 51.27it/s, now=None][A[A[A


t:  23%|██▎       | 74/328 [00:01<00:05, 45.42it/s, now=None][A[A[A


t:  24%|██▍       | 79/328 [00:01<00:05, 44.46it/s, now=None][A[A[A


t:  26%|██▌       | 84/328 [00:01<00:05, 44.44it/s, now=None][A[A[A


t:  27%|██▋       | 89/328 [00:01<00:05, 43.99it/s, now=No

Moviepy - Done !
Moviepy - video ready final_video_with_caption.mp4
