In [2]:
import tempfile

import captacity
from moviepy.config import change_settings
import time
from moviepy.editor import VideoFileClip, CompositeVideoClip
import subprocess
import os

from captacity import create_shadow, transcriber
from captacity import segment_parser
from captacity.text_drawer import Word, create_text_ex, get_text_size_ex

change_settings({"IMAGEMAGICK_BINARY": r"C:\Program Files\ImageMagick-7.1.1-Q16-HDRI\magick.exe"})


def ffmpeg(command):
    return subprocess.run(command, capture_output=True)

def get_font_path(font):
    if os.path.exists(font):
        return font

    dirname = os.path.dirname(__file__)
    font = os.path.join(dirname, "assets", "fonts", font)

    if not os.path.exists(font):
        raise FileNotFoundError(f"Font '{font}' not found")

    return font

def detect_local_whisper(print_info):
    try:
        import whisper
        use_local_whisper = True
        if print_info:
            print("Using local whisper model...")
    except ImportError:
        use_local_whisper = False
        if print_info:
            print("Using OpenAI Whisper API...")

    return use_local_whisper
def calculate_lines(text, font, font_size, stroke_width, frame_width):
    global lines_cache

    arg_hash = hash((text, font, font_size, stroke_width, frame_width))

    if arg_hash in lines_cache:
        return lines_cache[arg_hash]

    lines = []

    line_to_draw = None
    line = ""
    words = text.split()
    word_index = 0
    total_height = 0
    while word_index < len(words):
        word = words[word_index]
        line += word + " "
        text_size = get_text_size_ex(line.strip(), font, font_size, stroke_width)
        text_width = text_size[0]
        line_height = text_size[1]

        if text_width < frame_width:
            line_to_draw = {
                "text": line.strip(),
                "height": line_height,
            }
            word_index += 1
        else:
            if not line_to_draw:
                print(f"NOTICE: Word '{line.strip()}' is too long for the frame!")
                line_to_draw = {
                    "text": line.strip(),
                    "height": line_height,
                }
                word_index += 1

            lines.append(line_to_draw)
            total_height += line_height
            line_to_draw = None
            line = ""

    if line_to_draw:
        lines.append(line_to_draw)
        total_height += line_height

    data = {
        "lines": lines,
        "height": total_height,
    }

    lines_cache[arg_hash] = data

    return data

def fits_frame(line_count, font, font_size, stroke_width, frame_width):
    def fit_function(text):
        lines = calculate_lines(
            text,
            font,
            font_size,
            stroke_width,
            frame_width
        )
        return len(lines["lines"]) <= line_count
    return fit_function


video_file="b.mp4",
output_file="my_short_with_captions1.mp4",
use_local_whisper=True,


output_file = "with_transcript.mp4",

font = "Bangers-Regular.ttf",
font_size = 130,
font_color = "yellow",

stroke_width = 3,
stroke_color = "black",

highlight_current_word = True,
word_highlight_color = "red",

line_count = 2,
fit_function = None,

padding = 50,
position = ("center", "center"), # TODO: Implement this

shadow_strength = 1.0,
shadow_blur = 0.1,

print_info = False,

initial_prompt = None,
segments = None,

use_local_whisper = "auto",

_start_time = time.time()

font = get_font_path(font)

if print_info:
    print("Extracting audio...")

temp_audio_file = tempfile.NamedTemporaryFile(suffix=".wav").name
ffmpeg([
    'ffmpeg',
    '-y',
    '-i', video_file,
    temp_audio_file
])

if segments is None:
    if print_info:
        print("Transcribing audio...")

    if use_local_whisper == "auto":
        use_local_whisper = detect_local_whisper(print_info)

    if use_local_whisper:
        segments = transcriber.transcribe_locally(temp_audio_file, initial_prompt)
    else:
        segments = transcriber.transcribe_with_api(temp_audio_file, initial_prompt)

if print_info:
    print("Generating video elements...")

# Open the video file
video = VideoFileClip(video_file)
text_bbox_width = video.w-padding*2
clips = [video]

captions = segment_parser.parse(
    segments=segments,
    fit_function=fit_function if fit_function else fits_frame(
        line_count,
        font,
        font_size,
        stroke_width,
        text_bbox_width,
    ),
)

for caption in captions:
    captions_to_draw = []
    if highlight_current_word:
        for i, word in enumerate(caption["words"]):
            if i+1 < len(caption["words"]):
                end = caption["words"][i+1]["start"]
            else:
                end = word["end"]

            captions_to_draw.append({
                "text": caption["text"],
                "start": word["start"],
                "end": end,
            })
    else:
        captions_to_draw.append(caption)

    for current_index, caption in enumerate(captions_to_draw):
        line_data = calculate_lines(caption["text"], font, font_size, stroke_width, text_bbox_width)

        text_y_offset = video.h // 2 - line_data["height"] // 2
        index = 0
        for line in line_data["lines"]:
            pos = ("center", text_y_offset)

            words = line["text"].split()
            word_list = []
            for w in words:
                word_obj = Word(w)
                if highlight_current_word and index == current_index:
                    word_obj.set_color(word_highlight_color)
                index += 1
                word_list.append(word_obj)

            # Create shadow
            shadow_left = shadow_strength
            while shadow_left >= 1:
                shadow_left -= 1
                shadow = create_shadow(line["text"], font_size, font, shadow_blur, opacity=1)
                shadow = shadow.set_start(caption["start"])
                shadow = shadow.set_duration(caption["end"] - caption["start"])
                shadow = shadow.set_position(pos)
                clips.append(shadow)

            if shadow_left > 0:
                shadow = create_shadow(line["text"], font_size, font, shadow_blur, opacity=shadow_left)
                shadow = shadow.set_start(caption["start"])
                shadow = shadow.set_duration(caption["end"] - caption["start"])
                shadow = shadow.set_position(pos)
                clips.append(shadow)

            # Create text
            text = create_text_ex(word_list, font_size, font_color, font, stroke_color=stroke_color, stroke_width=stroke_width)
            text = text.set_start(caption["start"])
            text = text.set_duration(caption["end"] - caption["start"])
            text = text.set_position(pos)
            clips.append(text)

            text_y_offset += line["height"]

end_time = time.time()
generation_time = end_time - _start_time

if print_info:
    print(f"Generated in {generation_time//60:02.0f}:{generation_time%60:02.0f} ({len(clips)} clips)")

if print_info:
    print("Rendering video...")

video_with_text = CompositeVideoClip(clips)

video_with_text.write_videofile(
    filename=output_file,
    codec="libx264",
    fps=video.fps,
    logger="bar" if print_info else None,
)

end_time = time.time()
total_time = end_time - _start_time
render_time = total_time - generation_time

if print_info:
    print(f"Generated in {generation_time//60:02.0f}:{generation_time%60:02.0f}")
    print(f"Rendered in {render_time//60:02.0f}:{render_time%60:02.0f}")
    print(f"Done in {total_time//60:02.0f}:{total_time%60:02.0f}")


TypeError: _path_exists: path should be string, bytes, os.PathLike or integer, not tuple