In [1]:
import asyncio

from app.core.config import app_config
from app.core.pipeline import pipeline
from app.utils.make_callback import make_callback


async def test():
    """Dump the app config as JSON, then run `pipeline` once with a printing callback.

    The run uses the fixed topic "Binary Trees" and the "obama" voice; every
    message handed to the callback is printed as-is.
    """
    print(app_config.model_dump_json(indent=2))

    async def print_wrapper(message: str):
        # Async shim so `print` can be wrapped by make_callback.
        print(message)

    await pipeline(
        callback=make_callback(print_wrapper),
        topic="Binary Trees",
        voice="obama",
    )

# full test run
# await test()


In [2]:
# from app.core.prompts
# edit prompt here

# Instructions text for the planning prompt. A later cell passes it to
# generate_plan(plan_prompt_instructions=...).
PLAN_PROMPT_INSTRUCTIONS = """
You are an expert video content creator specializing in creating engaging Fireship-style explainer videos.
"""

# Template for the planning prompt. A later cell passes it to
# generate_plan(plan_prompt_template=...).
# The only double-brace placeholder is `topic`
# (Jinja-style syntax; presumably rendered inside generate_plan -- TODO confirm).
# NOTE(review): the "RULES FOR ASSETS" section contains the typo "frustation";
# left unchanged here because it is part of the prompt text sent to the model.
PLAN_PROMPT_TEMPLATE = """
Create a detailed plan for a 20-second Fireship-style explainer video about the following topic:
{{ topic }}

# GENERAL RULES
- There **must** be a total of 4 scenes, each 5 seconds long, for a total video duration of 20 seconds.
- Each scene **must** represent one cut.
- Narration should be punchy and opinionated, similar to Fireship videos.
- The video **must** be funny and reference popular internet culture or memes where appropriate.
- Select a main and secondary font to use for the entire video, for cohesiveness of the video. (This will be used during editing.) The fonts should be commonly available.
- DO NOT add any branding or mention Fireship in any way.
- DO NOT add any generic call to action (e.g., "like and subscribe", "follow for more").
- When executing the plan, we will have access to the manim animation tool.

# For each scene, provide the following details
1. Scene number (0-indexed)
2. Rough duration of the scene (in seconds)
3. Description of scene visuals
4. Narration/script for the scene
5. Description of any sound effects or background music
6. Edit notes (e.g., transitions, effects)
7. List of assets needed (Visual or Sound effect)
8. Scene structure (high-level sequence of events in the scene)

# RULES FOR SCENE STRUCTURE
- Scene structure should be a numbered list of the sequence of events in the scene.
- Only specify exact timings for when to start the scene voiceover. Each scene has a separate voiceover audio file.
- DO NOT specify exact timings for anything else in the scene structure.
Example scene structure:
```
1. Dark background initialization
2. Start scene voiceover immediately at t=0
3. Animate title text "LINKED LISTS" at the top
4. Sequentially introduce 4 linked list nodes from left to right:
    - Each node appears with a pop sound
    - Each node connects to the previous node via an arrow
5. Emphasize "TREASURE HUNT" with larger text and glow
6. End with a highlight traversal from first node to last
```

# RULES FOR VISUAL ASSETS
- When executing the plan, we will use the manim tool to generate visuals besides just using assets.
- Manim can generate animated graphs, text, shapes, and vector graphics.
- DO NOT list assets that will be generated using manim.
- DO NOT list manim as an asset.
- DO list visual assets that cannot be generated using manim, such as:
    - Meme or funny images or clips (e.g., surprised pikachu meme, distracted boyfriend meme, pepelaugh)
    - Stock images or clips (e.g., a person typing on a laptop, city skyline timelapse)
- Be liberal in listing visual assets that will enhance the video.
- Visual assets should appear on screen for at least 1 second to allow viewers to absorb them.

# RULES FOR ASSETS
- Asset descriptions should specify EXACTLY what the asset is.
    - GOOD description: "A dog eating a hot dog alone"
    - BAD description: "A dog eating something that seems tasty". This is too vague.
- Each asset description should only describe a SINGLE SPECIFIC asset. DO NOT give options.
    - GOOD description: "surprised pikachu meme". This describes a single specific asset.
    - BAD description: "loss meme or struggling person image". This has multiple options.
- DO NOT give examples in the asset descriptions.
    - BAD description: "A meme expressing frustation (e.g., person facepalming, character looking defeated)". This is an example, not a description.
- Each asset should include BOTH a short and a long description.
- Short description:
    - Short descriptions will be used to look up assets from the asset database.
    - Short description should be EXTREMELY short and generic and use less than 6 words. The asset database is limited and cannot handle very specific descriptions.
    - DO NOT include words like "image", "clip", "visual", "stock footage", or "sound effect" in the short description.
    - Examples of Short description (non-exhaustive): "Surprised pikachu meme", "Explosion sound effect"
- Long description:
    - Long description will be used as the prompt for the AI to generate the assets.
    - Long description should be more DETAILED and SPECIFIC than the short description.
    - Examples of Long description (non-exhaustive): "A dog eating a hot dog alone", "Loud explosion sound effect with deep bass"
- We will compare the assets generated (with long description) with the assets in the asset database (with short description) and select the best asset that matches both descriptions.

# RULES FOR NARRATION/SCRIPT
- Narration/script should be written in a conversational tone, as if explaining to a friend.
- Enhance the narration with humor, analogies, and relatable examples where appropriate to keep the audience engaged.
- Enhance the narration with internet culture references and memes where appropriate.
"""

In [3]:
import asyncio
import os
import uuid

from app.api.types import (
    FinalVideoResponse,
    PlanStreamedResponse,
    StartPipelineResponse,
)
from app.core.config import app_config
from app.core.generation.manim.generate_code import generate_manim_code_parallel, _generate_manim_code
from app.core.generation.manim.render_loop import render_manim_loop_parallel
from app.core.generation.manim.render_code import _render_manim_code
from app.core.generation.manim.stitch_scene import stitch_manim_scenes_together
from app.core.generation.plan import generate_plan
from app.core.generation.visuals import generate_visual_asset
from app.core.models import Asset, PipelineCallback, VideoPlan, VoiceType
from app.services.elevenlabs import generate_sound_effect, generate_speech
from app.utils.gather import gather_with_concurrency
from app.utils.get_file_path import (
    get_narration_scene_file_path,
    get_selected_asset_file_path,
)

# Inputs for this manual run; both are read by the cells that follow.
topic = "Binary Trees"
voice = "obama"

async def callback(message: object, delay: float = 0) -> None:
    """Print a pipeline message, optionally after a delay.

    Args:
        message: Value to print. This notebook passes response objects
            (e.g. StartPipelineResponse, PlanStreamedResponse), so it is
            not restricted to strings.
        delay: Seconds to sleep before printing. Defaults to 0.
    """
    await asyncio.sleep(delay)
    print(message)

session_id = str(uuid.uuid4())

os.makedirs(f"static/{session_id}", exist_ok=True)
os.makedirs(f"working/{session_id}", exist_ok=True)

# Step 1: Generate video plan
await callback(StartPipelineResponse(session_id=session_id, success=True))

plan = await generate_plan(
    input_plan=None,
    topic=topic,
    streaming_delay=0,
    callback=callback,
    mock_plan=True,
    chars_per_stream_message=300000,
    plan_prompt_template=PLAN_PROMPT_TEMPLATE,
    plan_prompt_instructions=PLAN_PROMPT_INSTRUCTIONS,
)

await callback(PlanStreamedResponse(event_type="plan_end"))

print(f"\nGenerated Plan:\n{plan.model_dump_json(indent=2)}\n")

type='start' session_id='e15f8f7a-05e8-43f2-9a3e-fec794e03eef' success=True
type='plan' event_type='plan_stream' delta='{\n  "topic": "Binary Trees",\n  "main_font": "JetBrains Mono",\n  "secondary_font": "Helvetica",\n  "scenes": [\n    {\n      "scene_number": 0,\n      "duration_seconds": 5,\n      "visuals_description": "Dark background with a simple node appearing in center, then splitting into two child nodes, forming a basic tree structure. Text \'BINARY TREE\' appears at top. Nodes are clean circles with connecting lines. The tree structure glows slightly with a tech aesthetic.",\n      "narration_script": "Binary trees. It\'s literally just a node that can have at most two children. Left child, right child. That\'s it. That\'s the whole concept.",\n      "sound_description": "Soft electronic whoosh as nodes appear. Subtle pop sound for each node connection. Low ambient tech music begins.",\n      "edit_notes": "Quick fade in from black. Nodes appear with smooth animation. Text

In [4]:
# Generate assets and narration

# Step 2: Generate narration for each scene
print("\nGenerating narration...\n")

scripts: list[str] = [scene.narration_script for scene in plan.scenes]

# Generate narrations in parallel
# Elevenlabs API has concurrency limits of 5
# We're on creator tier, so 5 concurrent requests allowed for sound effects
# 10 for narration cos flash/turbo model (using flash)
# https://help.elevenlabs.io/hc/en-us/articles/14312733311761-How-many-requests-can-I-make-and-can-I-increase-it
word_timings = await gather_with_concurrency(
    5,
    *[
        generate_speech(
            text=narration,
            voice=voice,
            file_name=get_narration_scene_file_path(
                session_id=session_id, scene_number=i
            ),
            callback=callback,
            scene_number=i,
        )
        for i, narration in enumerate(scripts)
    ],
)

print(f"\nNarration_results:\n{word_timings}\n")

# Step 3: Generate assets and manim code for each scene
print("\nGenerating assets and manim code...\n")

# split assets into sound effects and visual assets,
# because sound effects have a concurrency limit of 5 on elevenlabs
visual_assets: list[Asset] = [
    asset
    for scene in plan.scenes
    for asset in scene.assets_needed
    if asset.asset_type == "visual"
]
sound_effects: list[Asset] = [
    asset
    for scene in plan.scenes
    for asset in scene.assets_needed
    if asset.asset_type == "sound_effect"
]

# Max 5 concurrent requests for sound effects
gather_sound_effects = gather_with_concurrency(
    5,
    *[
        generate_sound_effect(
            description=asset.asset_long_desc,
            file_name=get_selected_asset_file_path(
                session_id=session_id, asset_id=asset.asset_id, ext="mp3"
            ),
            callback=callback,
            asset_id=asset.asset_id,
        )
        for asset in sound_effects
    ],
)

gather_visual_assets = asyncio.gather(
    *[
        generate_visual_asset(asset=asset, session_id=session_id, callback=callback)
        for asset in visual_assets
    ]
)

await asyncio.gather(
    gather_sound_effects,
    gather_visual_assets,
)


Generating narration...

type='narration' event_type='narration_generation_start' scene_number=0
type='narration' event_type='narration_generation_start' scene_number=1
type='narration' event_type='narration_generation_start' scene_number=2
type='narration' event_type='narration_generation_start' scene_number=3
type='narration' event_type='narration_generation_end' scene_number=0
type='narration' event_type='narration_generation_end' scene_number=1
type='narration' event_type='narration_generation_end' scene_number=2
type='narration' event_type='narration_generation_end' scene_number=3

Narration_results:
['{"words":["Binary","trees.","It\'s","literally","just","a","node","that","can","have","at","most","two","children.","Left","child,","right","child.","That\'s","it.","That\'s","the","whole","concept."],"word_start_times_seconds":[0.0,0.35,0.7000000000000001,0.9500000000000003,1.4500000000000006,1.7000000000000008,1.800000000000001,2.0500000000000007,2.3,2.499999999999999,2.749999999

[[None], [None, None]]

In [5]:
# from app.core.prompts
# edit prompt here

# Instructions text for the Manim code-generation prompt. A later cell passes it
# to _generate_manim_code(code_prompt_instructions=...).
MANIM_PROMPT_INSTRUCTIONS = """
You are an expert in creating engaging "Fireship-style" explainer videos using the Manim Community v0.19.1 library.
Your task is to generate Manim code for a specific scene of the video, following the detailed requirements and inputs provided.
"""

# Separating subtitle and voiceover logic gave worse results - need to ask LLM for delay before starting captions and voiceover
# Inserting video - https://github.com/3b1b/manim/issues/760#issuecomment-925659697
#
# PROBLEMS WITH MANIM PROMPT:
# Sometimes only a part of a single word is highlighted in captions.
# E.g. see mock scene output - "ki*ds*"
# Tokenization issue? Just tell it to highlight full words
# and not (never? unless there is a good reason)
# only partial words/certain characters in a word
#
# Template for the per-scene Manim code-generation prompt. A later cell passes it
# to _generate_manim_code(code_prompt_template=...).
# Double-brace placeholders used below: scene_number, topic, scene_structure,
# session_id, scene_assets, full_plan, full_script, word_timings.
# `{asset_id}` (single braces) appears as literal text in the file-path section,
# presumably for the model to fill in per asset -- TODO confirm it is not meant
# to be substituted by the template renderer.
# NOTE(review): the STYLE & ANIMATION GUIDELINES list numbers two items "3."
# ("No Call to Action" and "Visuals"); left unchanged because it is prompt text.
MANIM_PROMPT_TEMPLATE = """
# OBJECTIVE
Generate **Manim Community v0.19.1** Python code for **Scene {{ scene_number }}** of a "Fireship-style" explainer video about **{{ topic }}**.

# SCENE {{ scene_number }} STRUCTURE
The scene **must** follow this exact high-level sequence:

{{ scene_structure }}

# OUTPUT SPECIFICATIONS
- **Class Name:** Define the main class as `Scene{{ scene_number }}`.
- **Format:** Output **only** the raw Python code. Do not use Markdown formatting, backticks, or code blocks.
- **Libraries Allowed:** You have access to `manim`, `cv2`, and `numpy` only.

# STYLE & ANIMATION GUIDELINES
1.  **Fireship Style:** The scene **must** be fast-paced, engaging, funny, and high-energy.
2.  **No Branding:** Do not mention Fireship or add any branding.
3.  **No Call to Action:** Do not include generic calls to action (e.g., "like and subscribe", "follow for more").
3.  **Visuals:** Use the main and secondary fonts provided in the plan. You may use other common fonts if they fit the style.
4.  **Boundaries:** Ensure all visual elements, including animated captions, images, and text, remain strictly within the frame boundaries. No element should be clipped by the edges of the screen.
5.  **Colors**: Do not use standard Manim color constants (e.g., RED, GREEN). You **must** strictly define color constants with this exact format: `NEW_COLOR = ManimColor.from_rgb((R, G, B), alpha=1.0)`. DO NOT define colors in any other way.
6.  **Captions:**
    *   Include animated captions for the voiceover.
    *   Captions **must** be semantic phrases, not individual words. DO NOT create one caption per word.
    *   Each caption **must** span the combined duration of its associated words.
    *   Captions **must** appear and disappear within 0.2s of the voiceover timing
    *   **Style:** Captions **must** have different styles (e.g., color, size, font weight) to emphasize key points, humor, or punchlines in the narration.
    *   **Placement:** Ensure captions do not overlap with other essential visual elements and stay within frame boundaries.
7.  **Synchronization:** Strictly align transitions, animations, effects, text, and sound effects with the provided `word_start_times_seconds` and `word_end_times_seconds`. External visual assets **must** appear on screen for at least 1 second.

# ASSET HANDLING SPECIFICATIONS

## 1. File Paths
Assets are located in `static/{{ session_id }}/`.
*   **Visuals:** `{asset_id}.mp4` (includes static images converted to single-frame mp4s).
*   **Audio (SFX):** `{asset_id}.mp3`.
*   **Voiceover:** `narration_scene_{{ scene_number }}.mp3` (located in the same folder).

The assets for this scene are as follows:
{{ scene_assets }}

## 2. Audio Implementation
Use `self.add_sound("path/to/file.mp3")` for both the voiceover and sound effects.

## 3. Visual Implementation (Custom CV2 Logic)
All visual assets are `.mp4` files. Some are single-frame loops (images), others are video clips.
*   **Scaling:** Resize frames to fit the scene using `frame_img.scale_to_fit_height()` or `.scale_to_fit_width()`. (Manim default: 14.22w x 8h).
*   **Aspect Ratio:** Assume roughly square aspect ratios.
*   **Minimum Duration**: Any external .mp4 asset **must** remain visible for a MINIMUM of 1 second. Do not cut them shorter than this, even if the voiceover is fast.
*   **Logic:** If there are visual assets, you **must** adapt the following `cv2` pattern to display these assets, ensuring you loop or cut them to fit the scene duration:

```python
cap = cv2.VideoCapture("path to mp4 asset")
visual_asset_duration = 1.0 # minimum duration for the visual asset in seconds (**must** be at least 1 second)
frame_time = 0.04
elapsed = 0

current_frame = None
while elapsed < visual_asset_duration:
    flag, frame = cap.read()

    if not flag:
        cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
        flag, frame = cap.read()

    if flag:
        if current_frame is not None:
            self.remove(current_frame)

        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame_img = ImageMobject(frame).scale_to_fit_height(5)
        frame_img.move_to(UP * 0.5)
        current_frame = frame_img
        self.add(frame_img)
        self.wait(frame_time)
        elapsed += frame_time
    else:
        self.wait(frame_time)
        elapsed += frame_time

cap.release()
```

# TECHNICAL CONSTRAINTS
You **must** strictly adhere to these rules to prevent runtime errors:
1.  **Group vs. VGroup:** `ImageMobject` is **NOT** a `VMobject`.
    *   **Never** add an `ImageMobject` to a `VGroup`.
    *   If you need to group images (or mix images with vectors/text), you **must** use `Group()` instead of `VGroup()`.
2.  **Numpy Types in Text:** If numpy strings used, you **must** explicitly cast them to a Python string before passing to Manim objects.
3.  **Code Blocks:** If code snippets are required, you **must** strictly instantiate the `Code` class using the following signature (filling in `code_string` and `language` as needed):
    ```python
    Code(
        code_string="...",
        language="...",
        formatter_style='vim',
        tab_width=4,
        add_line_numbers=True,
        line_numbers_from=1,
        background='rectangle',
        background_config=None,
        paragraph_config=None
    )
    ```
4.  **Minimum Wait Time:** You **must** ensure that the wait duration in `self.wait(duration)` is positive, by enforcing a minimum wait time of 0.01s for any calls with `self.wait(max(0.01, duration))`.


# INPUT DATA

## 1. Full Video Plan
Use this plan for context on flow and edit notes. **Only generate code for Scene {{ scene_number }}.**

{{ full_plan }}

## 2. Voiceover Script (Scene {{ scene_number }})
Use this text for caption content.

{{ full_script }}

## 3. Timing Data
Use the lists below for precise alignment. Times are in seconds relative to the start of this scene.
*   `word_start_times_seconds`: Start time of each word.
*   `word_end_times_seconds`: End time of each word.

{{ word_timings }}

# TIMING SPECIFICATIONS
*   **Source of Truth:** Use the provided `word_start_times_seconds` and `word_end_times_seconds` lists to align animations and captions with the voiceover.
*   **Timings:** Times are relative to the start of this scene (scene {{ scene_number }}).
*   **Precision:** Animation start times should be within ±0.1s of the narration timing.
*   **Captions:** Captions may slightly overlap transitions for readability, but they should appear and disappear within ±0.2s of the voiceover timing.
*   **Visuals:** All imported visual assets **must** appear on screen for at least 1 second to allow viewers to absorb them.
*   **Native Manim Elements:**  Other Manim objects (shapes, text, code blocks, flashes) are NOT subject to the 1-second rule. These should be fast, transient, and match the high-energy "Fireship" pacing (e.g., appearing for only 0.8s is acceptable).
"""

In [6]:
# Generate code for just one scene, one version
mock_code_generation = True
scene_number = 0
version_number = 0
code = await _generate_manim_code(
    scene_number=scene_number,
    full_script=scripts[scene_number],
    word_timings=word_timings[scene_number],
    session_id=session_id,
    plan=plan,
    callback=callback,
    version_number=version_number,
    code_prompt_instructions=MANIM_PROMPT_INSTRUCTIONS,
    code_prompt_template=MANIM_PROMPT_TEMPLATE,
    mock=mock_code_generation,
)

# Print the generated code
print(code.code)


# Or for all scenes sequentially, but each scene's versions in parallel
# for i in range(len(plan.scenes)):
#     await generate_manim_code_parallel(
#         scene_number=i,
#         full_script=scripts[i],
#         word_timings=word_timings[i],
#         session_id=session_id,
#         plan=plan,
#         callback=callback,
#         num_code_versions=app_config.NUM_CODE_VERSIONS_PER_SCENE,
#         code_prompt_instructions=MANIM_PROMPT_INSTRUCTIONS,
#         code_prompt_template=MANIM_PROMPT_TEMPLATE,
#         mock=mock_code_generation,
#     )


# Or for all scenes and versions in parallel
# gather_manim_code = asyncio.gather(
#     *[
#         generate_manim_code_parallel(
#             scene_number=i,
#             full_script=scripts[i],
#             word_timings=word_timings[i],
#             session_id=session_id,
#             plan=plan,
#             callback=callback,
#             num_code_versions=app_config.NUM_CODE_VERSIONS_PER_SCENE,
#             code_prompt_instructions=MANIM_PROMPT_INSTRUCTIONS,
#             code_prompt_template=MANIM_PROMPT_TEMPLATE,
#             mock=mock_code_generation,
#         )
#         for i in range(len(plan.scenes))
#     ]
# )
# await gather_manim_code

type='code_generation' event_type='code_generation_start' scene_number=0 version_number=0 retry_number=0 success=None
type='code_generation' event_type='code_generation_end' scene_number=0 version_number=0 retry_number=0 success=True
# type: ignore
from manim import *
import cv2
import numpy as np

# Color definitions
DARK_BG = ManimColor.from_rgb((18, 18, 24), alpha=1.0)
NODE_COLOR = ManimColor.from_rgb((100, 200, 255), alpha=1.0)
LINE_COLOR = ManimColor.from_rgb((150, 150, 180), alpha=1.0)
TEXT_WHITE = ManimColor.from_rgb((240, 240, 240), alpha=1.0)
TEXT_YELLOW = ManimColor.from_rgb((255, 220, 100), alpha=1.0)
TEXT_RED = ManimColor.from_rgb((255, 100, 100), alpha=1.0)
TEXT_GREEN = ManimColor.from_rgb((100, 255, 150), alpha=1.0)
CHAOS_COLOR = ManimColor.from_rgb((255, 80, 80), alpha=1.0)
HIGHLIGHT_COLOR = ManimColor.from_rgb((255, 200, 50), alpha=1.0)

class Scene0(Scene):
    def construct(self):
        self.camera.background_color = DARK_BG

        # Add voiceover at t=0
        s

In [None]:
# test rendering just one scene, one version
mock_render = False

video_file_path, error_message = await _render_manim_code(
    session_id=session_id,
    scene_number=scene_number,
    version_number=version_number,
    mock=mock_render,
)

print(error_message)

Rendering scene 0, task 0...!
Manim Community [32mv0.[0m[32m19.1[0m

[2;36m                    [0m         used: [1m{[0m[32m'shortest'[0m: [32m'1'[0m,    [2m                        [0m
[2;36m                    [0m         [32m'metadata'[0m:                [2m                        [0m
[2;36m                    [0m         [32m'[0m[32mcomment[0m[32m=[0m[1;33mRendered[0m[32m with [0m    [2m                        [0m
[2;36m                    [0m         [32mManim Community v0.19.1'[0m[1m}[0m  [2m                        [0m
                                                                                                          
Successfully rendered scene 0, task 0
Rendered scene 0, task 0 to static/e15f8f7a-05e8-43f2-9a3e-fec794e03eef/scene_0.mp4


In [8]:
# play scene rendered
from IPython.display import display, Video

# Show the rendered scene inline; video_file_path is set by the render cell.
# NOTE(review): if rendering failed, video_file_path may not point to a
# playable file -- confirm what _render_manim_code returns on error.
display(Video(data=video_file_path, width=400, height=300))