<a href="https://colab.research.google.com/github/itay-ct/storyvideo/blob/main/StoryVideo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ============================================
# Cell 1: Install dependencies (Colab)
# ============================================

!pip install -q google-generativeai ffmpeg-python python-dotenv pydub

# FFmpeg is usually preinstalled on Colab, but let's ensure it's available
!apt-get -y install ffmpeg > /dev/null

In [2]:
# ============================================
# Cell 2: Imports & basic setup
# ============================================

import os
import json
import tempfile
from datetime import timedelta

import google.generativeai as genai
from pydub import AudioSegment
import ffmpeg

from google.colab import files
from getpass import getpass

In [3]:
# ============================================
# Cell 3: Configure Gemini API key (user input)
# ============================================

print("Please enter your Gemini API key (it will be hidden):")
GEMINI_API_KEY = getpass("API key: ")

if not GEMINI_API_KEY:
    raise ValueError("You must provide a Gemini API key.")

genai.configure(api_key=GEMINI_API_KEY)

print("Gemini configured.")

Please enter your Gemini API key (it will be hidden):
API key: ··········
Gemini configured.


In [4]:
# ============================================
# Cell 4: Upload Hebrew MP3 file
# ============================================

print("Upload your Hebrew MP3 story file:")
uploaded = files.upload()

if not uploaded:
    raise ValueError("No file uploaded.")

# Take the first uploaded file
uploaded_filename = list(uploaded.keys())[0]
audio_path = uploaded_filename

print(f"Using audio file: {audio_path}")

Upload your Hebrew MP3 story file:


Saving WhatsApp Image 2025-12-09 at 20.10.23.mp3 to WhatsApp Image 2025-12-09 at 20.10.23 (1).mp3
Using audio file: WhatsApp Image 2025-12-09 at 20.10.23 (1).mp3


In [5]:
# ============================================
# Cell 5: Helper - load audio and inspect duration
# ============================================

audio = AudioSegment.from_file(audio_path)
audio_duration_sec = len(audio) / 1000.0

print(f"Audio duration: {audio_duration_sec:.2f} seconds")

Audio duration: 82.16 seconds


In [6]:
# ============================================
# Cell 6: Transcribe Hebrew audio with Gemini
# ============================================
# Using a real model from your list: models/gemini-2.5-flash
# ============================================

TRANSCRIBE_MODEL = "models/gemini-2.5-flash"  # from your model list

print("Using transcription model:", TRANSCRIBE_MODEL)

with open(audio_path, "rb") as f:
    audio_bytes = f.read()

audio_part = {
    "mime_type": "audio/mpeg",   # "audio/mp3" also usually works
    "data": audio_bytes
}

prompt_transcribe = (
    "Transcribe the Hebrew audio exactly as spoken, in Hebrew. "
    "Output only the transcript text, without any explanations."
)

model = genai.GenerativeModel(TRANSCRIBE_MODEL)
response = model.generate_content(
    [prompt_transcribe, audio_part]
)

hebrew_transcript = response.text.strip()
print("=== Hebrew Transcript (preview) ===")
print(hebrew_transcript[:1000], "..." if len(hebrew_transcript) > 1000 else "")

Using transcription model: models/gemini-2.5-flash
=== Hebrew Transcript (preview) ===
תצייר לי תמונה של אמ נו, שדה עם עץ קטן ש נו, מעלף יש חדקר מופפים ו יש קשת בענן יפה וכוכבים ומתחת לעץ יש את כל החיות שיש בכל העולם ו ו נו, שתי אחד אבא ושתי ילדה ועוד משפחה שיושבת בצד. ושניהם עושים פיקניק עם מלא מלא סוכריות ומלא מלא מלא מלא מלא מלא אוכל. בעיקר שוקולדים. והם מלטפים כלבים וחתולים ואת כל החיות שיש בעולם ומה שהם מלטפים זה כלבים ו תצייר לי תמונה גם אמ ו גם תעשה לזה גשם של מלא ממתקים, גשם של ממתקים, תצייר לי תמונה כזאת. 


In [7]:
# ============================================
# Cell 7: Split transcript into story segments
# ============================================
# We ask Gemini to break the Hebrew text into visual story beats.
# It returns a JSON array of { "id": number, "text": "..." }.
# ============================================

SEGMENTATION_MODEL = "gemini-2.5-flash"  # can be same as above

prompt_segment = f"""
You are splitting children's story text in Hebrew into visual beats.

Text (Hebrew):
\"\"\"{hebrew_transcript}\"\"\"

Instructions:
- Break this text into segments, where each segment corresponds to a single visual scene change or addition.
- Each segment should be at most 1–2 sentences.
- Preserve the original Hebrew wording as much as possible.
- Return a valid JSON array, no additional text, in the form:
  [
    {{ "id": 1, "text": "..." }},
    {{ "id": 2, "text": "..." }},
    ...
  ]
"""

seg_model = genai.GenerativeModel(SEGMENTATION_MODEL)
seg_resp = seg_model.generate_content(prompt_segment)

raw_segments_json = seg_resp.text.strip()
print("=== Raw segments JSON (preview) ===")
print(raw_segments_json[:1000], "..." if len(raw_segments_json) > 1000 else "")

# Try to parse JSON
try:
    segments = json.loads(raw_segments_json)
except json.JSONDecodeError as e:
    print("JSON decode error:", e)
    print("Trying to fix common issues...")
    # Very naive fallback: try to locate the first '[' and last ']'
    start_idx = raw_segments_json.find('[')
    end_idx = raw_segments_json.rfind(']')
    if start_idx != -1 and end_idx != -1 and end_idx > start_idx:
        fixed = raw_segments_json[start_idx:end_idx+1]
        segments = json.loads(fixed)
    else:
        raise ValueError("Could not parse segments JSON. Please inspect the segmentation output.")

print(f"\nParsed {len(segments)} segments.")
for s in segments[:5]:
    print(f"Segment {s['id']}: {s['text']}")
if len(segments) > 5:
    print("...")

=== Raw segments JSON (preview) ===
```json
[
  {
    "id": 1,
    "text": "תצייר לי תמונה של אמ נו, שדה עם עץ קטן ש נו, מעלף."
  },
  {
    "id": 2,
    "text": "יש חדקר מופפים ו יש קשת בענן יפה וכוכבים."
  },
  {
    "id": 3,
    "text": "ומתחת לעץ יש את כל החיות שיש בכל העולם."
  },
  {
    "id": 4,
    "text": "ו ו נו, שתי אחד אבא ושתי ילדה ועוד משפחה שיושבת בצד."
  },
  {
    "id": 5,
    "text": "ושניהם עושים פיקניק עם מלא מלא סוכריות ומלא מלא מלא מלא מלא מלא אוכל. בעיקר שוקולדים."
  },
  {
    "id": 6,
    "text": "והם מלטפים כלבים וחתולים ואת כל החיות שיש בעולם ומה שהם מלטפים זה כלבים ו."
  },
  {
    "id": 7,
    "text": "גם תעשה לזה גשם של מלא ממתקים, גשם של ממתקים."
  }
]
``` 
JSON decode error: Expecting value: line 1 column 1 (char 0)
Trying to fix common issues...

Parsed 7 segments.
Segment 1: תצייר לי תמונה של אמ נו, שדה עם עץ קטן ש נו, מעלף.
Segment 2: יש חדקר מופפים ו יש קשת בענן יפה וכוכבים.
Segment 3: ומתחת לעץ יש את כל החיות שיש בכל העולם.
Segment 4: ו ו נו, שתי אח

In [8]:
# ============================================
# Cell 8 (INCREMENTAL DELTAS): Generate English prompts
# ============================================
# For segment 1:
#   - Get a full-scene English description.
# For segments 2..N:
#   - Ask Gemini: "Given previous Hebrew text and the new segment,
#     describe ONLY the new visual additions/changes in English."
# These "delta" prompts will then be used in the image step as
# incremental additions to the previous image.
# ============================================

PROMPT_MODEL = "gemini-2.5-flash"

prompt_model = genai.GenerativeModel(PROMPT_MODEL)
image_prompts = []

# We'll accumulate Hebrew context as we go
previous_hebrew_context = ""

# --- Templates ---

# First segment: full initial scene
prompt_template_first_segment = """
You create prompts for a children's story illustration.
The story text is in Hebrew. Your job:
- Understand the Hebrew text.
- Describe the initial visual scene in English, in one concise sentence.
- Include concrete visual details (characters, setting, mood).
- Style: soft, colorful children's book illustration.
- This is the FIRST scene, so you should describe the whole scene.
- Output only the English description sentence, no quotes.

Hebrew segment:
\"\"\"{hebrew_segment}\"\"\"
"""

# Subsequent segments: ONLY new additions/changes
prompt_template_delta_segment = """
You create prompts for a children's story illustration.
The story text is in Hebrew.

You are given:
1) The previous story context in Hebrew (what has happened so far).
2) The new segment in Hebrew (what happens now).

Your job:
- Compare the new segment to the previous context.
- Identify ONLY the NEW visual elements or CHANGES that should appear,
  relative to what was already visible.
- Do NOT re-describe the entire scene.
- Do NOT mention things that are already in the scene unless they are changing.
- Focus on additions and modifications that the illustrator should make.
- Output a short English instruction sentence like:
  "Add ...", "Now show ...", "Make the sky ...", etc.
- Style: soft, colorful children's book illustration.
- Output only the English instruction sentence, no quotes.

Previous Hebrew story context:
\"\"\"{previous_context}\"\"\"

New Hebrew segment:
\"\"\"{hebrew_segment}\"\"\"
"""

for idx, s in enumerate(segments):
    heb_seg = s["text"]

    # First segment: full scene description
    if idx == 0:
        prompt_text = prompt_template_first_segment.format(
            hebrew_segment=heb_seg
        )
    else:
        prompt_text = prompt_template_delta_segment.format(
            previous_context=previous_hebrew_context,
            hebrew_segment=heb_seg
        )

    resp = prompt_model.generate_content(prompt_text)
    english_prompt = resp.text.strip().replace("\n", " ")

    print(f"Segment {s['id']} (Hebrew): {heb_seg}")
    print(f"  => Prompt: {english_prompt}\n")

    image_prompts.append({
        "id": s["id"],
        "hebrew": heb_seg,
        "prompt": english_prompt,
        "is_delta": (idx != 0)
    })

    # Update context for next steps
    previous_hebrew_context += "\n" + heb_seg

print(f"Generated {len(image_prompts)} English prompts (first = full scene, rest = deltas).")

Segment 1 (Hebrew): תצייר לי תמונה של אמ נו, שדה עם עץ קטן ש נו, מעלף.
  => Prompt: A soft and colorful children's book illustration of a stunning green field featuring a charming small tree standing alone under a bright, inviting sky.

Segment 2 (Hebrew): יש חדקר מופפים ו יש קשת בענן יפה וכוכבים.
  => Prompt: Add flying unicorns, a beautiful rainbow, and stars.

Segment 3 (Hebrew): ומתחת לעץ יש את כל החיות שיש בכל העולם.
  => Prompt: Add many different animals from all over the world, gathered under the tree.

Segment 4 (Hebrew): ו ו נו, שתי אחד אבא ושתי ילדה ועוד משפחה שיושבת בצד.
  => Prompt: Add a father and two children to the scene, and another family sitting to the side in the field.

Segment 5 (Hebrew): ושניהם עושים פיקניק עם מלא מלא סוכריות ומלא מלא מלא מלא מלא מלא אוכל. בעיקר שוקולדים.
  => Prompt: Add a picnic spread with many candies, food, and chocolates for the father and two children.

Segment 6 (Hebrew): והם מלטפים כלבים וחתולים ואת כל החיות שיש בעולם ומה שהם מלטפים זה 

In [9]:
# ============================================
# Cell 9 (INCREMENTAL): Generate images with nano-banana-pro-preview
# ============================================
# We use genai.GenerativeModel with the Nano Banana model name and
# extract image bytes from candidate.content.parts[].inline_data.data
# For segment 1: text-only generation.
# For segments 2..N: previous image + "addition" prompt (incremental).
# ============================================

# Use your Nano Banana model name
NANO_MODEL_NAME = "nano-banana-pro-preview"  # change if your project uses a different id

img_model = genai.GenerativeModel(NANO_MODEL_NAME)

# Create a temporary directory for images
image_dir = tempfile.mkdtemp(prefix="story_images_")
print("Saving generated images to:", image_dir)

generated_image_paths = []


def extract_image_bytes_from_result(result, seg_id):
    """
    Try to extract image bytes from a generate_content result.

    We look through all candidates and their content.parts for inline_data
    with an image mime_type, and return the first one we find.
    """
    if not hasattr(result, "candidates") or not result.candidates:
        raise ValueError(f"No candidates in image result for segment {seg_id}")

    for cand in result.candidates:
        content = getattr(cand, "content", None)
        if not content or not getattr(content, "parts", None):
            continue

        for part in content.parts:
            inline_data = getattr(part, "inline_data", None)
            if inline_data and getattr(inline_data, "data", None):
                mime = getattr(inline_data, "mime_type", "")
                # Accept any image mime type
                if mime.startswith("image/"):
                    return inline_data.data

    raise ValueError(
        f"Could not find inline image data in result for segment {seg_id}. "
        "Inspect `result` structure to adapt the extractor."
    )


previous_image_path = None  # will hold the path of the last generated image

for idx, item in enumerate(image_prompts):
    seg_id = item["id"]
    prompt_text = item["prompt"]

    print(f"Generating image for segment {seg_id}...")

    # --------------------------------------------------------
    # 1st segment: generate from scratch, text-only
    # --------------------------------------------------------
    if previous_image_path is None:
        # You can keep this as-is or prepend style guidance
        base_instruction = (
            "Create a soft, colorful children's book illustration. "
            "Scene: "
        )
        result = img_model.generate_content(
            [base_instruction + prompt_text]
        )

    # --------------------------------------------------------
    # Subsequent segments: incremental change/addition
    # --------------------------------------------------------
    else:
        # Load previous image bytes
        with open(previous_image_path, "rb") as f:
            prev_img_bytes = f.read()

        # We ask the model to *modify* the previous image
        # while keeping everything consistent and only adding/changing
        # what this new segment describes.
        base_instruction = (
            "You are modifying the previous image. "
            "Keep the same characters, style, colors, and composition as much as possible. "
            "Only add or adjust details so the scene matches this new story moment. "
        )

        # We construct a multimodal prompt:
        #   text -> previous image -> text
        # So the model sees the old scene + the requested change.
        prompt_parts = [
            {"text": base_instruction},
            {
                "inline_data": {
                    "mime_type": "image/png",  # previous images saved as PNG
                    "data": prev_img_bytes,
                }
            },
            {
                "text": (
                    "Now update the scene according to this description: "
                    + prompt_text
                )
            },
        ]

        result = img_model.generate_content(prompt_parts)

    # --------------------------------------------------------
    # Extract image bytes and save
    # --------------------------------------------------------
    image_bytes = extract_image_bytes_from_result(result, seg_id)

    img_filename = os.path.join(image_dir, f"segment_{seg_id:03d}.png")
    with open(img_filename, "wb") as f:
        f.write(image_bytes)

    print(f"Saved: {img_filename}")
    generated_image_paths.append(img_filename)

    # Set for next iteration
    previous_image_path = img_filename

print("Incremental image generation complete.")

Saving generated images to: /tmp/story_images_e5jmah0b
Generating image for segment 1...
Saved: /tmp/story_images_e5jmah0b/segment_001.png
Generating image for segment 2...
Saved: /tmp/story_images_e5jmah0b/segment_002.png
Generating image for segment 3...
Saved: /tmp/story_images_e5jmah0b/segment_003.png
Generating image for segment 4...
Saved: /tmp/story_images_e5jmah0b/segment_004.png
Generating image for segment 5...
Saved: /tmp/story_images_e5jmah0b/segment_005.png
Generating image for segment 6...
Saved: /tmp/story_images_e5jmah0b/segment_006.png
Generating image for segment 7...
Saved: /tmp/story_images_e5jmah0b/segment_007.png
Incremental image generation complete.


In [10]:
# ============================================
# Cell 10: Define durations for each segment
# ============================================
# Simplest approach: divide total audio duration evenly among segments.
# ============================================

num_segments = len(segments)
if num_segments == 0:
    raise ValueError("No segments found.")

duration_per_segment = audio_duration_sec / num_segments
print(f"Duration per segment: {duration_per_segment:.2f} seconds")

segment_timeline = []
for i, s in enumerate(segments):
    start = i * duration_per_segment
    end = (i + 1) * duration_per_segment
    segment_timeline.append({
        "id": s["id"],
        "image_path": generated_image_paths[i],
        "start": start,
        "end": end,
        "duration": end - start
    })

segment_timeline[:3]

Duration per segment: 11.74 seconds


[{'id': 1,
  'image_path': '/tmp/story_images_e5jmah0b/segment_001.png',
  'start': 0.0,
  'end': 11.736428571428572,
  'duration': 11.736428571428572},
 {'id': 2,
  'image_path': '/tmp/story_images_e5jmah0b/segment_002.png',
  'start': 11.736428571428572,
  'end': 23.472857142857144,
  'duration': 11.736428571428572},
 {'id': 3,
  'image_path': '/tmp/story_images_e5jmah0b/segment_003.png',
  'start': 23.472857142857144,
  'end': 35.20928571428571,
  'duration': 11.736428571428569}]

In [11]:
# ============================================
# Cell 11 (FIXED): Build the video from images + audio using ffmpeg
# ============================================
# Strategy:
#  1) For each segment/image, create a small MP4 clip with the correct duration,
#     forcing SAR=1 and yuv420p.
#  2) Use ffmpeg concat demuxer to join all clips into temp_video_no_audio.mp4.
#  3) Merge that video with the original audio.
# ============================================

import uuid

output_video_path = "story_video.mp4"

print("Building video from images and audio...")

# 1) Create per-segment video clips
clips_dir = tempfile.mkdtemp(prefix="story_clips_")
print("Temporary clips directory:", clips_dir)

clip_paths = []

for i, seg in enumerate(segment_timeline):
    img_path = seg["image_path"]
    dur = seg["duration"]
    clip_path = os.path.join(clips_dir, f"clip_{i:03d}.mp4")
    clip_paths.append(clip_path)

    # -loop 1: loop still image
    # -t dur: set duration
    # -vf setsar=1:1,scale= to normalize SAR & ensure consistent video params
    # -pix_fmt yuv420p: compatible pixel format
    cmd = f'''
ffmpeg -y -loop 1 -i "{img_path}" -t {dur:.3f} \
-vf "setsar=1:1,scale=1408:768" \
-c:v libx264 -pix_fmt yuv420p -r 25 "{clip_path}"
    '''
    print(f"Creating clip {i} from {img_path} ({dur:.2f}s)...")
    !bash -c '{cmd}'
    print(f"  -> {clip_path}")

# 2) Create concat list file
concat_list_path = os.path.join(clips_dir, "clips.txt")
with open(concat_list_path, "w") as f:
    for cp in clip_paths:
        f.write(f"file '{cp}'\n")

print("Concat list file:", concat_list_path)
with open(concat_list_path) as f:
    print("=== clips.txt ===")
    print(f.read())

# 3) Concatenate clips into a single video (no audio)
temp_video_no_audio = "temp_video_no_audio.mp4"
concat_cmd = f'''
ffmpeg -y -f concat -safe 0 -i "{concat_list_path}" \
-c copy "{temp_video_no_audio}"
'''
print("Running ffmpeg concat...")
print(concat_cmd)
!bash -c '{concat_cmd}'

# 4) Merge concatenated video with original audio
print("Merging video with original audio...")

merge_cmd = f'''
ffmpeg -y -i "{temp_video_no_audio}" -i "{audio_path}" \
-c:v copy -c:a aac -shortest "{output_video_path}"
'''
print(merge_cmd)
!bash -c '{merge_cmd}'

print(f"Video created: {output_video_path}")

Building video from images and audio...
Temporary clips directory: /tmp/story_clips_jot2hvvm
Creating clip 0 from /tmp/story_images_e5jmah0b/segment_001.png (11.74s)...
ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnapp

In [12]:
# ============================================
# Cell 12: Download the final video
# ============================================

files.download("story_video.mp4")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>