In [1]:
!pip install openai-whisper moviepy pillow diffusers transformers torch accelerate safetensors


Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting triton>=2.0.0 (from openai-whisper)
  Downloading triton-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading triton-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (253.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.1/253.1 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[?25hBuilding wheels for collected packages: openai-whisper
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
  Created wheel for openai-whisper: filename=openai_whisper-20240930-py3-none-any.whl size

In [6]:
import whisper
from moviepy.editor import ImageClip, AudioFileClip, concatenate_videoclips
from PIL import Image
from diffusers import StableDiffusionPipeline
import torch
import os

# ========== SETUP MODELS ==========

# Decide device and precision once so the two can never disagree:
# half precision is only selected when a CUDA device is available.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
TORCH_DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32

print("Loading Whisper model...")
whisper_model = whisper.load_model("base")

print("Loading Stable Diffusion...")
# safety_checker=None turns off the pipeline's safety filter; diffusers prints
# a warning about this when the pipeline is loaded.
pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    torch_dtype=TORCH_DTYPE,
    safety_checker=None,
).to(DEVICE)

# ========== FUNCTIONS ==========

def transcribe_audio(audio_path):
    """Transcribe an audio file with the module-level Whisper model.

    Args:
        audio_path: Path of the audio file passed to
            ``whisper_model.transcribe``.

    Returns:
        The ``"segments"`` entry of the transcription result.
    """
    print(f"Transcribing: {audio_path}")
    return whisper_model.transcribe(audio_path)["segments"]

# Style prefix prepended to every image prompt so all frames share one look.
DEFAULT_STYLE_PROMPT = (
    "cute children's storybook illustration hare and tortoise, "
    "soft pastel colors, warm lighting, simple shapes"
)


def get_visual_for_text(prompt, idx, style_prompt=DEFAULT_STYLE_PROMPT, pipeline=None):
    """Generate one illustration for a piece of narration and save it to disk.

    The image prompt is ``style_prompt`` followed by the first sentence of
    ``prompt`` (text up to the first ``.``), truncated to 100 characters.
    When that sentence is empty, the style prompt is used on its own instead
    of leaving a dangling ", " at the end.

    Args:
        prompt: Narration text for the frame.
        idx: Frame index; used in the output file name ``frame_{idx}.jpg``.
        style_prompt: Style description placed in front of the narration text.
            Defaults to the storybook style used by this notebook.
        pipeline: Callable taking the prompt string and returning an object
            with an ``images`` sequence. Defaults to the module-level ``pipe``.

    Returns:
        The path of the saved image, ``frame_{idx}.jpg`` in the working
        directory.
    """
    # Keep only the first sentence and cap its length to keep the prompt short.
    snippet = prompt.strip().split('.')[0][:100]
    short_prompt = f"{style_prompt}, {snippet}" if snippet else style_prompt
    print(f"[Gen] Creating image for: {short_prompt}")

    generate = pipe if pipeline is None else pipeline
    image = generate(short_prompt).images[0]

    path = f"frame_{idx}.jpg"
    image.save(path)
    return path

def _display_durations(segments, total_duration):
    """Return how long each segment's image stays on screen, in seconds.

    Each image is shown from its segment's start until the next segment's
    start. The first image starts at 0 and the last one runs to the end of
    the audio, so silence before, between, and after segments is covered and
    the durations add up to the audio length when segment starts increase.
    """
    durations = []
    last = len(segments) - 1
    for i, seg in enumerate(segments):
        shown_from = 0.0 if i == 0 else seg['start']
        if i < last:
            shown_until = segments[i + 1]['start']
        else:
            shown_until = max(total_duration, seg['end'])
        # Clamp so out-of-order timestamps never yield a negative duration.
        durations.append(max(0.0, shown_until - shown_from))
    return durations


def create_video(audio_path, segments, output_path="final_story_video.mp4", fps=24):
    """Render a slideshow video with one generated image per segment.

    For every segment an image is produced with ``get_visual_for_text`` and
    shown until the next segment begins, keeping the pictures aligned with
    the narration. The original audio is attached as the soundtrack.

    Args:
        audio_path: Path of the narration audio file.
        segments: Sequence of dicts with ``'text'``, ``'start'`` and ``'end'``
            keys (times in seconds).
        output_path: Where the video file is written.
        fps: Frame rate passed to ``write_videofile``.

    Returns:
        ``output_path``.

    Raises:
        ValueError: If ``segments`` is empty or no segment has a positive
            on-screen duration.
    """
    if not segments:
        raise ValueError("No transcript segments to visualise.")

    audio_clip = AudioFileClip(audio_path)
    try:
        durations = _display_durations(segments, audio_clip.duration)
        visuals = []

        for idx, (seg, duration) in enumerate(zip(segments, durations)):
            print(f"[{idx+1}/{len(segments)}] {seg['text']}")
            if duration <= 0:
                # Nothing would be visible; skip the expensive image generation.
                continue
            img_path = get_visual_for_text(seg['text'], idx)
            visuals.append(ImageClip(img_path).set_duration(duration))

        if not visuals:
            raise ValueError("No segment has a positive duration.")

        video = concatenate_videoclips(visuals, method="compose")
        video = video.set_audio(audio_clip)
        video.write_videofile(output_path, fps=fps)
    finally:
        # Release the audio reader even if rendering fails.
        audio_clip.close()

    return output_path

def process_audio_story(audio_file):
    """Run the whole pipeline for one audio file.

    Transcribes ``audio_file`` into segments and passes them, together with
    the same audio file, to ``create_video``.
    """
    create_video(audio_file, transcribe_audio(audio_file))

# ========== RUN MAIN ==========

if __name__ == "__main__":
    # Narration audio to turn into a video; point this at your own audio file.
    audio_input_path = "/kaggle/input/shwhwjhj/final_output_1 (2).wav"  # 👈 drop your audio file here
    # Transcribe the audio, generate one image per segment and render the video.
    process_audio_story(audio_input_path)


Loading Whisper model...
Loading Stable Diffusion...


Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .


Transcribing: /kaggle/input/shwhwjhj/final_output_1 (2).wav
[1/9]  Smugly to the crowd, I bet I could outpace any one of you in a race, especially you
[Gen] Creating image for: cute children's storybook illustration hare and tortoise, soft pastel colors, warm lighting, simple shapes, Smugly to the crowd, I bet I could outpace any one of you in a race, especially you


  0%|          | 0/50 [00:00<?, ?it/s]

[2/9]  tortoise.
[Gen] Creating image for: cute children's storybook illustration hare and tortoise, soft pastel colors, warm lighting, simple shapes, tortoise


  0%|          | 0/50 [00:00<?, ?it/s]

[3/9]  Laugh smockingly.
[Gen] Creating image for: cute children's storybook illustration hare and tortoise, soft pastel colors, warm lighting, simple shapes, Laugh smockingly


  0%|          | 0/50 [00:00<?, ?it/s]

[4/9]  Softly to hair with a knowing smile, we'll see hair let's race.
[Gen] Creating image for: cute children's storybook illustration hare and tortoise, soft pastel colors, warm lighting, simple shapes, Softly to hair with a knowing smile, we'll see hair let's race


  0%|          | 0/50 [00:00<?, ?it/s]

[5/9]  Chuckling arrogantly, prepare to eat my dust tortoise.
[Gen] Creating image for: cute children's storybook illustration hare and tortoise, soft pastel colors, warm lighting, simple shapes, Chuckling arrogantly, prepare to eat my dust tortoise


  0%|          | 0/50 [00:00<?, ?it/s]

[6/9]  Cling back tonnally, hope you're enjoying the view.
[Gen] Creating image for: cute children's storybook illustration hare and tortoise, soft pastel colors, warm lighting, simple shapes, Cling back tonnally, hope you're enjoying the view


  0%|          | 0/50 [00:00<?, ?it/s]

[7/9]  Wisely with a gentle chuckle, slow and steady hair, remember that.
[Gen] Creating image for: cute children's storybook illustration hare and tortoise, soft pastel colors, warm lighting, simple shapes, Wisely with a gentle chuckle, slow and steady hair, remember that


  0%|          | 0/50 [00:00<?, ?it/s]

[8/9]  Voice over, reflectively, and thus, the hair learned a valuable lesson that day.
[Gen] Creating image for: cute children's storybook illustration hare and tortoise, soft pastel colors, warm lighting, simple shapes, Voice over, reflectively, and thus, the hair learned a valuable lesson that day


  0%|          | 0/50 [00:00<?, ?it/s]

[9/9]  Her confidence can lead to one's downfall, but slow and steady indeed wins the race.
[Gen] Creating image for: cute children's storybook illustration hare and tortoise, soft pastel colors, warm lighting, simple shapes, Her confidence can lead to one's downfall, but slow and steady indeed wins the race


  0%|          | 0/50 [00:00<?, ?it/s]

Moviepy - Building video final_story_video.mp4.
MoviePy - Writing audio in final_story_videoTEMP_MPY_wvf_snd.mp3


                                                                    

MoviePy - Done.
Moviepy - Writing video final_story_video.mp4



                                                               

Moviepy - Done !
Moviepy - video ready final_story_video.mp4


In [7]:
from IPython.display import Video

# create_video writes "final_story_video.mp4" relative to the working
# directory; reading the same relative path keeps this cell correct
# wherever the notebook runs.
Video("final_story_video.mp4", embed=True)
