In [10]:
from whisper_jax import FlaxWhisperPipline
import jax.numpy as jnp

# Whisper checkpoint size; it is interpolated into the "openai/whisper-<size>" model id below.
# "tiny" is used here. "medium" or "large-v2" also work; medium is probably the most
# pragmatic balance.
model = "tiny"
# Build the transcription pipeline with bfloat16 as its dtype. Later cells call this
# `pipeline` object directly on audio file paths.
pipeline = FlaxWhisperPipline(f"openai/whisper-{model}", dtype=jnp.bfloat16)


Downloading (…)rocessor_config.json: 100%|██████████| 185k/185k [00:00<00:00, 1.44MB/s]
Downloading tokenizer_config.json: 100%|██████████| 283k/283k [00:00<00:00, 2.32MB/s]
Downloading vocab.json: 100%|██████████| 836k/836k [00:00<00:00, 3.17MB/s]
Downloading tokenizer.json: 100%|██████████| 2.48M/2.48M [00:00<00:00, 8.96MB/s]
Downloading merges.txt: 100%|██████████| 494k/494k [00:00<00:00, 3.88MB/s]
Downloading normalizer.json: 100%|██████████| 52.7k/52.7k [00:00<00:00, 59.0MB/s]
Downloading added_tokens.json: 100%|██████████| 34.6k/34.6k [00:00<00:00, 18.7MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 2.19k/2.19k [00:00<00:00, 6.27MB/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Downloading config.json: 100%|██████████| 1.98k/1.98k [00:00<00:00, 4.06MB/s]
Downloading flax_model.ms

In [11]:
# First call of the pipeline, on a test clip about a minute long.
# The forward call is JIT-compiled here, so this run is slow, but that cost is paid only once.
text = pipeline("output.mp3", return_timestamps=True)

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


In [16]:
# Transcribe a second clip with timestamps and display the raw pipeline result.
# This rebinds `text`, replacing the result of the earlier "output.mp3" call.
# NOTE(review): presumably fast because the pipeline was already JIT-compiled by that earlier call — confirm.
text = pipeline("test.mp3", return_timestamps=True)
text

{'text': " In the last chapter, you and I started to step through the internal workings of a transformer. This is one of the key pieces of technology inside large language models, and a lot of other tools in the modern way of AI. It first hit the scene and a now-famous 2017 paper called Attention as All You Need, and in this chapter, you and I will dig into what this attention mechanism is, visualizing how it processes data. As a quick recap, here's the important context I want you to have in mind. The goal of the model that you and I are studying is to take in a piece of text and predict what word comes next. The input text is broken up into little pieces that we call tokens, and these are very often words or pieces of words, but just to make the examples in this video easier for you and me to think about, let's simplify by pretending that tokens are always just words. The first step in a transformer is to associate each token with a high-dimensional vector, what we call its embedding

In [35]:
def convert_transcript_to_json(transcript):
    """Flatten a timestamped transcription result into a JSON-serialisable list.

    Args:
        transcript: Mapping with a ``'chunks'`` entry. Each chunk is a mapping
            with a ``'text'`` string and a ``'timestamp'`` pair whose first
            item is the start and whose second item is the end.

    Returns:
        list[dict]: One dict per chunk, in the original order, with keys
        ``"text"`` (surrounding whitespace stripped), ``"start"`` and ``"end"``.

    Raises:
        KeyError: If ``'chunks'``, ``'text'`` or ``'timestamp'`` is missing.
    """
    # NOTE(review): "end" is copied through untouched; it is presumably None for a
    # chunk where Whisper did not predict an ending timestamp — confirm with the pipeline.
    return [
        {
            "text": chunk["text"].strip(),
            "start": chunk["timestamp"][0],
            "end": chunk["timestamp"][1],
        }
        for chunk in transcript["chunks"]
    ]

# Convert the raw pipeline result held in `text` into the flat list of
# {"text", "start", "end"} segments and print it to check the conversion.
processed_transcript = convert_transcript_to_json(text)
print(processed_transcript)


[{'text': 'In the last chapter, you and I started to step through the internal workings of a transformer.', 'start': 0.0, 'end': 4.4}, {'text': 'This is one of the key pieces of technology inside large language models,', 'start': 4.4, 'end': 7.84}, {'text': 'and a lot of other tools in the modern way of AI.', 'start': 7.84, 'end': 10.8}, {'text': 'It first hit the scene and a now-famous 2017 paper called Attention as All You Need,', 'start': 10.8, 'end': 15.52}, {'text': 'and in this chapter, you and I will dig into what this attention mechanism is,', 'start': 15.52, 'end': 19.68}, {'text': 'visualizing how it processes data.', 'start': 19.68, 'end': 21.6}, {'text': "As a quick recap, here's the important context I want you to have in mind.", 'start': 26.48, 'end': 30.12}, {'text': 'The goal of the model that you and I are studying is to take in a piece of text and predict', 'start': 30.12, 'end': 34.72}, {'text': 'what word comes next.', 'start': 34.72, 'end': 36.8}, {'text': 'The inp

In [23]:
from pathlib import Path

def get_video_files(data_directory):
    """Collect the direct children of a directory that look like video files.

    An entry qualifies when its suffix, compared case-insensitively, is one of
    ``.mp4``, ``.mov``, ``.avi`` or ``.mkv``. Sub-directories are not searched.

    Args:
        data_directory: ``pathlib.Path`` of the directory to scan.

    Returns:
        list: Matching paths, in the order ``glob('*')`` yields them.
    """
    video_suffixes = ('.mp4', '.mov', '.avi', '.mkv')
    matches = []
    for entry in data_directory.glob('*'):
        if entry.suffix.lower() in video_suffixes:
            matches.append(entry)
    return matches

# Folder holding the source recordings (relative to the notebook's working directory).
# Later cells also create their 'mp3' and 'transcripts' output folders underneath it.
data_directory = Path('data')
# List the recordings found directly in that folder and print them as a check.
video_files = get_video_files(data_directory)
print("Video files in 'data' directory:", video_files)



Video files in 'data' directory: [PosixPath('data/IMG_1406.MOV'), PosixPath('data/IMG_1407.MOV'), PosixPath('data/IMG_1411.MOV'), PosixPath('data/IMG_1405.MOV'), PosixPath('data/IMG_1404.MOV'), PosixPath('data/IMG_1410.MOV'), PosixPath('data/IMG_1399.MOV'), PosixPath('data/IMG_1414.MOV'), PosixPath('data/IMG_1400.MOV'), PosixPath('data/IMG_1415.MOV'), PosixPath('data/IMG_1398.MOV'), PosixPath('data/IMG_1417.MOV'), PosixPath('data/IMG_1416.MOV'), PosixPath('data/IMG_1444.MOV'), PosixPath('data/IMG_1446.MOV'), PosixPath('data/IMG_1442.MOV'), PosixPath('data/IMG_1441.MOV'), PosixPath('data/IMG_1382.MOV'), PosixPath('data/IMG_1396.MOV'), PosixPath('data/IMG_1427.MOV'), PosixPath('data/IMG_1432.MOV'), PosixPath('data/IMG_1426.MOV'), PosixPath('data/IMG_1397.MOV'), PosixPath('data/IMG_1383.MOV'), PosixPath('data/IMG_1381.MOV'), PosixPath('data/IMG_1418.MOV'), PosixPath('data/IMG_1419.MOV'), PosixPath('data/IMG_1425.MOV'), PosixPath('data/IMG_1431.MOV'), PosixPath('data/IMG_1384.MOV'), PosixP

In [25]:
import subprocess

# Function to extract mp3 from mp4
def extract_mp3_from_video(mp4_file_path, output_directory):
    """Extract the audio of a video file into an MP3 by running ``ffmpeg``.

    Args:
        mp4_file_path: ``pathlib.Path`` of the source video. Despite the name,
            any container is passed straight to ffmpeg (this notebook feeds it
            ``.MOV`` files).
        output_directory: ``pathlib.Path`` of the directory to write into.

    Returns:
        ``output_directory / "<video stem>.mp3"``. The path is returned whether
        or not ffmpeg succeeded; its exit status is not checked.

    Raises:
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    output_mp3_path = output_directory / f"{mp4_file_path.stem}.mp3"
    # Pass an argument list and no shell, so file names containing spaces, quotes
    # or shell metacharacters reach ffmpeg verbatim instead of being re-parsed.
    command = ["ffmpeg", "-i", str(mp4_file_path), str(output_mp3_path)]
    subprocess.run(command)
    return output_mp3_path

# Create the output folder for the extracted audio if it doesn't exist yet.
# mkdir is called without parents=True, so `data_directory` itself must already exist.
mp3_directory = data_directory / 'mp3'
mp3_directory.mkdir(exist_ok=True)

# Extract one MP3 per discovered video file (the loop variable says "mp4", but
# `video_files` holds whatever video files were found, e.g. .MOV files).
# The list contains the target MP3 paths, in the same order as `video_files`.
extracted_mp3_files = [extract_mp3_from_video(mp4_file, mp3_directory) for mp4_file in video_files]
print("Extracted MP3 files:", extracted_mp3_files)


ffmpeg version 6.1.1 Copyright (c) 2000-2023 the FFmpeg developers
  built with Apple clang version 15.0.0 (clang-1500.1.0.2.5)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/6.1.1_4 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libopen

Extracted MP3 files: [PosixPath('data/mp3/IMG_1406.mp3'), PosixPath('data/mp3/IMG_1407.mp3'), PosixPath('data/mp3/IMG_1411.mp3'), PosixPath('data/mp3/IMG_1405.mp3'), PosixPath('data/mp3/IMG_1404.mp3'), PosixPath('data/mp3/IMG_1410.mp3'), PosixPath('data/mp3/IMG_1399.mp3'), PosixPath('data/mp3/IMG_1414.mp3'), PosixPath('data/mp3/IMG_1400.mp3'), PosixPath('data/mp3/IMG_1415.mp3'), PosixPath('data/mp3/IMG_1398.mp3'), PosixPath('data/mp3/IMG_1417.mp3'), PosixPath('data/mp3/IMG_1416.mp3'), PosixPath('data/mp3/IMG_1444.mp3'), PosixPath('data/mp3/IMG_1446.mp3'), PosixPath('data/mp3/IMG_1442.mp3'), PosixPath('data/mp3/IMG_1441.mp3'), PosixPath('data/mp3/IMG_1382.mp3'), PosixPath('data/mp3/IMG_1396.mp3'), PosixPath('data/mp3/IMG_1427.mp3'), PosixPath('data/mp3/IMG_1432.mp3'), PosixPath('data/mp3/IMG_1426.mp3'), PosixPath('data/mp3/IMG_1397.mp3'), PosixPath('data/mp3/IMG_1383.mp3'), PosixPath('data/mp3/IMG_1381.mp3'), PosixPath('data/mp3/IMG_1418.mp3'), PosixPath('data/mp3/IMG_1419.mp3'), PosixP

[out#0/mp3 @ 0x125e49f40] video:0kB audio:452kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.150428%
size=     453kB time=00:00:28.86 bitrate= 128.4kbits/s speed=73.5x    


In [36]:
# Transcribe each MP3 file using the Whisper pipeline with timestamps and save the transcripts
import json

def transcribe_and_save(mp3_file_path, output_directory):
    """Transcribe one audio file and store its timestamped segments as JSON.

    The notebook-level ``pipeline`` is called on the file with
    ``return_timestamps=True``; the result is flattened with
    ``convert_transcript_to_json`` and written to
    ``<output_directory>/<audio stem>.json``.

    Args:
        mp3_file_path: ``pathlib.Path`` of the audio file to transcribe.
        output_directory: ``pathlib.Path`` of the directory for the JSON file.

    Returns:
        Path of the JSON file that was written.
    """
    # Transcribe first, then flatten to the list-of-segments shape stored on disk.
    segments = convert_transcript_to_json(
        pipeline(str(mp3_file_path), return_timestamps=True)
    )

    # The JSON file is named after the audio file, with only the extension swapped.
    destination = output_directory / f"{mp3_file_path.stem}.json"
    with open(destination, 'w') as handle:
        json.dump(segments, handle)

    return destination

# Create the output folder for the transcripts if it doesn't exist yet.
# mkdir is called without parents=True, so `data_directory` itself must already exist.
transcript_directory = data_directory / 'transcripts'
transcript_directory.mkdir(exist_ok=True)

# Transcribe every extracted MP3 and write one JSON transcript (with timestamps) per file.
# The list holds the JSON paths, in the same order as `extracted_mp3_files`.
transcribed_files = [transcribe_and_save(mp3_file, transcript_directory) for mp3_file in extracted_mp3_files]
print("Transcribed files saved with timestamps:", transcribed_files)



Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Transcribed files saved with timestamps: [PosixPath('data/transcripts/IMG_1406.json'), PosixPath('data/transcripts/IMG_1407.json'), PosixPath('data/transcripts/IMG_1411.json'), PosixPath('data/transcripts/IMG_1405.json'), PosixPath('data/transcripts/IMG_1404.json'), PosixPath('data/transcripts/IMG_1410.json'), PosixPath('data/transcripts/IMG_1399.json'), PosixPath('data/transcripts/IMG_1414.json'), PosixPath('data/transcripts/IMG_1400.json'), PosixPath('data/transcripts/IMG_1415.json'), PosixPath('data/transcripts/IMG_1398.json'), PosixPath('data/transcripts/IMG_1417.json'), PosixPath('data/transcripts/IMG_1416.json'), PosixPath('data/transcripts/IMG_1444.json'), PosixPath('data/transcripts/IMG_1446.json'), PosixPath('data/transcripts/IMG_1442.json'), PosixPath('data/transcripts/IMG_1441.json'), PosixPath('data/transcripts/IMG_1382.json'), PosixPath('data/transcripts/IMG_1396.json'), PosixPath('data/transcripts/IMG_1427.json'), PosixPath('data/transcripts/IMG_1432.json'), PosixPath('da

In [27]:
import os

# Load the OpenAI API key from the environment and fail fast if it is not set.
# Only an unset variable (None) is rejected; an empty string passes this check.
openai_api_key = os.getenv("OPENAI_API_KEY")
if openai_api_key is None:
    raise ValueError("OpenAI API key not found. Please set the OPENAI_API_KEY environment variable.")
# Confirm success without echoing the key itself.
print("OpenAI API key loaded successfully.")


OpenAI API key loaded successfully.


In [43]:
import openai
import json

import os

# Read and validate the key in this cell, before building the client, so the
# check applies to the value actually used here rather than to a variable left
# over from an earlier cell. An empty string is treated as missing too.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise ValueError("OpenAI API key not found. Please set the OPENAI_API_KEY environment variable.")

# Still set the module-level key for any code that relies on it.
openai.api_key = openai_api_key

# Client used for the chat-completion request below; the key is passed explicitly.
client = openai.OpenAI(api_key=openai_api_key)

def select_storyboard_segments(transcript_files, goal, model="gpt-4o", verbose=True):
    """Ask a chat model which transcript sentences to use for a storyboard.

    Each transcript file is rendered as its file stem followed by one numbered
    line per entry (``"<n>: <text>"``, numbered from 1). The rendered
    transcripts, separated by blank lines, form the user message; the system
    message carries ``goal`` and asks for a JSON answer. The request is sent
    through the notebook-level ``client``.

    Args:
        transcript_files: Iterable of ``pathlib.Path`` objects, each pointing to
            a JSON file holding a list of objects with a ``"text"`` key.
        goal: Free-text instruction interpolated into the system prompt.
        model: Name of the chat model to request. Defaults to ``"gpt-4o"``.
        verbose: When true (the default), print both prompts before sending.

    Returns:
        str: The content of the first choice with surrounding whitespace
        stripped. The request sets ``response_format`` to ``json_object``;
        parsing the string is left to the caller.
    """
    # Render every transcript as "<stem>\n1: ...\n2: ..." so the model can refer
    # to a sentence by video name and 1-based index.
    transcripts = []
    for transcript_file in transcript_files:
        with open(transcript_file, 'r') as file:
            transcript = json.load(file)
        numbered_lines = "\n".join(f"{i}: {x['text']}" for i, x in enumerate(transcript, start=1))
        transcripts.append(transcript_file.stem + "\n" + numbered_lines)

    # Adjacent literals are joined with explicit trailing spaces; the previous
    # backslash continuation glued two sentences together ("final video.Return").
    system_prompt = (
        f"You are producing a storyboard for a video with the following instruction: `{goal}`. "
        "Your goal is to select sentences from each of the given transcripts to stitch together into a final video. "
        "Return a json file with a list of tuples named `segments`, each containing a `video` field "
        "with the filename and a `idx` field with the index of the sentence in that video."
    )
    user_prompt = "\n\n".join(transcripts)
    if verbose:
        print(system_prompt)
        print(user_prompt)

    # Request a JSON object from the chat model; temperature 0 is kept from the original request.
    response = client.chat.completions.create(
        model=model,
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0,
    )

    relevant_segments = response.choices[0].message.content.strip()

    return relevant_segments

# High-level instruction that is interpolated into the system prompt.
high_level_goal = "make an advertising video"

# Ask the model to pick segments from all saved transcripts. The function returns
# a JSON string, which is parsed here; the next cell reads storyboard['segments'].
storyboard = json.loads(select_storyboard_segments(transcribed_files, high_level_goal))
# Pretty-print the parsed storyboard for inspection.
print("Storyboard segments selected:", json.dumps(storyboard, indent=4))



You are producing a storyboard for a video with the following instruction: `make an advertising video`. Your goal is to select sentences from each of the given transcripts to stitch together into a final video.Return a json file with a list of tuples named `segments`, each containing a `video` field with the filename and a `idx` field with the index of the sentence in that video.
IMG_1406
1: Oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, o

In [48]:
from moviepy.editor import VideoFileClip, concatenate_videoclips
import json

# Cut one sub-clip per storyboard segment and stitch them, in storyboard order,
# into final_output.mp4. `storyboard` is the already-parsed dict from the previous cell.

# Sub-clips in storyboard order
clips = []
# Source videos opened along the way, kept so they can be closed afterwards
source_clips = []

try:
    # Process each segment
    for segment in storyboard['segments']:
        video_file = f"data/{segment['video']}.MOV"
        transcript_file = f"data/transcripts/{segment['video']}.json"

        # Load the transcript data
        with open(transcript_file, 'r') as file:
            transcript = json.load(file)

        # The prompt numbers sentences from 1, so `idx` is 1-based. Reject values
        # outside 1..len(transcript): without this check an idx of 0 would become
        # transcript[-1] and silently pick the last sentence.
        idx = segment['idx']
        if not 1 <= idx <= len(transcript):
            raise IndexError(
                f"Segment index {idx} is out of range for {transcript_file} "
                f"({len(transcript)} sentences)"
            )
        clip_info = transcript[idx - 1]  # Adjust index since Python lists are 0-indexed
        start_time = clip_info['start']
        end_time = clip_info['end']

        # Open the source video, remember it for cleanup, then cut the sub-clip
        source_clip = VideoFileClip(video_file)
        source_clips.append(source_clip)
        clips.append(source_clip.subclip(start_time, end_time))

    # Concatenate all clips into one final video
    final_clip = concatenate_videoclips(clips)
    final_clip.write_videofile("final_output.mp4")
finally:
    # Release the readers held by the source videos, even if something failed.
    # After this point `clips` and `final_clip` can no longer be rendered again.
    for source_clip in source_clips:
        source_clip.close()

Moviepy - Building video final_output.mp4.
MoviePy - Writing audio in final_outputTEMP_MPY_wvf_snd.mp3


                                                                    

MoviePy - Done.
Moviepy - Writing video final_output.mp4



                                                                

Moviepy - Done !
Moviepy - video ready final_output.mp4
