In [1]:
from dotenv import load_dotenv
import os
import glob
import uuid
import tempfile

from nomic import embed, login as nomic_login
import ffmpeg
import openai
import pymongo

In [2]:
# Load secrets from the backend's .env file, then build the service clients.
load_dotenv("../backend/.env")

# OpenAI: this lookup tolerates a missing variable (it yields None).
open_ai_client = openai.OpenAI(api_key=os.environ.get("openai_api_key"))

# Nomic and MongoDB: both lookups raise KeyError when the variable is unset.
nomic_login(token=os.environ["nomic_api_key"])
mongo_client = pymongo.MongoClient(os.environ["MONGO_URI"])

In [3]:
def video_to_images(video_path, output_path, fps=2):
    """Extract still frames from a video into numbered PNG files.

    Frames are written into ``output_path`` using the file name pattern
    ``image_%04d.png``.

    Args:
        video_path: Path of the source video.
        output_path: Directory that receives the PNG frames. It is created
            when it does not exist yet.
        fps: Number of frames to sample per second of video. Defaults to 2,
            the rate this function previously hard-coded. Code that converts
            a frame index back into seconds must divide by the same value.
    """
    # Make sure the target directory exists before ffmpeg writes into it.
    os.makedirs(output_path, exist_ok=True)

    (
        ffmpeg.input(video_path)
        .filter('fps', fps=fps)
        .output(os.path.join(output_path, 'image_%04d.png'))
        # Pass -y so ffmpeg never prompts about existing files, matching the
        # other ffmpeg calls in this notebook.
        .run(overwrite_output=True)
    )


# Example usage: dump the frames of the demo video into ./output_images
video_path, output_path = 'inputs/video.mp4', 'output_images'
video_to_images(video_path=video_path, output_path=output_path)

ffmpeg version 6.1.1 Copyright (c) 2000-2023 the FFmpeg developers
  built with Apple clang version 14.0.0 (clang-1400.0.29.202)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/6.1.1_7 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags= --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libopenvino --enable-li

In [4]:
# Create embeddings for the extracted frames and store them in MongoDB.
db = mongo_client["final-db"]
collection = db["messi-video"]

# glob returns files in no guaranteed order. The list position is stored as
# the frame's "ts", so sort the paths to make position follow the frame number
# in the file name. Sorting by (length, text) keeps numeric order even if the
# counter grows past its zero padding.
images = sorted(glob.glob("output_images/*.png"), key=lambda path: (len(path), path))

image_embeddings = embed.images(images)

# The loop variable must not be called `embed`: that would overwrite the
# imported nomic `embed` module, which the later `embed.text(...)` cell needs.
frame_documents = [
    {"ts": frame_index, "vector-embedding": frame_embedding}
    for frame_index, frame_embedding in enumerate(image_embeddings['embeddings'])
]

# One bulk insert instead of one round trip per frame; skipped when no frames
# were found, because insert_many rejects an empty list.
if frame_documents:
    collection.insert_many(frame_documents)

{'prompt_tokens': 3840, 'total_tokens': 3840}
{'prompt_tokens': 16384, 'total_tokens': 16384}
{'prompt_tokens': 16384, 'total_tokens': 16384}
{'prompt_tokens': 16384, 'total_tokens': 16384}
{'prompt_tokens': 16384, 'total_tokens': 16384}
{'prompt_tokens': 16384, 'total_tokens': 16384}
{'prompt_tokens': 16384, 'total_tokens': 16384}
{'prompt_tokens': 16384, 'total_tokens': 16384}
{'prompt_tokens': 16384, 'total_tokens': 16384}
{'prompt_tokens': 16384, 'total_tokens': 16384}


In [7]:
# Add a vector index for the stored embeddings (the search stage below refers to it as "final_index").
# collection.create_index([('vector-embedding', 'vector')], name='final_index', numDimensions=768)

# Index definition as JSON:
# {
#   "fields": [
#     {
#       "numDimensions": 768,
#       "path": "vector-embedding",
#       "similarity": "cosine",
#       "type": "vector"
#     }
#   ]
# }

In [None]:
# Read subtitles
def read_srt(file_path, encoding="utf-8"):
    """Return the full text of a subtitle (.srt) file.

    Args:
        file_path: Path of the subtitle file.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's default encoding.

    Returns:
        The file content as a single string.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        return file.read()

# Load the raw subtitle text that the next cell summarises.
messi_transcript = read_srt(file_path="inputs/messi.srt")

In [None]:
# Ask the chat model for a prose summary of the subtitles.
# The client created in the setup cell is named `open_ai_client`; the previous
# spelling `openai_client` was never defined and raised NameError.
completion = open_ai_client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are a helpful video transcriber. You will be given the subtitles of a video and you need to convert it into text. Don't make things up. Just write what you hear. The user will provide you with the subtitles. Generate a summary of what is being said in the subtitles. The summary should not acknowledge the subtitles. Make sure to write in your own words and understand the context and meaning of the subtitles. Give out minimum 6 sentences."},
        {"role": "user", "content": messi_transcript},
    ],
)
text_summary = completion.choices[0].message.content

In [None]:
# Embed the summary as a search query; its first embedding is used as the
# query vector of the vector search stage.
search_query_embed = embed.text(
    texts=[text_summary],
    task_type="search_query",
)

In [None]:
def make_video_clip(input_video_filename, output_video_filename, start_time, end_time):
    """Cut the span from ``start_time`` to ``end_time`` out of a video.

    The bounds are handed to ffmpeg as the input's ``ss`` and ``to`` options.
    The clip is written to ``output_video_filename``, replacing any existing
    file of that name.
    """
    trimmed_source = ffmpeg.input(input_video_filename, ss=start_time, to=end_time)
    clip_job = trimmed_source.output(output_video_filename)
    clip_job.run(overwrite_output=True)

In [None]:
def combine_timestamps(input_video_filename, output_video_filename, timestamp_windows):
    """Cut several time windows out of a video and join them into one file.

    Each window is rendered to its own clip inside a temporary directory, the
    clip paths are listed in a concat list file, and ffmpeg then joins the
    clips with stream copy into ``output_video_filename`` (overwriting it if
    it exists). The temporary directory is removed afterwards.

    Args:
        input_video_filename: Path of the source video.
        output_video_filename: Path of the combined video to write.
        timestamp_windows: Iterable of ``(start, end)`` pairs. Both values are
            forwarded unchanged to ``make_video_clip``.

    Raises:
        ValueError: If ``timestamp_windows`` contains no windows.
    """
    windows = list(timestamp_windows)
    if not windows:
        # Nothing to concatenate: fail with a clear message rather than inside ffmpeg.
        raise ValueError("timestamp_windows must contain at least one (start, end) window")

    with tempfile.TemporaryDirectory() as tempdir:
        concat_entries = []

        for index, window in enumerate(windows):
            # Sequential names are unique by construction and follow playback order.
            clip_path = f"{tempdir}/clip_{index:05d}.mp4"

            make_video_clip(input_video_filename, clip_path, window[0], window[1])
            concat_entries.append(_concat_file_entry(clip_path))

        # The concat demuxer reads the clip list from a text file.
        list_path = f"{tempdir}/clips.txt"

        with open(list_path, "w") as fp:
            fp.write("\n".join(concat_entries))

        # Join the clips without re-encoding them.
        ffmpeg.input(list_path, format='concat', safe=0).output(output_video_filename, c='copy').run(overwrite_output=True)


def _concat_file_entry(path):
    """Format ``path`` as a ``file`` line of an ffmpeg concat list."""
    # Single-quote the path so spaces and special characters survive parsing.
    # A single quote inside the path is written as '\'' (close quote, escaped
    # quote, reopen quote).
    escaped = path.replace("'", "'\\''")
    return f"file '{escaped}'"


In [None]:
# Aggregation stage for the vector search: query the "final_index" index on the
# "vector-embedding" field, considering 100 candidates and keeping 50 results.
search_query = {
    "$vectorSearch": {
        "queryVector": search_query_embed["embeddings"][0],
        "path": "vector-embedding",
        "numCandidates": 100,
        "index": "final_index",
        "limit": 50,
    },
}

In [None]:
# Run the vector search against the collection that holds the frame embeddings.
# The previous name `collection_imbeds` was never defined; the embeddings were
# inserted into `collection`.
pipeline = [search_query]
results = collection.aggregate(pipeline)
results_as_dict = list(results)
timestamps = [result["ts"] for result in results_as_dict]

# "ts" is a frame index and frames were sampled at 2 per second, so
# index / 2 is the frame's position in seconds. Sorted for playback order.
sT = [frame_index / 2 for frame_index in sorted(timestamps)]
print(sT)

In [None]:
import datetime

# Turn every hit (seconds from the start of the video) into a half-second
# (start, end) window, with both bounds rendered via str(timedelta).
final_timestamps = [
    (
        str(datetime.timedelta(seconds=start_seconds)),
        str(datetime.timedelta(seconds=start_seconds + 0.5)),
    )
    for start_seconds in sT
]

In [None]:
# Cut the matched windows out of the source video and join them into one clip.
combine_timestamps(
    input_video_filename="inputs/video.mp4",
    output_video_filename="outputs/out_demo.mp4",
    timestamp_windows=final_timestamps,
)

In [None]:
# Turn the summary into speech and save the audio to disk.
speech_file_path = "outputs/speech.mp3"
response = open_ai_client.audio.speech.create(
    model="tts-1",
    voice="alloy",
    input=text_summary,
)
response.stream_to_file(speech_file_path)

In [None]:
def combine_audio_video(input_video_file, input_audio_file, output_video_file):
    """Combine a video file and an audio file into a single output file.

    Both inputs are passed to ffmpeg's ``concat`` filter with ``v=1, a=1``,
    i.e. one video stream and one audio stream in the result.

    Args:
        input_video_file: Path of the file supplying the video stream.
        input_audio_file: Path of the file supplying the audio stream.
        output_video_file: Path of the combined file to write. An existing
            file is overwritten.
    """
    input_video = ffmpeg.input(input_video_file)
    input_audio = ffmpeg.input(input_audio_file)

    (
        ffmpeg.concat(input_video, input_audio, v=1, a=1)
        .output(output_video_file)
        # Pass -y like the other ffmpeg helpers in this notebook, so a re-run
        # replaces the previous output instead of stopping at ffmpeg's
        # overwrite check.
        .run(overwrite_output=True)
    )

# Put the generated narration onto the combined clip.
combine_audio_video(
    input_video_file="outputs/out_demo.mp4",
    input_audio_file=speech_file_path,
    output_video_file="outputs/final_demo_with_ai_voice.mp4",
)