In [None]:
%pip install llama-index-vector-stores-lancedb
%pip install llama-index-multi-modal-11ms-openai
%pip install llama-index-embeddings-clip I
%pip install git+https://github.con/openai/CLIP.git
%pip install llama-index-readers-file
%pip install llama_index
%pip install -U openai-whisper 
%pip install lancedb
%pip install moviepy
%pip install pytube
%pip install pydub
%pip install SpeechRecognition
%pip install ffmpeg-python
%pip install soundfile
%pip install torchtorchvision
%pip install matpotlib scikit-image
%pip install ftfy regex tqdm

In [None]:
import json
import os
from pathlib import Path
from pprint import pprint

import matplotlib.pyplot as plt
import speech_recognition as sr
from google.colab import userdata
from llama_index.core import SimpleDirectoryReader, StorageContext
from llama_index.core.indices import MultiModalVectorStoreIndex
from llama_index.core.response.notebook_utils import display_source_node
from llama_index.core.schema import ImageNode
from llama_index.multi_modal_llms.openai import OpenAIMultiModal
from llama_index.vector_stores.lancedb import LanceDBVectorStore
from moviepy.editor import VideoFileClip
from PIL import Image
from pytube import YouTube

In [None]:
OPENAI_API_TOKEN = userdata.get("OPENAI_API_KEY")
os.environ["OPENAI_API_TOKEN"] = OPENAI_API_TOKEN

In [None]:
os.getcwd()

In [None]:
video_url = "https://youtu.be/3dhcmeOTZ_Q"
output_video_path = "/content/video_data/"

# from video, collecting images, audio, text
!mkdir mixed_data
output_folder = "/content/mixed_data/"
output_audio_path = "/content/mixed_data/output_audio.wav"

In [None]:
file_path = output_video_path + "input_vid.mp4"
file_path

In [None]:
def download_video(url, output_path):
    """Download a YouTube video and return basic metadata about it.

    The highest-resolution stream reported by pytube is saved as
    ``input_vid.mp4`` inside ``output_path``.

    Args:
        url: URL of the YouTube video.
        output_path: Directory the video file is written to.

    Returns:
        dict with the keys ``"Author"``, ``"Title"`` and ``"Views"``.

    Raises:
        ValueError: If pytube reports no downloadable stream for the video.
    """
    yt = YouTube(url)
    metadata = {"Author": yt.author, "Title": yt.title, "Views": yt.views}

    stream = yt.streams.get_highest_resolution()
    if stream is None:
        raise ValueError(f"No downloadable stream found for {url}")
    stream.download(output_path=output_path, filename="input_vid.mp4")

    return metadata

In [None]:
def video_to_image(video_path, output_folder):
    """Extract still frames from a video and save them as PNG files.

    Frames are sampled at 0.2 fps (one frame every five seconds) and written
    to ``output_folder`` using the ``frame%04d.png`` name pattern.

    Args:
        video_path: Path of the video file to read.
        output_folder: Directory the frames are written to; created if missing.
    """
    os.makedirs(output_folder, exist_ok=True)
    # The context manager closes the clip once the frames are written.
    with VideoFileClip(video_path) as clip:
        clip.write_images_sequence(
            os.path.join(output_folder, "frame%04d.png"), fps=0.2
        )

In [None]:
def video_to_audio(video_path, output_audio_path):
    """Write the audio track of a video to an audio file.

    Args:
        video_path: Path of the video file to read.
        output_audio_path: Path of the audio file to write.

    Raises:
        ValueError: If the video has no audio track.
    """
    # The context manager closes the clip once the audio is written.
    with VideoFileClip(video_path) as clip:
        audio = clip.audio
        if audio is None:
            raise ValueError(f"No audio track found in video: {video_path}")
        audio.write_audiofile(output_audio_path)

In [None]:
def audio_to_text(audio_path):
    """Transcribe an audio file to text with the Whisper recognizer.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The recognized text, or an empty string when the recognizer cannot
        understand the audio (a message is printed in that case).
    """
    recognizer = sr.Recognizer()

    # Read the whole file into memory; the file is closed before recognition starts.
    with sr.AudioFile(audio_path) as source:
        audio_data = recognizer.record(source)

    try:
        # Recognize the speech
        return recognizer.recognize_whisper(audio_data)
    except sr.UnknownValueError:
        print("Speech recognition could not understand the audio.")
        # Return a defined value instead of failing on an unassigned variable.
        return ""

In [None]:
metadata = download_video(video_url, output_video_path)

In [None]:
video_to_image(file_path, output_folder)

In [None]:
video_to_audio(file_path, output_audio_path)

In [None]:
text_data = audio_to_text(output_audio_path)

In [None]:
with open(output_folder + "output_text.txt", "w") as file:
    file.write(text_data)
    print("Text data saved to file")
    file.close()

In [None]:
os.remove(output_audio_path)
print("Audio file removed")

Process video
image
text


In [None]:
text_store = LanceDBVectorStore(uri="lancedb", table_name="text_collection")
image_store = LanceDBVectorStore(uri="lancedb", table_name="image_collection")

In [None]:
storage_context = StorageContext(vector_store=text_store, image_store=image_store)

In [None]:
documents = SimpelDirectroyReader(output_folder).load_data()

In [None]:
index = MultiModalVectorStoreIndex(documents, storage_context=storage_context)

In [None]:
retriever_engine = index.as_retriever(similarity_top_k=1, image_similarity_top_k=3)

In [None]:
# Prompt template for answering a query from retrieved video context.
# Placeholders filled via str.format: {context}, {metadata}, {query}.
qa_tmpl_str = (
    """ Based on the provided information, including relevant images and retrieved context from the video, \
    accurately and precisely answer the query without any additional prior knowledge.\n"""

    "--------------------------\n"
    "Context: {context}\n"
    "Metadata for video: {metadata}\n"

    "---------------------------\n"
    "Query: {query}\n"
    "Answer: "
)

In [None]:
def retrieve(retriever_engine, query_str):
    """Run a retrieval query and split the results into images and text.

    Text results are also displayed in the notebook (truncated to 200
    characters) as a side effect.

    Args:
        retriever_engine: Retriever exposing ``retrieve(query_str)``.
        query_str: The query to retrieve results for.

    Returns:
        A tuple ``(image_paths, texts)``: the ``file_path`` metadata of every
        retrieved image node, and the text of every other retrieved node.
    """
    retrieval_results = retriever_engine.retrieve(query_str)

    retrieved_images = []
    retrieved_texts = []
    for res_node in retrieval_results:
        if isinstance(res_node.node, ImageNode):
            # Keep the file path so the images can be plotted and reloaded later.
            retrieved_images.append(res_node.node.metadata["file_path"])
        else:
            display_source_node(res_node, source_length=200)
            retrieved_texts.append(res_node.text)

    return retrieved_images, retrieved_texts

In [None]:
query = " What is linear regression? explain it."

In [None]:
img , text = retrieve(retriever_engine, query)

In [None]:
def plot_images(images_path, max_images=5):
    """Show up to ``max_images`` images in a grid on one matplotlib figure.

    Paths that do not point to an existing file are skipped.

    Args:
        images_path: Iterable of image file paths.
        max_images: Maximum number of images to draw (default 5).
    """
    n_cols = 3
    # Ceiling division: enough rows for max_images, so subplot indices stay valid.
    n_rows = -(-max_images // n_cols)

    images_shown = 0
    plt.figure(figsize=(16, 9))
    for img_path in images_path:
        if images_shown >= max_images:
            break
        if not os.path.isfile(img_path):
            continue

        # Close the image file as soon as it has been drawn.
        with Image.open(img_path) as image:
            plt.subplot(n_rows, n_cols, images_shown + 1)
            plt.imshow(image)
        plt.xticks([])
        plt.yticks([])

        images_shown += 1

In [None]:
plot_images(img)

In [None]:
openai_mm_llm = OpenAIMultiModal(model="gpt-4-vision-preview", api_key=OPENAI_API_TOKEN, max_new_tokens=1500)

In [None]:
context_str = "".join(text)

In [None]:
image_documents = SimpelDirectroyReader(input_files=img).load_data()

In [None]:
query_str = query

In [None]:
metadata_str = json.dump(metadata) 

Even without retrieved context, the model can still generate text and answer questions.

In [None]:
# Prompt template sent to the multi-modal LLM.
# Placeholders filled via str.format: {context_str}, {metadata_str}, {query_str}.
qa_tmpl_str_2 = (
    """ Based on the provided information, including relevant images and retrieved context from the video, \
    accurately and precisely answer the query without any additional prior knowledge.\n"""

    "--------------------------\n"
    "Context: {context_str}\n"
    "Metadata for video: {metadata_str}\n"

    "---------------------------\n"
    "Query: {query_str}\n"
    "Answer: "
)

In [None]:
result = openai_mm_llm.complete(
    prompt = qa_tmpl_str_2.format(
        query_str = query_str, metadata_str = metadata_str, context_str=context_str
    ),
    image_documents=image_documents,
)

In [None]:
pprint(result.text)