In [1]:
import numpy as np
from tqdm import tqdm
import os
import json

from videograph import VideoGraph
from utils.general import *
from utils.video_processing import *
from utils.chat_api import *
from prompts import *

from face_processing import process_faces
from voice_processing import process_voices
from memory_processing import (
    process_captions,
    generate_captions_and_thinkings_with_ids,
)
from retrieve import answer_with_retrieval

# Load the two JSON configuration files used by the rest of the notebook.
# Context managers make sure each file handle is closed as soon as it is parsed
# instead of being left open for the lifetime of the kernel.
with open("configs/processing_config.json") as config_file:
    processing_config = json.load(config_file)
with open("configs/memory_config.json") as config_file:
    memory_config = json.load(config_file)



In [2]:
def process_segment(video_graph, base64_video, base64_frames, base64_audio, clip_id):
    """Run a single clip through the voice, face and caption stages.

    The stages run in a fixed order: voices, then faces, then caption
    generation (which receives the results of both earlier stages), and finally
    the generated captions are handed to ``process_captions`` once as
    "episodic" and once as "semantic", both tagged with ``clip_id``.

    Args:
        video_graph: Graph object passed to every processing stage.
        base64_video: Encoded video for the clip.
        base64_frames: Sequence of encoded frames for the clip.
        base64_audio: Encoded audio for the clip.
        clip_id: Identifier forwarded to ``process_captions`` for both caption
            types.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Voice stage: note the helper takes audio before video.
    voice_map = process_voices(video_graph, base64_audio, base64_video)
    print("Finish processing voices")

    # Face stage.
    print(f"processing {len(base64_frames)} frames...")
    face_map = process_faces(video_graph, base64_frames)
    print("Finish processing faces")

    # Caption stage: must yield exactly two collections (episodic, semantic).
    episodic, semantic = generate_captions_and_thinkings_with_ids(
        video_graph, base64_video, base64_frames, base64_audio, face_map, voice_map
    )

    # Store episodic captions first, then semantic ones.
    for memory_type, captions in (("episodic", episodic), ("semantic", semantic)):
        process_captions(video_graph, captions, clip_id, type=memory_type)

    print("Finish processing segment")


def _clip_sort_key(filename):
    """Return a sort key that orders clip files by the number in their name.

    Only the part of the name before the extension is inspected, so the "4" in
    ".mp4" no longer leaks into the number. Names without any digit sort after
    all numbered names, alphabetically, instead of raising ``ValueError``.
    """
    stem = os.path.splitext(filename)[0]
    # isdecimal() only accepts characters that int() can parse.
    digits = "".join(ch for ch in stem if ch.isdecimal())
    if digits:
        return (0, int(digits), filename)
    return (1, 0, filename)


def streaming_process_video(
    video_graph, video_path, interval_seconds, fps, segment_limit=None
):
    """Process video segments at specified intervals with given fps.

    ``video_path`` may be a single video file, which is cut into consecutive
    ``interval_seconds`` windows, or a directory of clip files (".mp4", ".avi",
    ".mov"), which are processed in the numeric order of their file names.
    Each clip is loaded with ``process_video_clip`` and, when it yields frames,
    handed to ``process_segment`` with its running index as the clip id.

    Args:
        video_graph (VideoGraph): Graph object to store video information
        video_path (str): Path to the video file or directory containing clips
        interval_seconds (float): Time interval between segments in seconds
        fps (float): Frames per second to extract from each segment
        segment_limit (int | None): Maximum number of clips to process; ``None``
            means no limit. The limit is applied the same way for files and
            directories, so ``0`` processes nothing in both cases.

    Returns:
        None: Updates video_graph in place with processed segments
    """
    if os.path.isfile(video_path):
        # Process single video file
        video_info = get_video_info(video_path)
        print(video_info)

        # Process each interval
        count = 0
        for start_time in tqdm(np.arange(0, video_info["duration"], interval_seconds)):
            # Checked before loading so the limit behaves like the directory branch.
            if segment_limit is not None and count >= segment_limit:
                break
            # A trailing window shorter than interval_seconds is skipped.
            if start_time + interval_seconds > video_info["duration"]:
                break

            print("=" * 20)

            print(f"Loading {count}-th clip starting at {start_time} seconds...")
            base64_video, base64_frames, base64_audio = process_video_clip(
                video_path, start_time, interval_seconds, fps, audio_format="wav"
            )

            # Process frames for this interval
            if base64_frames:
                print(
                    f"Starting processing {count}-th clip starting at {start_time} seconds..."
                )
                process_segment(
                    video_graph, base64_video, base64_frames, base64_audio, count
                )

            # The index advances even when a clip produced no frames.
            count += 1

    elif os.path.isdir(video_path):
        # Process directory of numbered clips, ordered by the number in each name
        video_files = sorted(
            (
                f
                for f in os.listdir(video_path)
                if f.endswith((".mp4", ".avi", ".mov"))
            ),
            key=_clip_sort_key,
        )

        for count, video_file in enumerate(tqdm(video_files)):
            if segment_limit is not None and count >= segment_limit:
                break
            print("=" * 20)
            full_path = os.path.join(video_path, video_file)
            print(f"Starting processing {count}-th clip: {full_path}")

            # start=0 and duration=None are passed so the clip file is read as given.
            base64_video, base64_frames, base64_audio = process_video_clip(
                full_path, 0, None, fps, audio_format="wav"
            )

            if base64_frames:
                process_segment(
                    video_graph, base64_video, base64_frames, base64_audio, count
                )

    else:
        # Nothing to process; say so instead of returning silently.
        print(f"Skipping {video_path}: not an existing file or directory")

In [1]:
# Build and save one video graph per configured path.
# Each entry may be a directory of clips or a single video file.
video_paths = processing_config["video_paths"]

for video_path in video_paths:
    # A fresh graph is created for every video.
    video_graph = VideoGraph(**memory_config)

    streaming_process_video(
        video_graph,
        video_path,
        interval_seconds=processing_config["interval_seconds"],
        fps=processing_config["fps"],
        segment_limit=processing_config["segment_limit"],
    )

    # Runs once per video, after all of its clips have been processed.
    video_graph.refresh_equivalences()

    # Both configs are passed along with the graph when saving.
    save_dir = "data/video_graphs"
    save_video_graph(
        video_graph, video_path, save_dir, (processing_config, memory_config)
    )

NameError: name 'processing_config' is not defined

In [2]:
# Load a previously saved video graph and answer one question against it.
# Requires `processing_config` from the first cell.
video_graph_path = "data/video_graphs/5-Poor-People-vs-1-Secret-Millionaire_60_5_5_10_20_0.3_0.6_0.75.pkl"
video_graph = load_video_graph(video_graph_path)

# Disabled debugging aids: print every text node's contents, then the contents
# attached to every edge whose weight is greater than 1.
# for text_node in video_graph.text_nodes:
#     print(video_graph.nodes[text_node].metadata['contents'])
# for nodes, weight in video_graph.edges.items():
#     if weight > 1:
#         if video_graph.nodes[nodes[0]].type in ["episodic", "semantic"]:
#            print(video_graph.nodes[nodes[0]].metadata['contents'])
#         else:
#            print(video_graph.nodes[nodes[1]].metadata['contents'])
#         print(weight)

# Overwrite the loaded graph's text-matching threshold with the configured
# retrieval threshold before querying.
video_graph.text_matching_threshold = processing_config["retrieval_threshold"]
# video_graph.refresh_equivalences()

# Alternative questions kept for quick switching.
# question = 'What does Demar Randy wear?'
# question = "Who has an OnlyFans account?"
question = "What are the people doing in the video?"
answer = answer_with_retrieval(
    video_graph,
    question,
    query_num=processing_config["query_num"],
    topk=processing_config["topk"],
)

# Disabled follow-ups: summarize the graph, save it under an "_augmented" file
# name, and visualize it.
# video_graph.summarize(logging=True)
# save_dir = "data/video_graphs"
# save_video_graph(
#     video_graph, None, save_dir, None, file_name='5-Poor-People-vs-1-Secret-Millionaire_60_5_5_10_20_0.3_0.6_0.75_augmented.pkl'
# )
# video_graph.visualize()

Loading video graph from data/video_graphs/5-Poor-People-vs-1-Secret-Millionaire_60_5_5_10_20_0.3_0.6_0.75.pkl
Generating queries 0 times


2025-03-31 12:50:52,173 - httpx - INFO - HTTP Request: POST https://search-va.byteintl.net/gpt/openapi/online/v2/crawl/openai/deployments/gpt-4o-2024-11-20/chat/completions?api-version=2024-03-01-preview "HTTP/1.1 200 OK"


Queries: ['What actions are performed by <face_0> in the video?', 'What actions are performed by <face_1> in the video?', 'What actions are performed by <face_4> in the video?', 'What interactions occur between <face_4> and <face_9>?', 'What tasks or activities are assigned by <voice_0> to the individuals?', 'What is the purpose of the meeting or gathering in the video?', 'What are the physical gestures or movements of <face_9>?', 'What are the roles or responsibilities of the individuals introduced by <voice_0>?', "What is the significance of <face_1>'s role as an executive or presenter?", "What are the reactions or responses of the participants to <voice_0>'s introduction?"]
New memories from clip 3: ['In a brightly lit studio setting, <face_10>, <face_8>, <face_9>, <face_11>, and <face_12> stand facing four individuals seated at a table draped in black cloth: <face_7>, <character_0>, <face_6>, and <face_5>.', "<face_10> wears a black jacket and light blue jeans. <face_9> is dressed 

KeyError: '\n    "clip_1"'

In [None]:
from utils.chat_api import *
from utils.general import plot_cosine_similarity_distribution

# Plot how similar one probe sentence is to the embeddings stored on the
# graph's episodic and semantic nodes.
video_graph_path = "data/video_graphs/5-Poor-People-vs-1-Secret-Millionaire_60_5_5_10_20_0.3_0.6_0.75.pkl"
video_graph = load_video_graph(video_graph_path)

graph_embeddings = []

# The node key is not needed here. It is deliberately not named `id`: at
# notebook top level that would overwrite the builtin `id` for every later cell.
for _node_id, node in video_graph.nodes.items():
    if node.type in ["episodic", "semantic"]:
        # A node may carry several embeddings; all of them are collected.
        graph_embeddings.extend(node.embeddings)

# texts = ["Clothing style of Demar Randy", "<voice_44> introduces himself as Demar Randy."]
texts = ["<face_4> points at <face_9>."]
# Only the first element of the helper's return value is used.
embs = parallel_get_embedding("text-embedding-3-large", texts)[0]

plot_cosine_similarity_distribution(graph_embeddings, embs)

In [None]:
# Print the stored contents of every text node in the loaded graph.
for text_node in video_graph.text_nodes:
    node_contents = video_graph.nodes[text_node].metadata["contents"]
    print(node_contents)

In [None]:
# Call the graph's own visualize() method on the `video_graph` loaded in an
# earlier cell (this cell fails on a fresh kernel if no graph has been loaded).
video_graph.visualize()

In [None]:
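# NOTE(review): everything in this cell is disabled scratch code. It looks like
# an earlier copy of the query-generation / retrieval helpers that now live in
# the `retrieve` module — confirm, then delete this cell.
# from retrieve import retrieve_from_videograph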
# from videograph import VideoGraph
# from utils.chat_api import (
#     generate_messages,
#     get_response_with_retry,
#     parallel_get_embedding,
# )
# from utils.general import validate_and_fix_python_list
# from prompts import prompt_memory_retrieval

# MAX_RETRIES = 3


# def generate_queries(question, existing_knowledge=None, query_num=1):
#     input = [
#         {
#             "type": "text",
#             "content": prompt_memory_retrieval.format(
#                 question=question,
#                 query_num=query_num,
#                 existing_knowledge=existing_knowledge,
#             ),
#         }
#     ]
#     messages = generate_messages(input)
#     model = "gpt-4o-2024-11-20"
#     queries = None
#     for i in range(MAX_RETRIES):
#         print(f"Generating queries {i} times")
#         queries = get_response_with_retry(model, messages)[0]
#         queries = validate_and_fix_python_list(queries)
#         if queries is not None:
#             break
#     if queries is None:
#         raise Exception("Failed to generate queries")
#     return queries


# def retrieve_from_videograph(videograph, question, topk=3):
#     queries = generate_queries(question)
#     print(f"Queries: {queries}")

#     model = "text-embedding-3-large"
#     query_embeddings = parallel_get_embedding(model, queries)[0]

#     related_nodes = []

#     for query_embedding in query_embeddings:
#         nodes = videograph.search_text_nodes(query_embedding)
#         related_nodes.extend(nodes)

#     related_nodes = list(set(related_nodes))
#     return related_nodes


# question = "Denny"
# retrieved_nodes = retrieve_from_videograph(video_graph, question)
# print(retrieved_nodes)