In [1]:
import os
import time
import torch
import numpy as np

from univtg.run_on_video.video_extractor import clip, vid2clip, txt2clip
from univtg.utils.basic_utils import l2_normalize_np_array
from univtg.main.config import TestOptions, setup_model
import torch.backends.cudnn as cudnn
import logging
import sys
from IPython import get_ipython
import argparse

In [4]:
# Default `save_dir` for load_vtg_model.
EMB_DIR = "./embeddings"
# Default checkpoint path handed to the VTG option parser as --resume.
MODEL_CKPT = "./univtg/ckpts/model_raw.ckpt"
# GPU index used for CUDA_VISIBLE_DEVICES, the CLIP device and --gpu_id.
GPU_ID = 0
# Video directory; not referenced by any other cell visible here.
VIDEO_DIR = "./footages"

# Limit CUDA to the selected GPU.
# NOTE(review): once only one device is visible it is presumably re-indexed
# as device 0, so reusing GPU_ID as a device index below would only be valid
# while GPU_ID == 0 -- confirm before changing GPU_ID.
os.environ["CUDA_VISIBLE_DEVICES"] = str(GPU_ID)

# CLIP variant to load.
model_version = "ViT-B/32"
# Used by load_cached_features when computing clip timestamps
# (presumably the clip length in seconds -- TODO confirm).
clip_len = 2

# Second return value of clip.load is discarded.
clip_model, _ = clip.load(model_version, device=GPU_ID, jit=False)

def load_vtg_model(
    resume_path=MODEL_CKPT,
    save_dir=EMB_DIR,
    gpu_id=GPU_ID
):
    """
    Build the VTG model and its parsed options, from a notebook or a script.

    Inside IPython/Jupyter, ``sys.argv`` is temporarily replaced with a clean
    argument list so the option parser does not pick up the kernel's own
    command-line arguments; the original ``sys.argv`` is restored even when
    parsing fails. Outside IPython, a small argparse parser reads the command
    line, using this function's arguments as the defaults.

    Args:
        resume_path: Checkpoint path passed as ``--resume``.
        save_dir: Default for ``--save_dir`` in script mode. It is not
            forwarded in notebook mode (that flag was disabled in the
            original code).
        gpu_id: GPU index passed as ``--gpu_id``.

    Returns:
        Tuple ``(model, opt)``: the first element returned by
        ``setup_model(opt)`` and the parsed options object.
    """
    if get_ipython():
        # Keep a copy so the kernel's arguments can be put back afterwards.
        argv_backup = sys.argv.copy()
        sys.argv = [
            "notebook",
            "--resume", resume_path,
            # "--save_dir" is intentionally not forwarded here.
            "--gpu_id", str(gpu_id)
        ]
        try:
            opt = TestOptions().parse()
        finally:
            # Restore even if parsing raises (argparse exits via SystemExit),
            # otherwise the rest of the session keeps the fake argv.
            sys.argv = argv_backup
    else:
        # Running as a regular python script: honour the command line, but
        # fall back to the values this function was called with.
        parser = argparse.ArgumentParser(description='')
        parser.add_argument('--save_dir', type=str, default=save_dir)
        parser.add_argument('--resume', type=str, default=resume_path)
        parser.add_argument("--gpu_id", type=int, default=gpu_id)
        args = parser.parse_args()

        opt = TestOptions().parse(args)

    cudnn.benchmark = True
    cudnn.deterministic = False

    # Expand lr_warmup into [warmup_steps, total_steps]: a value above 1 is
    # taken as a step count, otherwise as a fraction of opt.n_epoch.
    if opt.lr_warmup > 0:
        total_steps = opt.n_epoch
        warmup_steps = (
            opt.lr_warmup if opt.lr_warmup > 1
            else int(opt.lr_warmup * total_steps)
        )
        opt.lr_warmup = [warmup_steps, total_steps]

    # Only the model is needed; the other three return values are discarded.
    model, _, _, _ = setup_model(opt)

    return model, opt

# Build the VTG model and its options once, using the defaults from the config above.
vtg_model, vtg_opt = load_vtg_model()

0


2025-12-09 12:07:19.030:INFO:univtg.main.config - setup model/optimizer/scheduler
2025-12-09 12:07:19.115:INFO:univtg.main.config - CUDA enabled.
2025-12-09 12:07:19.133:INFO:univtg.main.config - Load checkpoint from ./univtg/ckpts/model_raw.ckpt


|                     | 0                                                                                                                        |
|:--------------------|:-------------------------------------------------------------------------------------------------------------------------|
| dset_type           | vlp                                                                                                                      |
| dset_name           | vlp                                                                                                                      |
| domain_name         |                                                                                                                          |
| model_id            | univtg                                                                                                                   |
| exp_id              | omni_mini_aio_unified__epo3_f10_b10g1_s0.1_0.1                                                

2025-12-09 12:07:19.251:INFO:univtg.main.config - Loaded model saved at epoch 1 from checkpoint: ./univtg/ckpts/model_raw.ckpt


In [None]:
def convert_to_hms(seconds):
    """
    Format a number of seconds as an ``HH:MM:SS`` string.

    The value is interpreted through ``time.gmtime``, so the hour field wraps
    around after 24 hours and fractional seconds are dropped.
    """
    broken_down = time.gmtime(seconds)
    return time.strftime('%H:%M:%S', broken_down)

def load_cached_features(save_dir):
    """
    Load cached video/text features and assemble the model inputs.

    Reads the ``features`` array from ``vid.npz`` and ``txt.npz`` under
    ``save_dir``, casts both to float32, normalizes them with
    ``l2_normalize_np_array``, appends two extra columns to every video row
    and moves the batched tensors and their masks to the GPU.

    Args:
        save_dir: Directory containing ``vid.npz`` and ``txt.npz``.

    Returns:
        Tuple ``(src_vid, src_txt, m_vid, m_txt, timestamp, ctx_len)``:
        the batched (leading dim 1) video and text tensors on CUDA, all-ones
        masks over their first two dims on CUDA, a CPU ``(ctx_len, 2)``
        timestamp tensor, and the number of video rows.
    """
    def _read_features(filename):
        # np.load on an .npz returns an archive that keeps the file open;
        # the context manager closes it once the array has been copied out.
        with np.load(os.path.join(save_dir, filename)) as archive:
            return archive['features'].astype(np.float32)

    vid = _read_features('vid.npz')
    txt = _read_features('txt.npz')

    vid = torch.from_numpy(l2_normalize_np_array(vid))
    txt = torch.from_numpy(l2_normalize_np_array(txt))

    ctx_len = vid.shape[0]

    # One value per video row, offset by half of the module-level clip_len and
    # divided by ctx_len; the same value is repeated in both columns.
    timestamp = ((torch.arange(ctx_len) + clip_len/2) / ctx_len).unsqueeze(1).repeat(1, 2)

    # Temporal extent feature (TEF): start/end of each row as a fraction of
    # the whole sequence, appended as two extra feature columns.
    tef_st = torch.arange(ctx_len) / ctx_len
    tef_ed = tef_st + 1 / ctx_len
    tef = torch.stack([tef_st, tef_ed], dim=1)

    vid = torch.cat([vid, tef], dim=1)

    # Add a batch dimension of 1 and move everything the model consumes to GPU.
    src_vid = vid.unsqueeze(0).cuda()
    src_txt = txt.unsqueeze(0).cuda()
    m_vid  = torch.ones(src_vid.shape[:2]).cuda()
    m_txt  = torch.ones(src_txt.shape[:2]).cuda()

    return src_vid, src_txt, m_vid, m_txt, timestamp, ctx_len


In [None]:
def run_query(model, query, save_dir):
    """
    Run one gradient-free forward pass of ``model`` on the cached features.

    The inputs come from ``load_cached_features(save_dir)``; ``query`` is
    accepted but not read in this function. The model is switched to eval
    mode before the call.

    Returns:
        Dict with the same keys as the model output, each value moved to CPU.
    """
    (video_feats, text_feats,
     video_mask, text_mask,
     _timestamp, _ctx_len) = load_cached_features(save_dir)

    model.eval()
    with torch.no_grad():
        raw_out = model(
            src_vid=video_feats,
            src_txt=text_feats,
            src_vid_mask=video_mask,
            src_txt_mask=text_mask,
        )

    # Bring every output tensor back to host memory.
    cpu_out = {}
    for key, value in raw_out.items():
        cpu_out[key] = value.cpu()
    return cpu_out


In [None]:
def extract_video(video_path, save_dir=EMB_DIR):
    """
    Run ``vid2clip`` on a video with the module-level CLIP model.

    Args:
        video_path: Path of the video handed to ``vid2clip``.
        save_dir: Directory handed to ``vid2clip``; defaults to ``EMB_DIR``.
            The previous code referenced ``SAVE_DIR``, which is not defined
            in the visible notebook and raised ``NameError`` when called.
            Presumably this is where ``vid.npz`` gets written -- TODO confirm.
    """
    print("Extracting video features...")
    vid2clip(clip_model, video_path, save_dir)
    print("Done.")

def embed_text(query, save_dir=EMB_DIR):
    """
    Run ``txt2clip`` on a text query with the module-level CLIP model.

    Args:
        query: Text handed to ``txt2clip``.
        save_dir: Directory handed to ``txt2clip``; defaults to ``EMB_DIR``.
            The previous code referenced ``SAVE_DIR``, which is not defined
            in the visible notebook and raised ``NameError`` when called.
            Presumably this is where ``txt.npz`` gets written -- TODO confirm.
    """
    txt2clip(clip_model, query, save_dir)