# 1. Define  MaxInfo Block

In [None]:
# define svd 
"""
r is the number of leading left singular vectors (columns of U) to keep.
list_of_feature holds the features of one video (video -> images -> features); each row is one frame's features.
"""
def svd_reduce_dim(list_of_feature, r = 6):
    """Return the first ``r`` left singular vectors of a feature matrix.

    Args:
        list_of_feature: Array-like handed unchanged to ``np.linalg.svd``
            (presumably one row per frame -- confirm against the caller).
        r: Number of leading columns of ``U`` to keep.

    Returns:
        ``U[:, :r]`` from the reduced SVD (``full_matrices=False``). When
        ``U`` has fewer than ``r`` columns, all of its columns are returned.
    """
    # Only U is needed; the singular values and V^T are discarded.
    left_singular_vectors = np.linalg.svd(list_of_feature, full_matrices=False)[0]
    return left_singular_vectors[:, :r]

# define max vol
"""
piv holds the indices of the selected key frames.
initail_M is the SVD output.
"""
import numpy as np

from maxvolpy.maxvolpy.maxvol import rect_maxvol

def max_vol_extract_index(initail_M, tol, minK=1, maxK=500):
    """Pick row indices of ``initail_M`` with rectangular maxvol.

    Args:
        initail_M: Matrix whose rows are candidates (the SVD output).
        tol: Tolerance forwarded to ``rect_maxvol``. It was previously
            ignored in favour of a hard-coded ``0.1``.
        minK: Lower bound on the number of selected rows, forwarded to
            ``rect_maxvol``.
        maxK: Upper bound on the number of selected rows, forwarded to
            ``rect_maxvol``.

    Returns:
        ``piv``, the array of selected row indices returned by
        ``rect_maxvol``.
    """
    piv, C = rect_maxvol(initail_M, tol=tol, minK=minK, maxK=maxK)

    # Diagnostic: prints whether C times the selected rows reproduces the
    # input matrix (within np.allclose tolerances).
    print(np.allclose(initail_M, C.dot(initail_M[piv])))

    return piv

import torch


def reduce_video_frames(vision_model, vision_processor, video_input, max_frames=50):
    """Keep only the frames picked by SVD + maxvol on their image features.

    Every frame is embedded with ``vision_model.get_image_features``. The
    stacked embeddings are reduced with ``svd_reduce_dim`` (``r=8``), and the
    rows chosen by ``max_vol_extract_index`` (``tol=0.30``) become the indices
    of the frames that are kept.

    Args:
        vision_model: Model exposing ``get_image_features(**inputs)``.
        vision_processor: Callable invoked as
            ``vision_processor(images=frame, return_tensors="pt", padding=True)``
            whose result supports ``.to(device)``.
        video_input: Frames of one video. It is iterated frame by frame and
            finally indexed with an array of integer indices, so it must
            support that kind of indexing (e.g. a NumPy array).
        max_frames: Currently unused; kept so existing callers keep working.

    Returns:
        ``video_input`` indexed by the selected frame indices in ascending
        order. When more than one frame is selected and the count is odd, the
        last selected index is dropped so the count is even
        (NOTE(review): presumably a downstream even-frame requirement --
        confirm). An empty ``video_input`` is returned unchanged.
    """

    def auto_select_device():
        """Return the first CUDA device that can be entered, else the CPU."""
        if torch.cuda.is_available():
            for gpu_index in range(torch.cuda.device_count()):
                try:
                    device = torch.device(f"cuda:{gpu_index}")
                    # Entering the context checks that the device is usable.
                    with torch.cuda.device(device):
                        pass
                    return device
                except Exception:
                    # Best effort: try the next GPU instead of failing, but
                    # do not swallow KeyboardInterrupt/SystemExit.
                    continue
        return torch.device("cpu")

    # Nothing to embed or select from.
    if len(video_input) == 0:
        return video_input

    device = auto_select_device()

    image_features = []
    with torch.no_grad():
        for frame in video_input:
            inputs = vision_processor(images=frame, return_tensors="pt", padding=True).to(device)
            # One frame per call, so take the single row of the output.
            image_feature = vision_model.get_image_features(**inputs)[0]
            image_features.append(image_feature.cpu())

    # Build an explicit (num_frames, feature_dim) float32 matrix for NumPy's
    # SVD instead of relying on implicit conversion of a list of tensors.
    feature_matrix = torch.stack(image_features).float().numpy()

    low_dimensions = svd_reduce_dim(feature_matrix, r=8)

    selections = max_vol_extract_index(low_dimensions, tol=0.30)

    # Restore temporal order of the selected frames.
    selections.sort()

    filtered_frames = selections

    if len(filtered_frames) > 1 and len(filtered_frames) % 2 != 0:
        filtered_frames = filtered_frames[:-1]

    return video_input[filtered_frames]

def reduce_video_frames1(video_input, max_frames=768):
    """Uniformly subsample a video so it has at most ``max_frames`` frames.

    Args:
        video_input: 4-D input whose first axis is the frame axis
            (documented by the original author as a ``torch.Tensor`` of
            shape ``[num_frames, channels, height, width]``).
        max_frames (int): Maximum number of frames in the result.

    Returns:
        ``video_input`` itself when it already has at most ``max_frames``
        frames; otherwise the frames at ``max_frames`` evenly spaced
        positions between the first and the last frame (inclusive).
    """
    # The 4-way unpack also rejects inputs that are not 4-dimensional.
    total_frames, _channels, _height, _width = video_input.shape

    if total_frames > max_frames:
        # Evenly spaced positions, truncated to integer frame indices.
        positions = torch.linspace(0, total_frames - 1, max_frames)
        return video_input[positions.long()]

    return video_input



# 2. Define Utils

In [None]:
import os
import re
import math
import json
import argparse
import warnings
import traceback
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers import AutoProcessor, CLIPModel
from qwen_vl_utils import process_vision_info

# Silence only UserWarnings whose message starts with 'TypedStorage is deprecated'.
warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')

def split_list(lst, n):
    """Split a list into n (roughly) equal-sized chunks.

    Each chunk has ``ceil(len(lst) / n)`` items except possibly the last,
    which holds the remainder. Fewer than ``n`` chunks can be returned
    (e.g. 5 items with ``n=4`` gives 3 chunks).

    Args:
        lst: Sequence to split; only ``len`` and slicing are used.
        n: Requested number of chunks.

    Returns:
        A list of consecutive slices of ``lst``; an empty list when ``lst``
        is empty.
    """
    # An empty input would give chunk_size == 0, which range() rejects as a
    # step, so handle it up front.
    if len(lst) == 0:
        return []
    chunk_size = math.ceil(len(lst) / n)
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]

def get_chunk(lst, n, k):
    """Return chunk number ``k`` of ``lst`` after splitting it with ``split_list(lst, n)``."""
    return split_list(lst, n)[k]
class EgoschemaDataset(Dataset):
    """Dataset that pairs each question record with its video file and prompt.

    Each record in ``data_list`` is a mapping that provides ``'q_uid'``,
    ``'question'`` and ``'option 0'`` ... ``'option 4'``. The video is looked
    up in ``data_folder`` as ``<q_uid><ext>`` for each extension in
    ``video_formats``, in order.
    """

    # Extensions tried, in order, when locating the video of a question.
    video_formats = ['.mp4', '.avi', '.mov', '.mkv']

    def __init__(self, data_folder, data_list):
        """Store the video folder and the list of question records."""
        self.data_folder = data_folder
        self.data_list = data_list

    def __len__(self):
        """Return the number of question records."""
        return len(self.data_list)

    def __getitem__(self, idx):
        """Build the sample for record ``idx``.

        Returns:
            A dict with ``'q_uid'``, ``'video'`` (a dict with the keys
            ``type``, ``video`` and ``fps``) and ``'instruct'`` (the
            multiple-choice prompt text).

        Raises:
            FileNotFoundError: If no file ``<q_uid><ext>`` exists in the data
                folder for any supported extension.
        """
        line = self.data_list[idx]
        q_uid = line['q_uid']

        for fmt in self.video_formats:
            candidate = os.path.join(self.data_folder, f"{q_uid}{fmt}")
            if os.path.exists(candidate):
                video_path = candidate
                break
        else:
            # Without this, video_path would be unbound below and surface as
            # an opaque UnboundLocalError.
            raise FileNotFoundError(
                f"No video file found for q_uid {q_uid!r} in {self.data_folder!r} "
                f"(tried extensions: {', '.join(self.video_formats)})"
            )

        video_input = {"type": "video", "video": video_path, "fps": 2.0}
        question = line['question']
        a0, a1, a2, a3, a4 = line['option 0'], line['option 1'], line['option 2'], line['option 3'], line['option 4']
        instruct = f'Question: {question}\nOptions:\n(A) {a0}\n(B) {a1}\n(C) {a2}\n(D) {a3}\n(E) {a4}\nAnswer with the option\'s letter from the given choices directly and only give the best option.'

        return {
            'q_uid': q_uid,
            'video': video_input,
            'instruct': instruct,
        }

def build_egoschema_eval(args):
    """Build the DataLoader over this worker's chunk of the question file.

    Args:
        args: Namespace providing ``question_file``, ``num_chunks``,
            ``chunk_idx``, ``video_folder``, ``batch_size`` and
            ``num_workers``.

    Returns:
        A non-shuffling ``DataLoader`` over an ``EgoschemaDataset`` built from
        chunk ``chunk_idx`` of the questions.
    """
    # Close the question file deterministically instead of leaking the handle.
    with open(args.question_file, "r") as question_fp:
        questions = json.load(question_fp)
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
    dataset = EgoschemaDataset(args.video_folder, questions)
    dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers)
    return dataloader

def convert_dict(orig_dict):
    """Unwrap a batched video description into a plain single-video dict.

    Takes element ``[0]`` of the ``'type'``, ``'video'`` and ``'fps'`` entries
    of ``orig_dict`` (``fps`` is converted with ``float``) and adds a fixed
    ``'max_pixels'`` of ``360 * 420``.

    Returns:
        A new dict with the keys ``type``, ``video``, ``max_pixels`` and
        ``fps``.
    """
    return {
        'type': orig_dict['type'][0],
        'video': orig_dict['video'][0],
        'max_pixels': 360 * 420,
        'fps': float(orig_dict['fps'][0]),
    }

In [None]:
import sys
import argparse
# pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git
from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
from llava.conversation import conv_templates, SeparatorStyle
from PIL import Image
import requests
import copy
import torch
import sys
import warnings
from decord import VideoReader, cpu
import numpy as np

def load_video(video_path, max_frames_num, fps=1, force_sample=False):
    """Decode sampled frames of a video together with their timestamps.

    Frames are first taken every ``round(avg_fps / fps)`` frames. If that
    yields more than ``max_frames_num`` frames, or ``force_sample`` is set,
    ``max_frames_num`` frame indices are instead spread evenly between the
    first and the last frame.

    Args:
        video_path: Path handed to ``decord.VideoReader``.
        max_frames_num: Maximum (or, with ``force_sample``, exact) number of
            frames to return. ``0`` returns a single all-zero placeholder
            frame without opening the video.
        fps: Target sampling rate in frames per second.
        force_sample: Always use the evenly spaced sampling.

    Returns:
        A tuple ``(frames, frame_time, video_time)``: the decoded frames as a
        NumPy array, a comma-separated string of per-frame timestamps such as
        ``"0.00s,1.00s"``, and the video duration in seconds.
    """
    if max_frames_num == 0:
        # Return the same 3-tuple shape as the normal path so callers that
        # unpack three values keep working.
        return np.zeros((1, 336, 336, 3)), "0.00s", 0.0
    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    total_frame_num = len(vr)
    avg_fps = vr.get_avg_fps()
    video_time = total_frame_num / avg_fps
    # Guard against a zero stride (range() rejects it) when the requested fps
    # is more than twice the video's own frame rate.
    stride = max(1, round(avg_fps / fps))
    frame_idx = list(range(0, total_frame_num, stride))
    if len(frame_idx) > max_frames_num or force_sample:
        frame_idx = np.linspace(0, total_frame_num - 1, max_frames_num, dtype=int).tolist()
    # Timestamps are frame index / video fps in both sampling modes; dividing
    # by the stride is only correct when fps == 1.
    frame_time = ",".join(f"{i / avg_fps:.2f}s" for i in frame_idx)
    spare_frames = vr.get_batch(frame_idx).asnumpy()
    return spare_frames, frame_time, video_time
# Suppress every warning from this point on (no category or message filter).
warnings.filterwarnings("ignore")

In [None]:
def run_inference(args):
    """Run multiple-choice video QA over the dataset and write the answers.

    For every batch yielded by ``build_egoschema_eval(args)`` the video is
    decoded with ``load_video`` (128 force-sampled frames), thinned with
    ``reduce_video_frames`` and capped at 64 frames with
    ``reduce_video_frames1``. The generated text is turned into an option
    index and written by ``egoschema_dump``. If any step for a question
    raises, the line ``"<q_uid>, -1"`` is written instead and processing
    continues. Only the first element of each batch is used.

    Side effects:
        * Writes the predictions to ``args.answer_file`` (parent directories
          are created when needed).
        * Writes the list of kept-frame counts as JSON to the file
          ``frames-informations`` in the current working directory.
        * Prints the model output and the kept-frame count per question.

    Args:
        args: Namespace providing ``model_path``, ``model_name`` and
            ``answer_file``, plus everything ``build_egoschema_eval`` reads.
    """
    pretrained = args.model_path
    model_name = args.model_name

    device = "cuda"
    device_map = "auto"
    tokenizer, model, processor, _ = load_pretrained_model(pretrained, None, model_name, device_map=device_map)
    model.eval()

    vision_model_path = "openai/clip-vit-large-patch14-336"
    vision_model = CLIPModel.from_pretrained(vision_model_path, device_map="auto")
    vision_processor = AutoProcessor.from_pretrained(vision_model_path)

    answer_file = os.path.expanduser(args.answer_file)
    # dirname is '' for a bare file name, and os.makedirs('') would raise.
    answer_dir = os.path.dirname(answer_file)
    if answer_dir:
        os.makedirs(answer_dir, exist_ok=True)

    max_frames_num = 128
    conv_template = "qwen_1_5"  # Make sure you use correct chat template for different models
    frames = []

    # The context manager closes the answer file even if the loop aborts.
    with open(answer_file, "w") as ans_file:
        val_loader = build_egoschema_eval(args)

        for batch in tqdm(val_loader):

            q_uid = batch['q_uid'][0] if isinstance(batch['q_uid'], list) else batch['q_uid']

            instruct = batch['instruct'][0] if isinstance(batch['instruct'], list) else batch['instruct']

            video = convert_dict(batch['video'])

            try:
                video, _frame_time, _video_time = load_video(video['video'], max_frames_num, 1, force_sample=True)

                reduced_video_inputs = reduce_video_frames(vision_model, vision_processor, video)
                reduced_video_inputs = reduce_video_frames1(reduced_video_inputs, 64)

                # Number of frames actually fed to the model for this video.
                frames_str = len(reduced_video_inputs)
                frames.append(frames_str)

                video = processor.preprocess(reduced_video_inputs, return_tensors="pt")["pixel_values"].cuda().half()
                video = [video]

                question = DEFAULT_IMAGE_TOKEN + instruct

                # Work on a copy so the shared template is not mutated.
                conv = copy.deepcopy(conv_templates[conv_template])
                conv.append_message(conv.roles[0], question)
                conv.append_message(conv.roles[1], None)
                prompt_question = conv.get_prompt()

                input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)

                cont = model.generate(
                    input_ids,
                    images=video,
                    modalities= ["video"],
                    do_sample=False,
                    temperature=0,
                    max_new_tokens=4096,
                )

                output = tokenizer.batch_decode(cont, skip_special_tokens=True)[0].strip().rstrip(".")

                print(output)

                # Prepare a dictionary for egoschema_dump
                result = {
                    'q_uid': q_uid,
                    'instruct': instruct,
                    'frames_str': frames_str
                }

                print(result['frames_str'])

                egoschema_dump(ans_file, result, output)
            except Exception as e:
                # Best effort: record a failed prediction and keep going.
                print(f"Error processing q_uid {q_uid}: {str(e)}")
                ans_file.write(f'{q_uid}, -1\n')

    file_path = 'frames-informations'

    with open(file_path, 'w') as json_file:
        json.dump(frames, json_file)

def egoschema_dump(ans_file, line, output):
    """Parse the predicted option letter from ``output`` and write it out.

    The first capital letter A-E that is not part of a longer word is taken
    as the answer (so ``"B"``, ``"(B) ..."``, ``"( B )"`` and ``"Answer: B"``
    all give B). If there is none, the first capital A-E anywhere in the
    text is used. If the text contains no capital A-E at all, a message is
    printed and the index ``-1`` is written.

    Args:
        ans_file: Writable text file object.
        line: Mapping providing ``'q_uid'``.
        output: Text generated by the model.

    Side effects:
        Writes ``"<q_uid>, <index>\\n"`` to ``ans_file``, where index is 0-4
        for A-E or -1 when nothing could be parsed.
    """
    q_uid = line['q_uid']
    letters = ['A', 'B', 'C', 'D', 'E']

    # Prefer a stand-alone letter so the "A" of "Answer" is not mistaken for
    # option A; fall back to any capital A-E as the last resort.
    match = re.search(r'(?<![A-Za-z])[A-E](?![A-Za-z])', output) or re.search(r'[A-E]', output)

    if match is None:
        print(f'The video "{q_uid}" output "{output}" is not in the expected format')
        pred_idx = -1  # or some default value
    else:
        pred_idx = letters.index(match.group(0))

    ans_file.write(f'{q_uid}, {pred_idx}\n')

In [None]:

# Emulate a command line so argparse can be used from inside the notebook.
# The answer-file path is a placeholder: point it at the desired output file.
sys.argv = ['ipykernel_launcher.py',
            '--model-path', 'lmms-lab/LLaVA-Video-72B-Qwen2',
            '--model-name', 'llava_qwen',
            '--video-folder', 'egoschema/videos',
            '--question-file', 'egoschema/questions.json',
            '--answer-file', 'egoschema/answers.csv']

parser = argparse.ArgumentParser()
parser.add_argument('--model-path', required=True)
# run_inference reads args.model_name and the command line above passes it,
# so the option has to be declared.
parser.add_argument('--model-name', default='llava_qwen')
parser.add_argument('--video-folder', required=True)
parser.add_argument('--question-file', required=True)
parser.add_argument('--answer-file', required=True)
parser.add_argument("--num-chunks", type=int, default=1)
parser.add_argument("--chunk-idx", type=int, default=0)
parser.add_argument("--device", type=str, required=False, default='cuda:0')
parser.add_argument("--batch-size", type=int, default=1)
parser.add_argument("--num-workers", type=int, default=8)
args = parser.parse_args()

In [None]:
# Run the evaluation with the arguments parsed in the previous cell.
run_inference(args)