In [None]:
""" 
r is the number of leading left-singular vectors (columns of U) to keep.
list_of_feature holds the features of one video (video -> frames -> features); each row is one frame's feature vector.
"""
def svd_reduce_dim(list_of_feature, r = 6):
    """Reduce per-frame features to their leading left-singular vectors.

    Args:
        list_of_feature: 2-D array-like with one row per frame and one column
            per feature. May be a NumPy array or a list/tuple of row vectors
            (anything ``np.asarray`` can convert row by row).
        r: Number of leading columns of ``U`` to keep.

    Returns:
        np.ndarray: The first ``r`` columns of ``U`` from the thin SVD
        ``A = U @ diag(S) @ Vh``. Fewer than ``r`` columns are returned when
        ``U`` has fewer columns. The result can be passed directly to maxvol.

    Raises:
        numpy.linalg.LinAlgError: If the input is not at least 2-D or the SVD
            does not converge.
    """
    # Convert row by row so a list of array-like rows becomes one dense matrix.
    if isinstance(list_of_feature, (list, tuple)):
        matrix = np.stack([np.asarray(row) for row in list_of_feature])
    else:
        matrix = np.asarray(list_of_feature)

    # np.linalg.svd has no half-precision kernel and raises TypeError on
    # float16 input, so upcast to the nearest supported precision first.
    if matrix.dtype == np.float16:
        matrix = matrix.astype(np.float32)

    # Only the left-singular vectors are needed.
    u, _, _ = np.linalg.svd(matrix, full_matrices=False)
    return u[:, :r]

"""
piv holds the indices of the selected key frames.
initail_M is the output of svd_reduce_dim (the SVD-reduced feature matrix).
"""
import numpy as np

from maxvolpy.maxvolpy.maxvol import rect_maxvol

def max_vol_extract_index(initail_M, tol, minK=1, maxK=500):
    """Pick key-frame row indices from a feature matrix with rect-maxvol.

    Args:
        initail_M: 2-D array with one row per frame (the output of
            ``svd_reduce_dim``).
        tol: Tolerance forwarded to ``rect_maxvol``.
        minK: Lower bound on the number of selected rows, forwarded to
            ``rect_maxvol``.
        maxK: Upper bound on the number of selected rows, forwarded to
            ``rect_maxvol``.

    Returns:
        The pivot array ``piv`` returned by ``rect_maxvol``: the indices of the
        selected rows of ``initail_M``.
    """
    # The previous call hard-coded tol=0.1 and dropped minK/maxK, so the
    # caller's arguments had no effect. Forward them as given.
    piv, C = rect_maxvol(initail_M, tol=tol, minK=minK, maxK=maxK)

    # Sanity check: the selected rows together with the coefficient matrix C
    # should reproduce the input (prints True/False).
    print(np.allclose(initail_M, C.dot(initail_M[piv])))

    return piv

def reduce_video_frames(vision_model, vision_processor, video_input, max_frames=50):
    """Keep only the key frames of one video.

    Each frame is embedded with ``vision_model.get_image_features``, the
    stacked embeddings are reduced with ``svd_reduce_dim`` (r=8), and the rows
    chosen by ``max_vol_extract_index`` (tol=0.30) are the frames kept.

    Args:
        vision_model: Model exposing ``get_image_features(**inputs)``.
        vision_processor: Callable preprocessor; invoked per frame as
            ``vision_processor(images=frame, return_tensors="pt", padding=True)``.
        video_input: Iterable of frames that also supports indexing with an
            index array. Presumably a torch tensor shaped
            ``[num_frames, channels, height, width]`` -- TODO confirm with caller.
        max_frames: Currently unused; kept for interface compatibility.

    Returns:
        The selected frames of ``video_input`` in temporal order. When more
        than one frame is selected and the count is odd, the last selected
        frame is dropped so the count is even. An empty ``video_input`` is
        returned unchanged.
    """

    def auto_select_device():
        """Return the first CUDA device that can be activated, else the CPU."""
        if torch.cuda.is_available():
            for gpu_index in range(torch.cuda.device_count()):
                try:
                    device = torch.device(f"cuda:{gpu_index}")
                    with torch.cuda.device(device):
                        pass
                    return device
                except Exception:
                    # This GPU cannot be used; try the next one.
                    continue
        return torch.device("cpu")

    device = auto_select_device()

    frame_features = []
    for frame in video_input:
        with torch.no_grad():
            inputs = vision_processor(images=frame, return_tensors="pt", padding=True).to(device)
            # The processor yields a batch of one image; keep its single row.
            frame_feature = vision_model.get_image_features(**inputs)[0]
        frame_features.append(frame_feature.cpu())

    # Nothing to select from.
    if len(frame_features) == 0:
        return video_input

    # One dense float32 matrix (frames x features) that NumPy's SVD accepts,
    # regardless of the precision the vision model runs in.
    feature_matrix = torch.stack(frame_features).float().numpy()

    low_dimensions = svd_reduce_dim(feature_matrix, r=8)
    selections = max_vol_extract_index(low_dimensions, tol=0.30)

    # Restore temporal order of the selected frames.
    filtered_frames = np.sort(np.asarray(selections))

    # Keep an even number of frames whenever more than one was selected.
    if len(filtered_frames) > 1 and len(filtered_frames) % 2 != 0:
        filtered_frames = filtered_frames[:-1]

    if isinstance(video_input, torch.Tensor):
        # Tensor indexing is only guaranteed for int64 index tensors.
        return video_input[torch.as_tensor(filtered_frames, dtype=torch.long)]
    return video_input[filtered_frames]

def reduce_video_frames1(video_input, max_frames=768):
    """
    Uniformly subsample a video tensor so it holds at most ``max_frames`` frames.

    Args:
    video_input (torch.Tensor): Video tensor of shape [num_frames, channels, height, width]
    max_frames (int): Upper bound on the number of frames in the result

    Returns:
    torch.Tensor: ``video_input`` itself when it is already short enough,
    otherwise the frames at ``max_frames`` evenly spaced positions between the
    first and the last frame.
    """
    # Unpacking doubles as a rank check: anything but a 4-D input raises here.
    total_frames, _channels, _height, _width = video_input.shape

    if total_frames > max_frames:
        # Evenly spaced positions over [0, total_frames - 1], truncated to ints.
        frame_ids = torch.linspace(0, total_frames - 1, max_frames).long()
        return video_input[frame_ids]

    # Short enough already: hand back the very same tensor object.
    return video_input

In [None]:
import argparse
import json
import math
import os
import re
import traceback
import warnings

import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor, CLIPModel

from qwen_vl_utils import process_vision_info

warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')

def split_list(lst, n):
    """Split a list into at most n contiguous chunks of (roughly) equal size.

    The chunk size is ``ceil(len(lst) / n)``, so the last chunk may be shorter
    and fewer than ``n`` chunks may come back (e.g. 5 items with n=4 yields
    3 chunks). An empty ``lst`` yields an empty list of chunks.
    """
    # Without this guard an empty list gives chunk_size == 0, which makes
    # range() raise "arg 3 must not be zero".
    if len(lst) == 0:
        return []
    chunk_size = math.ceil(len(lst) / n)  # ceiling division
    return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]

def get_chunk(lst, n, k):
    """Return the k-th (0-based) of the chunks that split_list(lst, n) produces."""
    return split_list(lst, n)[k]
class EgoschemaDataset(Dataset):
    """Map-style dataset yielding one multiple-choice video question per item.

    Each entry of ``data_list`` is a dict with the keys ``q_uid``,
    ``question`` and ``option 0`` .. ``option 4``. The video file for an entry
    is looked up in ``data_folder`` as ``<q_uid><ext>`` for each extension in
    ``video_formats``, in order.
    """
    # Extensions probed, in priority order, when locating a video file.
    video_formats = ['.mp4', '.avi', '.mov', '.mkv']

    def __init__(self, data_folder, data_list):
        """Store the video folder and the list of question records."""
        self.data_folder = data_folder
        self.data_list = data_list

    def __len__(self):
        """Number of question records."""
        return len(self.data_list)

    def _find_video(self, q_uid):
        """Return the first existing ``<data_folder>/<q_uid><ext>`` path.

        Raises:
            FileNotFoundError: If no file exists for any supported extension.
        """
        for fmt in self.video_formats:
            candidate = os.path.join(self.data_folder, f"{q_uid}{fmt}")
            if os.path.exists(candidate):
                return candidate
        raise FileNotFoundError(
            f"No video found for q_uid {q_uid!r} in {self.data_folder!r} "
            f"(tried extensions: {', '.join(self.video_formats)})"
        )

    def __getitem__(self, idx):
        """Build the sample for record ``idx``.

        Returns:
            dict with ``q_uid`` (str), ``video`` (a video message dict with
            ``type``, ``video``, ``max_pixels`` and ``fps``) and ``instruct``
            (the question plus lettered options and the answering instruction).

        Raises:
            FileNotFoundError: If the record's video file cannot be located.
        """
        line = self.data_list[idx]
        q_uid = line['q_uid']

        # Fail with a clear error instead of an unbound-variable error when
        # the video is missing.
        video_path = self._find_video(q_uid)

        video_input = {"type": "video", "video": video_path, "max_pixels": 360 * 420, "fps": 3.0}

        question = line['question']
        a0, a1, a2, a3, a4 = line['option 0'], line['option 1'], line['option 2'], line['option 3'], line['option 4']
        instruct = f'Question: {question}\nOptions:\n(A) {a0}\n(B) {a1}\n(C) {a2}\n(D) {a3}\n(E) {a4}\nAnswer with the option\'s letter from the given choices directly and only give the best option.'

        return {
            'q_uid': q_uid,
            'video': video_input,
            'instruct': instruct,
        }

def build_egoschema_eval(args):
    """Build the evaluation DataLoader for one chunk of the question file.

    Args:
        args: Namespace providing ``question_file`` (path to a JSON list of
            question records), ``num_chunks``, ``chunk_idx``, ``video_folder``,
            ``batch_size`` and ``num_workers``.

    Returns:
        DataLoader over an ``EgoschemaDataset`` of the selected chunk, in file
        order (``shuffle=False``).
    """
    # Context manager so the question file is closed as soon as it is parsed.
    with open(args.question_file, "r") as question_fp:
        questions = json.load(question_fp)

    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
    dataset = EgoschemaDataset(args.video_folder, questions)
    dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers)
    return dataloader

def convert_dict(orig_dict):
    """Turn a batched video dict back into a plain single-video message dict.

    Takes element 0 of the ``type``, ``video`` and ``fps`` entries (``fps`` is
    converted to a Python float) and sets ``max_pixels`` to the fixed value
    360 * 420; any ``max_pixels`` entry in ``orig_dict`` is not read.
    """
    return {
        'type': orig_dict['type'][0],
        'video': orig_dict['video'][0],
        # Fixed pixel budget rather than the batched value.
        'max_pixels': 360 * 420,
        'fps': float(orig_dict['fps'][0]),
    }

In [None]:
def run_inference(args):
    """Answer every question of the selected chunk and write the predictions.

    For each sample the video is decoded with ``process_vision_info``, reduced
    to key frames with ``reduce_video_frames``, capped at 180 frames with
    ``reduce_video_frames1``, and passed with the question to the Qwen2-VL
    model. One line ``<q_uid>, <index>`` is written per sample to
    ``args.answer_file``; ``-1`` marks samples that failed or whose output
    could not be parsed. The per-sample frame counts are dumped as JSON to the
    file ``frames`` in the working directory.

    Args:
        args: Namespace providing ``model_path`` and ``answer_file``, plus the
            fields read by ``build_egoschema_eval``.

    Note:
        Only the first element of each batch is processed, so the loader is
        expected to run with ``batch_size=1``.
    """
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        args.model_path,
        torch_dtype="auto",
        attn_implementation="flash_attention_2",
        device_map="auto",
    )
    processor = AutoProcessor.from_pretrained(args.model_path)

    # Image encoder used only to embed frames for key-frame selection.
    vision_model_path = "openai/clip-vit-large-patch14-336"
    vision_model = CLIPModel.from_pretrained(vision_model_path, device_map="auto")
    vision_processor = AutoProcessor.from_pretrained(vision_model_path)

    answer_file = os.path.expanduser(args.answer_file)
    answer_dir = os.path.dirname(answer_file)
    if answer_dir:
        # os.makedirs('') raises, so skip it for a bare file name.
        os.makedirs(answer_dir, exist_ok=True)

    # Number of frames actually fed to the model, one entry per sample.
    frames = []

    val_loader = build_egoschema_eval(args)

    # The context manager closes the answer file even if the loop aborts.
    with open(answer_file, "w") as ans_file:
        for batch in tqdm(val_loader):

            # Example sample:
            #   q_uid: 004a7f7e-9e83-431f-bc98-859cf9024e93
            #   instruct:
            #     Question: What are the main ingredients and tools used during the video, ...
            #     Options:
            #     (A) ... (B) ... (C) ... (D) ... (E) ...
            #     Answer with the option's letter from the given choices directly and only give the best option.

            q_uid = batch['q_uid'][0] if isinstance(batch['q_uid'], list) else batch['q_uid']

            instruct = batch['instruct'][0] if isinstance(batch['instruct'], list) else batch['instruct']

            # Undo the DataLoader's collation of the video message dict.
            video = convert_dict(batch['video'])

            messages = [
                {
                    "role": "user",
                    "content": [
                        video,
                        {"type": "text", "text": instruct},
                    ],
                }
            ]

            try:
                text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

                image_inputs, video_inputs = process_vision_info(messages)

                # Frame count before reduction.
                print(len(video_inputs[0]))

                # Key-frame selection, then a hard cap of 180 frames, applied
                # to each video separately (reduce_video_frames1 expects a
                # single video tensor, not the list of videos).
                reduced_video_inputs = [
                    reduce_video_frames1(
                        reduce_video_frames(vision_model, vision_processor, vi),
                        180,
                    )
                    for vi in video_inputs
                ]

                # Frame count after reduction.
                print(len(reduced_video_inputs[0]))

                frames.append(len(reduced_video_inputs[0]))

                inputs = processor(
                    text=[text],
                    images=image_inputs,
                    videos=reduced_video_inputs,
                    padding=True,
                    return_tensors="pt",
                )
                inputs = inputs.to(model.device)

                generated_ids = model.generate(**inputs, max_new_tokens=128)
                # Strip the prompt tokens so only the newly generated answer is decoded.
                generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
                output = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

                # Record handed to egoschema_dump.
                result = {
                    'q_uid': q_uid,
                    'instruct': instruct
                }
                egoschema_dump(ans_file, result, output)
            except Exception as e:
                print(f"Error processing q_uid {q_uid}: {str(e)}")
                # Record the failure with the sentinel answer -1.
                ans_file.write(f'{q_uid}, -1\n')

    # Path of the JSON file that receives the per-sample frame counts.
    file_path = 'frames'

    # Write the frame counts as JSON.
    with open(file_path, 'w') as json_file:
        json.dump(frames, json_file)

def egoschema_dump(ans_file, line, output):
    """Parse the model's answer letter and append ``<q_uid>, <index>`` to the file.

    The letter is searched for in decreasing order of confidence:

    1. a letter at the very start of the output (``C``, ``(C)``, ``C.``),
    2. a parenthesised letter anywhere (``... (C) ...``),
    3. a stand-alone letter anywhere (``Answer: C``),
    4. the first capital A-E anywhere (the previous, loosest behaviour).

    Args:
        ans_file: Writable text file object.
        line: Mapping with at least the key ``q_uid``.
        output: Decoded model output (str).

    Side effects:
        Writes one line to ``ans_file``. The index is 0-4 for A-E, or -1 when
        no letter is found, in which case a notice is also printed.
    """
    q_uid = line['q_uid']
    letters = ['A', 'B', 'C', 'D', 'E']

    # Raw strings avoid the invalid-escape warnings of the old pattern. The
    # ordering stops words like "Answer" or "Best" from supplying the letter
    # when a real answer token is present.
    answer_patterns = (
        r'^\s*\(?([A-E])\)?(?![A-Za-z])',
        r'\(([A-E])\)',
        r'\b([A-E])\b',
        r'([A-E])',
    )

    pred_idx = -1
    for pattern in answer_patterns:
        found = re.search(pattern, output)
        if found is not None:
            pred_idx = letters.index(found.group(1))
            break
    else:
        print(f'The video "{q_uid}" output "{output}" is not in the expected format')

    ans_file.write(f'{q_uid}, {pred_idx}\n')

In [None]:
import sys
import argparse

# Emulate a command line: inside a notebook there is no real argv, so the
# arguments argparse should see are planted in sys.argv by hand.
sys.argv = ['ipykernel_launcher.py'] + [
    '--model-path', 'Qwen/Qwen2-VL-2B-Instruct',
    '--video-folder', 'egoschema/videos',
    '--question-file', 'egoschema/questions.json',
    '--answer-file', 'egoschema/answer-2B-new-set.json',
]

parser = argparse.ArgumentParser()

# Mandatory locations: model checkpoint, videos, questions, answer output.
parser.add_argument('--model-path', required=True)
parser.add_argument('--video-folder', required=True)
parser.add_argument('--question-file', required=True)
parser.add_argument('--answer-file', required=True)

# Optional sharding of the question list across runs.
parser.add_argument("--num-chunks", default=1, type=int)
parser.add_argument("--chunk-idx", default=0, type=int)

# Optional runtime settings.
parser.add_argument("--device", default='cuda:0', type=str, required=False)
parser.add_argument("--batch-size", default=1, type=int)
parser.add_argument("--num-workers", default=8, type=int)

args = parser.parse_args()