In [1]:
import os
import re

import torch
from transformers import AutoTokenizer, LlavaNextVideoForConditionalGeneration, LlavaNextVideoProcessor


# Load the fine-tuned checkpoint from the Hugging Face Hub in float16 with
# `device_map="auto"`.
# The access token is read from the HF_TOKEN environment variable instead of
# being written into the notebook; an unset or empty variable becomes None so
# the library falls back to its default credential lookup rather than sending
# an empty token.
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    'hamidra/pie-llava-augmented',
    token=os.environ.get("HF_TOKEN") or None,
    torch_dtype=torch.float16,
    device_map="auto",
)

In [2]:
from transformers import LlavaNextVideoProcessor
# Load the LlavaNextVideoProcessor saved in the local fine-tuning run directory.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere without editing it.
processor = LlavaNextVideoProcessor.from_pretrained("/mnt/esperanto/et/intern/hamidreza/PIE/hamidreza_files/checkpoints/run_aug_dataset")
# Configure the tokenizer to pad on the right-hand side.
processor.tokenizer.padding_side = "right"

In [3]:
# Value passed as `max_length` to `model.generate` by the inference helpers.
# NOTE(review): in Hugging Face `generate`, `max_length` counts the prompt
# tokens as well as the newly generated ones — confirm 128 leaves enough room
# for the answer after the prompt.
MAX_LENGTH = 128

In [4]:
def run_inference(video_clip, model):
    """Ask `model` the fixed crossing-intention question about one video clip.

    The prompt is built from a single user turn (scene description, the
    yes/no question, and a video placeholder) and rendered through the
    module-level `processor`. Generation samples with temperature 1 and is
    capped by the module-level `MAX_LENGTH`.

    Args:
        video_clip: Already-processed video tensor, passed to the model as
            `pixel_values_videos` (moved to the model's device first).
        model: Model exposing `.device` and `.generate`.

    Returns:
        The list of strings produced by `processor.batch_decode` on the
        generated ids (special tokens skipped).
    """
    scene_text = "Above are 16 frames of a driving scenario captured by the ego vehicle camera based on the video taken, in which the pedestrian of interest is located with a red bounding box "
    question_text = "Using these frames, provided context and pedestrian bounding box, and your reasoning, answer the following question only with ‘Yes’ or ‘No’. You may use your knowledge if needed. DO NOT EXPLAIN your reasoning, be confident.\nQuestion: Does the indicated pedestrian intend to cross the intersection in future frames of this video?"

    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": scene_text},
            # Disabled prompt segments, kept for reference:
            #   {"type": "text", "text": "The normalized bounding boxes of the pedestrian in these consequetive 16 frames are provided as follows in 8 lists each containing 4 elements, with format of [x1, y1, x2, y2] in which x1 and y1 are coordinates of top left corner and x2 and y2 are coordinates of the bottom right corner of the bounding box: "},
            #   {"type": "text", "text": ped_bbox + ". "},
            #   {"type": "text", "text": ped_look + ". "},
            #   {"type": "text", "text": ped_action + ". "},
            {"type": "text", "text": question_text},
            {"type": "video"},
        ],
    }

    # add_generation_prompt=True appends the "ASSISTANT: " cue to the prompt.
    chat_prompt = processor.apply_chat_template([user_turn], add_generation_prompt=True)

    # Only the text goes through the processor: the clip is already processed,
    # and passing it again to the processor causes errors.
    text_inputs = processor(text=chat_prompt, videos=None, return_tensors="pt").to(model.device)

    generation_kwargs = dict(
        pixel_values_videos=video_clip.to(model.device),
        max_length=MAX_LENGTH,
        do_sample=True,
        temperature=1,
    )
    output_ids = model.generate(**text_inputs, **generation_kwargs)
    return processor.batch_decode(output_ids, skip_special_tokens=True)

In [24]:
def run_inference_2(video_clip, context, model):
    """Ask `model` the crossing-intention question with caller-supplied context.

    Same flow as a plain prompt-and-generate call: one user turn made of
    `context`, the fixed yes/no question and a video placeholder is rendered
    through the module-level `processor`, then the model samples a reply
    capped by the module-level `MAX_LENGTH` (no explicit temperature is set).

    Args:
        video_clip: Already-processed video tensor, passed to the model as
            `pixel_values_videos` (moved to the model's device first).
        context: Text placed before the question in the user turn.
        model: Model exposing `.device` and `.generate`.

    Returns:
        The list of strings produced by `processor.batch_decode` on the
        generated ids (special tokens skipped).
    """
    question_text = "Using these frames, provided context, and your reasoning, answer the following question only with ‘Yes’ or ‘No’. You may use your knowledge if needed. DO NOT EXPLAIN your reasoning, be confident.\nQuestion: Does the indicated pedestrian intend to cross the intersection in future frames of this video?"

    message_parts = [
        {"type": "text", "text": context},
        {"type": "text", "text": question_text},
        {"type": "video"},
    ]
    dialogue = [{"role": "user", "content": message_parts}]

    # add_generation_prompt=True appends the "ASSISTANT: " cue to the prompt.
    rendered_prompt = processor.apply_chat_template(dialogue, add_generation_prompt=True)

    # Tokenize the text only: the clip is already processed, and passing it
    # again to the processor causes errors.
    encoded = processor(
        text=rendered_prompt,
        videos=None,
        return_tensors="pt",
    )
    encoded = encoded.to(model.device)
    clip_on_device = video_clip.to(model.device)

    sampled_ids = model.generate(
        **encoded,
        pixel_values_videos=clip_on_device,
        max_length=MAX_LENGTH,
        do_sample=True,
    )
    return processor.batch_decode(sampled_ids, skip_special_tokens=True)

In [7]:
# Show the current working directory; the dataset paths loaded further down
# are relative and are resolved against it.
!pwd

/mnt/esperanto/et/intern/hamidreza/PIE


In [3]:
from datasets import load_from_disk
from datasets import load_dataset, concatenate_datasets

# Load the two on-disk validation splits (presumably positive / negative
# samples, judging by the `_p` / `_n` suffixes — TODO confirm), then merge them
# into one dataset that returns torch tensors.
val_dataset_p, val_dataset_n = (
    load_from_disk(split_dir)
    for split_dir in (
        'hamidreza_files/hf_datasets_16frames/val_p/original',
        'hamidreza_files/hf_datasets_16frames/val_n/original',
    )
)
val_dataset = concatenate_datasets([val_dataset_p, val_dataset_n]).with_format("torch")


In [6]:
# Take the first validation example and display the shape of its video tensor.
inf_example = val_dataset[0]
inf_example["pixel_values_videos"].shape

torch.Size([1, 16, 3, 336, 336])

In [7]:
# Decode the example's token ids and print the second-to-last space-separated
# piece of the first decoded string; the evaluation loop uses this same
# expression to obtain its ground-truth label.
print(processor.batch_decode(inf_example["input_ids"])[0].split(' ')[-2])

Yes


In [6]:
from tqdm import tqdm

def unsqueeze_pixel_values(example):
    """Give every field of `example` a leading dimension of size 1.

    Despite the name, all keys are visited, not only the pixel values. A
    field whose first dimension is already 1 is left untouched; any other
    field gets a new leading axis via `unsqueeze(0)`.

    Args:
        example: Mapping from field name to a tensor-like value exposing
            `.shape` and `.unsqueeze`.

    Returns:
        The same `example` object, modified in place.
    """
    for name in example.keys():
        field = example[name]
        if field.shape[0] != 1:
            example[name] = field.unsqueeze(0)
    return example

# Apply the leading-dimension fix to every validation example.
# Note: this rebinds `val_dataset` to the mapped dataset.
val_dataset = val_dataset.map(unsqueeze_pixel_values)

Map: 100%|██████████| 243/243 [01:22<00:00,  2.96 examples/s]


In [22]:
# xp = load_from_disk('hamidreza_files/hf_datasets_16frames/test_p/original')
# xn = load_from_disk('hamidreza_files/hf_datasets_16frames/test_n/original')
# x = concatenate_datasets([xp, xn]).with_format("torch")

In [13]:
# Load the public LLaVA-NeXT-Video 7B checkpoint in float16 with
# `device_map="auto"`.
# NOTE(review): presumably the non-fine-tuned baseline mentioned in the results
# notes at the end of the notebook — confirm; it is not used in the cells shown.
old_model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    "llava-hf/LLaVa-NeXT-Video-7b-hf",
    torch_dtype=torch.float16,
    device_map="auto",
)

Unrecognized keys in `rope_scaling` for 'rope_type'='linear': {'type'}
Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.14s/it]


In [7]:
# prev_model = LlavaNextVideoForConditionalGeneration.from_pretrained(
#     'hamidra/pie-llava-2',
#     token="",
#     torch_dtype=torch.float16,
#     device_map="auto",
# )

Unrecognized keys in `rope_scaling` for 'rope_type'='linear': {'type'}
Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.09s/it]


In [8]:
from tqdm import tqdm
# Evaluate the fine-tuned model on every validation example and collect
# [model_answer, ground_truth] pairs for the metric helpers.
# NOTE(review): the model was loaded with device_map="auto"; this explicit move
# to "cuda" may be redundant — confirm before removing.
model = model.to("cuda")
ft_model_ans_list = []

for i in tqdm(range(len(val_dataset))):
    # Fetch the example once: each `val_dataset[i]` access loads and formats
    # the whole row, including the video tensor.
    example = val_dataset[i]
    # Ground truth is the second-to-last space-separated piece of the decoded
    # training text.
    gt = processor.batch_decode(example["input_ids"])[0].split(' ')[-2]
    pixel_vals = example["pixel_values_videos"]
    gen_ans = run_inference(pixel_vals, model)
    # Keep only the text after the assistant cue and trim surrounding
    # whitespace (the previous `.strip('')` stripped nothing).
    bin_ans = gen_ans[0].split('ASSISTANT:')[1].strip()
    ft_model_ans_list.append([bin_ans, gt])


  0%|          | 0/243 [00:00<?, ?it/s]

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
100%|██████████| 243/243 [02:57<00:00,  1.37it/s]


In [16]:
import json

def get_llavaNextVideo_perf(pred_list, ped_dir):
    """Print precision, recall, F1 and accuracy of yes/no answers vs. a JSON ground truth.

    Each prediction is reduced to its first standalone "yes" or "no" word
    (case-insensitive, whole-word match), so words such as "know" or "not"
    are not mistaken for "no" and an answer is never counted in two
    confusion-matrix cells. Predictions with neither word only lower the
    accuracy.

    Args:
        pred_list: Sequence of answer strings, in the same order as the
            entries of the JSON file.
        ped_dir: Path to a JSON file containing a dict whose values are
            indexable, with the ground-truth label (0 or 1) at index 1.

    Prints, in order: ``len(pred_list)``, the number of ground-truth entries,
    precision, recall, ``"F1 score: <f1>"`` and accuracy
    (``(tp + tn) / len(pred_list)``). A ratio with a zero denominator is
    reported as 0.0 instead of raising ``ZeroDivisionError``.

    Raises:
        IndexError: if `pred_list` has fewer items than the JSON file.
    """
    with open(ped_dir, 'r') as json_file:
        ped_intention_action_dict = json.load(json_file)
    print(len(pred_list))
    print(len(ped_intention_action_dict))

    answer_re = re.compile(r'\b(yes|no)\b')

    def safe_div(numerator, denominator):
        # Undefined ratios (nothing to divide by) are reported as 0.0.
        return numerator / denominator if denominator else 0.0

    tp = fp = tn = fn = 0
    for idx, gt_entry in enumerate(ped_intention_action_dict.values()):
        gt_intention_action = int(gt_entry[1])
        found = answer_re.search(pred_list[idx].lower())
        if found is None:
            # No recognisable answer: contributes to no confusion-matrix cell.
            continue
        said_yes = found.group(1) == 'yes'

        if said_yes and gt_intention_action == 1:
            tp += 1
        elif said_yes and gt_intention_action == 0:
            fp += 1
        elif not said_yes and gt_intention_action == 0:
            tn += 1
        elif not said_yes and gt_intention_action == 1:
            fn += 1

    precision = safe_div(tp, tp + fp)
    recall = safe_div(tp, tp + fn)
    f1 = safe_div(2 * precision * recall, precision + recall)
    print(precision)
    print(recall)
    print('F1 score: ' + str(f1))
    # Accuracy: correct answers (tp + tn) over all predictions supplied.
    print(safe_div(tp + tn, len(pred_list)))

In [10]:
import json

def get_llavaNextVideo_perf_2(pred_list):
    """Print confusion counts, precision, recall, F1 and accuracy for answer pairs.

    Both the model answer and the ground truth are reduced to their first
    standalone "yes" or "no" word (case-insensitive, whole-word match), so
    words such as "know" or "not" are not mistaken for "no" and a pair is
    never counted in two confusion-matrix cells. Pairs where either side has
    no such word only lower the accuracy.

    Args:
        pred_list: Sequence of ``[model_answer, ground_truth]`` string pairs.

    Prints, in order: ``len(pred_list)``, true positives, false positives,
    precision, recall, ``"F1 score: <f1>"`` and accuracy
    (``(tp + tn) / len(pred_list)``). A ratio with a zero denominator is
    reported as 0.0 instead of raising ``ZeroDivisionError``.
    """
    print(len(pred_list))

    answer_re = re.compile(r'\b(yes|no)\b')

    def first_answer(text):
        # Return 'yes', 'no', or None when the text contains neither word.
        found = answer_re.search(text.lower())
        return found.group(1) if found else None

    def safe_div(numerator, denominator):
        # Undefined ratios (nothing to divide by) are reported as 0.0.
        return numerator / denominator if denominator else 0.0

    tp = fp = tn = fn = 0
    for pair in pred_list:
        llm_answer = first_answer(pair[0])
        gt_answer = first_answer(pair[1])
        if llm_answer is None or gt_answer is None:
            continue

        if llm_answer == 'yes' and gt_answer == 'yes':
            tp += 1
        elif llm_answer == 'yes':
            fp += 1
        elif gt_answer == 'no':
            tn += 1
        else:
            fn += 1

    print(tp)
    print(fp)
    precision = safe_div(tp, tp + fp)
    recall = safe_div(tp, tp + fn)
    f1 = safe_div(2 * precision * recall, precision + recall)
    print(precision)
    print(recall)
    print('F1 score: ' + str(f1))
    # Accuracy: correct answers (tp + tn) over all pairs supplied.
    print(safe_div(tp + tn, len(pred_list)))

In [11]:
# Disabled sanity check: build a constant all-'yes' list of the same length.
# tst = ['yes' for i in range(len(ft_model_ans_list))]
# Print the metrics for the fine-tuned model's [answer, ground_truth] pairs.
get_llavaNextVideo_perf_2(ft_model_ans_list)

243
155
26
0.856353591160221
0.856353591160221
F1 score: 0.856353591160221
0.7860082304526749


In [None]:
# base LLaVA-NeXT-Video (no fine-tuning): acc: 64.6 (temperature: 1)
# fine-tuned on the original data: acc: 67.07 (temperature: 1)
# fine-tuned on the augmented dataset: acc: 75.9 (temperature: 1)