# Video QA with VideoBLIP

Load the clip for an arbitrary narrated action from Ego4D.

In [None]:
import json

from pytorchvideo.data.video import VideoPathHandler

# Parse the `fho_main.json` annotation file. JSON text is UTF-8, so the encoding
# is given explicitly instead of relying on the platform default.
with open("../../ego4d/v2/annotations/fho_main.json", encoding="utf-8") as f:
    fho_main = json.load(f)

# Pick one narrated action; the indices are arbitrary.
# The annotation record gets its own name so that `video` below only ever
# refers to the decodable video object.
video_annotation = fho_main["videos"][20]
interval = video_annotation["annotated_intervals"][2]
action = interval["narrated_actions"][4]

print(f'video_uid: {video_annotation["video_uid"]}')
print(f'start_sec: {action["start_sec"]}')
print(f'end_sec: {action["end_sec"]}')
print(f'narration_text: {action["narration_text"]}')

# Open the full-scale video file named after the video's uid and cut out the
# time span of the selected action.
video_path_handler = VideoPathHandler()
video = video_path_handler.video_from_path(
    f"../../ego4d/v2/full_scale/{video_annotation['video_uid']}.mp4"
)
clip = video.get_clip(action["start_sec"], action["end_sec"])

# Fail here with a clear message if no frames came back, rather than letting a
# `None` reach the preprocessing step in a later cell.
if clip["video"] is None:
    raise ValueError(
        f"no video frames decoded for {video_annotation['video_uid']} in "
        f"[{action['start_sec']}, {action['end_sec']}]"
    )


Load `ego-video-blip2-opt-2.7b-subsample-8`.

In [None]:
import torch
from transformers import Blip2Processor

from eilev.model import VideoBlipForConditionalGeneration, process

# Checkpoint directory; both the processor and the model are loaded from it.
pretrained = "../../checkpoints/ego-video-blip2/ego-video-blip2-opt-2.7b-subsample-8"

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

processor = Blip2Processor.from_pretrained(pretrained)

# Load the model first, then move it to the selected device.
model = VideoBlipForConditionalGeneration.from_pretrained(pretrained)
model = model.to(device)


Perform Video QA without frame subsampling.

In [None]:
# Question posed to the model; the later QA cells reuse `prompt`.
prompt = "Question: what is the camera wearer doing? Answer:"

# Preprocess the full clip together with the prompt, then move the result to
# the same device as the model.
inputs = process(processor, images=clip["video"], text=prompt, return_tensors="pt")
inputs = inputs.to(device)
print("inputs:", {name: tensor.size() for name, tensor in inputs.items()})

# Generation only, so gradient tracking is switched off.
with torch.no_grad():
    generated_ids = model.generate(**inputs)

# Decode the generated ids and keep the first sequence, trimmed of surrounding
# whitespace.
decoded_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
generated_text = decoded_texts[0].strip()
print("generated_text:", generated_text)


Now with uniform temporal frame subsampling.

In [None]:
# Keep every 30th frame along the second axis of the clip tensor.
# This is 1 FPS only if the source video is 30 FPS — TODO confirm for this video.
frames = clip["video"][:, ::30, ...]

# Preprocess the subsampled frames with the same prompt as before, then move
# the result to the model's device.
inputs = process(processor, images=frames, text=prompt, return_tensors="pt")
inputs = inputs.to(device)
print("inputs:", {name: tensor.size() for name, tensor in inputs.items()})

# Generation only, so gradient tracking is switched off.
with torch.no_grad():
    generated_ids = model.generate(**inputs)

# Decode the generated ids and keep the first sequence, trimmed of surrounding
# whitespace.
decoded_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
generated_text = decoded_texts[0].strip()
print("generated_text:", generated_text)

# Drop the model reference and ask PyTorch to release its cached CUDA memory.
# NOTE: once `model` is deleted, re-running this cell fails until the model is
# loaded again.
del model
torch.cuda.empty_cache()


Load `ego-video-blip2-flan-t5-xl-subsample-8`.

In [None]:
# Switch to the second checkpoint directory; `processor` and `model` are rebound
# to the objects loaded from it.
pretrained = "../../checkpoints/ego-video-blip2/ego-video-blip2-flan-t5-xl-subsample-8"
processor = Blip2Processor.from_pretrained(pretrained)

# Load the model first, then move it to the device chosen earlier.
model = VideoBlipForConditionalGeneration.from_pretrained(pretrained)
model = model.to(device)


Perform Video QA without frame subsampling.

In [None]:
# Preprocess the full clip with the previously defined `prompt`, then move the
# result to the model's device.
inputs = process(processor, images=clip["video"], text=prompt, return_tensors="pt")
inputs = inputs.to(device)
print("inputs:", {name: tensor.size() for name, tensor in inputs.items()})

# Generation only, so gradient tracking is switched off.
with torch.no_grad():
    generated_ids = model.generate(**inputs)

# Decode the generated ids and keep the first sequence, trimmed of surrounding
# whitespace.
decoded_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
generated_text = decoded_texts[0].strip()
print("generated_text:", generated_text)


Now with uniform temporal frame subsampling.

In [None]:
# Keep every 30th frame along the second axis of the clip tensor.
# This is 1 FPS only if the source video is 30 FPS — TODO confirm for this video.
frames = clip["video"][:, ::30, ...]

# Preprocess the subsampled frames with the previously defined `prompt`, then
# move the result to the model's device.
inputs = process(processor, images=frames, text=prompt, return_tensors="pt")
inputs = inputs.to(device)
print("inputs:", {name: tensor.size() for name, tensor in inputs.items()})

# Generation only, so gradient tracking is switched off.
with torch.no_grad():
    generated_ids = model.generate(**inputs)

# Decode the generated ids and keep the first sequence, trimmed of surrounding
# whitespace.
decoded_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
generated_text = decoded_texts[0].strip()
print("generated_text:", generated_text)

# Drop the model reference and ask PyTorch to release its cached CUDA memory.
# NOTE: once `model` is deleted, re-running this cell fails until the model is
# loaded again.
del model
torch.cuda.empty_cache()
