LLaVA-NeXT (Hugging Face Transformers) — video frame captioning:

In [8]:
import cv2
import os
import time
import requests
import torch
import warnings
import logging
from tqdm import tqdm
from PIL import Image
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration, BitsAndBytesConfig

import warnings

# Ask the warnings module to ignore the pad-token notice printed during generation.
# NOTE(review): transformers appears to emit this message through its logger rather
# than warnings.warn, so this filter may have no effect — confirm.
warnings.filterwarnings(
    action="ignore",
    message="Setting `pad_token_id` to `eos_token_id`:None for open-end generation.",
)

# Timestamped INFO-level logging, used for the end-of-run summary
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Frame budget for the run; None means the entire video is processed
max_frames = None

# Input clip (alternative sample: 'data/videos_to_process/november_leaves.mp4')
video_path = 'data/videos_to_process/hot_air_balloons.mp4'

# Where the captioned copy of the clip is written
output_video_path = 'data/output_captioned_video_hot_air_balloons.mp4'

# Checkpoint shared by the processor and the model so the two cannot drift apart
model_id = "llava-hf/llava-v1.6-mistral-7b-hf"

# Load processor (builds the text/image inputs the model consumes)
processor = LlavaNextProcessor.from_pretrained(model_id)

# Specify quantization config to load model in 4-bit format (NF4 weights, float16 compute)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load model with 4-bit quantization and Flash Attention 2.
# `attn_implementation` is the supported spelling; the older
# `use_flash_attention_2=True` flag is deprecated in transformers.
model = LlavaNextForConditionalGeneration.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    attn_implementation="flash_attention_2",
    device_map="auto",
)
from IPython.display import clear_output

def generate_caption(frame):
    """Ask the LLaVA-NeXT model to describe a single video frame.

    Args:
        frame: Image array in OpenCV's BGR channel order; it is converted to
            RGB before being handed to the processor.

    Returns:
        The model's answer to "What is shown in this image?" as a stripped
        string, limited to 100 newly generated tokens.

    Side effects:
        Clears the current notebook cell output and prints the new caption.
    """
    image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "What is shown in this image?"},
            ],
        }
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    # Pass inputs by keyword: the positional (text, image) order is deprecated
    # and only works because the processor swaps the arguments with a warning.
    # Follow the model's own device instead of assuming "cuda:0", since the
    # model is placed with device_map="auto".
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)

    output = model.generate(**inputs, max_new_tokens=100)

    # The generated sequence starts with the prompt tokens; decode only what
    # follows them instead of stripping the prompt text back out of the string.
    prompt_length = inputs["input_ids"].shape[1]
    caption = processor.decode(output[0][prompt_length:], skip_special_tokens=True).strip()

    # Clear previous captions
    clear_output(wait=True)

    # Print new caption
    print(f"Caption received: {caption}")

    return caption

def overlay_caption(frame, caption, position=(50, 50), font_scale=1, color=(255, 255, 255), thickness=2,
                    words_per_line=15):
    """Draw a caption onto a frame as stacked lines of text on black boxes.

    The caption is split on whitespace into lines of at most
    ``words_per_line`` words. Each line is drawn with a filled black
    rectangle behind it. The frame is modified in place.

    Args:
        frame: Image array to draw on (mutated in place).
        caption: Text to render; an empty caption draws nothing.
        position: ``(x, y)`` of the first line's text origin (bottom-left of
            the text, on its baseline).
        font_scale: Scale factor passed to OpenCV's text routines.
        color: Text color tuple passed to ``cv2.putText``.
        thickness: Stroke thickness of the text.
        words_per_line: Maximum number of words per rendered line.

    Returns:
        The same ``frame`` object, with the caption drawn on it.

    Raises:
        ValueError: If ``words_per_line`` is smaller than 1.
    """
    if words_per_line < 1:
        raise ValueError("words_per_line must be at least 1")

    font = cv2.FONT_HERSHEY_SIMPLEX
    padding = 5  # Margin in pixels between the text and the edge of its box

    # Split the caption into chunks of `words_per_line` words each
    words = caption.split()
    lines = [' '.join(words[i:i + words_per_line]) for i in range(0, len(words), words_per_line)]

    x, y = position
    for line in lines:
        # Calculate text size for each line; `baseline` is how far the text
        # extends below the origin's y-coordinate (descenders such as g, p, y)
        (text_width, text_height), baseline = cv2.getTextSize(line, font, font_scale, thickness)

        # Draw background rectangle for better visibility. The bottom edge
        # includes the baseline so descenders stay inside the box.
        cv2.rectangle(frame,
                      (x - padding, y - text_height - padding),
                      (x + text_width + padding, y + baseline + padding),
                      (0, 0, 0), -1)

        # Put the text on the frame
        cv2.putText(frame, line, (x, y), font, font_scale, color, thickness, cv2.LINE_AA)

        # Advance by the full box height so the next line's rectangle starts
        # where this one ends instead of painting over this line's descenders
        y += text_height + baseline + 2 * padding

    return frame

# Load video using OpenCV and fail early if the file cannot be opened
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    raise IOError(f"Could not open video file: {video_path}")

out = None
frame_count = 0

try:
    # Get video properties
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    # Keep the frame rate as a float: truncating e.g. 29.97 to 29 would make
    # the output play at a different speed and duration than the source
    fps = cap.get(cv2.CAP_PROP_FPS)
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))
    if not out.isOpened():
        raise IOError(f"Could not open output video for writing: {output_video_path}")

    start_time = time.time()

    # Number of frames the progress bar should expect
    source_frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    total_frames = source_frame_count if max_frames is None else min(source_frame_count, max_frames)

    # Process each frame of the video
    with tqdm(total=total_frames, desc="Processing Frames", unit="frame") as pbar:
        # Check the frame budget before reading so no frame is decoded and discarded
        while max_frames is None or frame_count < max_frames:
            ret, frame = cap.read()
            if not ret:
                break

            # Generate caption for the frame
            caption = generate_caption(frame)

            # Overlay the caption on the frame
            frame_with_caption = overlay_caption(frame, caption)

            # Write the frame to the output video
            out.write(frame_with_caption)

            frame_count += 1
            pbar.update(1)
finally:
    # Release the capture and the writer even if captioning fails midway,
    # so the partially written output file is properly closed
    cap.release()
    if out is not None:
        out.release()

end_time = time.time()

total_time = end_time - start_time

logging.info("Video has been created at %s", output_video_path)
logging.info("Total Processing Time: %.2f seconds", total_time)


Processing Frames: 100%|██████████| 464/464 [13:19<00:00,  1.72s/frame]
2024-12-20 00:08:49,036 - INFO - Video has been created at data/output_captioned_video_hot_air_balloons.mp4
2024-12-20 00:08:49,036 - INFO - Total Processing Time: 799.88 seconds


Caption received: The image shows a vintage Volkswagen van parked on a rocky surface with a scenic landscape in the background. The van is red and white, and it appears to be a classic model, possibly from the 1960s or 1970s. Above the van, there are several hot air balloons floating in the sky, suggesting that the location might be a popular spot for ballooning or that the photo was taken during a ballooning event. The


Test: single-image inference

In [1]:
import time

import requests
import torch
from PIL import Image
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration, BitsAndBytesConfig

# Checkpoint shared by the processor and the model so the two cannot drift apart
model_id = "llava-hf/llava-v1.6-mistral-7b-hf"

# Load processor (builds the text/image inputs the model consumes)
processor = LlavaNextProcessor.from_pretrained(model_id)

# Specify quantization config to load model in 4-bit format (NF4 weights, float16 compute)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load model with 4-bit quantization and Flash Attention 2.
# `attn_implementation` is the supported spelling; the older
# `use_flash_attention_2=True` flag is deprecated in transformers.
model = LlavaNextForConditionalGeneration.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    attn_implementation="flash_attention_2",
    device_map="auto",
)

# Prepare image and text prompt
url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"
# Bound the download time and fail on HTTP errors, so a bad response surfaces
# here rather than as an image-decoding error
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "What is shown in this image?"},
        ],
    },
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
# Pass inputs by keyword: the positional (text, image) order is deprecated and
# only works because the processor swaps the arguments with a warning.
# Follow the model's own device instead of assuming "cuda:0", since the model
# is placed with device_map="auto".
inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)

# Measure inference time (generation only; download and preprocessing are excluded)
start_time = time.time()

# Autoregressively complete prompt
output = model.generate(**inputs, max_new_tokens=100)

end_time = time.time()

# Decode and print output (the full sequence, so the prompt is included)
print(processor.decode(output[0], skip_special_tokens=True))

# Print the time taken for inference
print(f"Inference Time: {end_time - start_time:.2f} seconds")


2024-12-19 23:38:06.990887: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-19 23:38:06.999007: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1734647887.009118  956630 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734647887.012010  956630 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-19 23:38:07.022004: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

You may have used the wrong order for inputs. `images` should be passed before `text`. The `images` and `text` inputs will be swapped. This behavior will be deprecated in transformers v4.47.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.


[INST]  
What is shown in this image? [/INST] The image appears to be a radar chart, also known as a spider chart, which is a type of two-dimensional chart of three or more quantitative variables represented on axes starting from the same point. This particular chart is showing the performance of a model across various metrics, which are likely related to machine learning or artificial intelligence.

The axes represent different metrics, such as "MMM-Vet," "MMM-Vet," "MMM-Vet,"
Inference Time: 2.35 seconds
