In [1]:
import torch
from transformers import LlavaProcessor, LlavaForConditionalGeneration
from PIL import Image
import os

model_id = "llava-hf/llava-1.5-7b-hf"
processor = LlavaProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto"
)

frame_dir = '/content/drive/MyDrive/intern output/Vid 2'
caption_file = os.path.join(frame_dir, 'captions_llava.txt')

with open(caption_file, 'w') as f:
    for filename in sorted(os.listdir(frame_dir)):
        if filename.endswith('.jpg'):
            image_path = os.path.join(frame_dir, filename)
            image = Image.open(image_path).convert("RGB")

            prompt = "<image>\nDescribe this image in detail."
            inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda", torch.float16)

            output = model.generate(**inputs, max_new_tokens=100)
            caption = processor.batch_decode(output, skip_special_tokens=True)[0]

            print(f"{filename}: {caption}")
            f.write(f"{filename}: {caption}\n")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.62M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/674 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/950 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/70.1k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

frame0.jpg: 
Describe this image in detail.

The image shows a person cutting a piece of food, possibly a cake, using a knife. The person is standing at a table, and the knife is positioned in the middle of the food. The person is wearing a ring on their finger, which is visible as they cut the food. The scene appears to be a close-up of the person and the food, capturing the action of cutting the food.
frame1.jpg: 
Describe this image in detail.

The image shows a person wearing a ring on their finger, using a spoon to scoop up some white substance, possibly ice cream or whipped cream. The person is standing in front of a table, and the spoon is positioned close to the center of the image. The scene appears to be a casual, everyday moment captured in the photo.
frame10.jpg: 
Describe this image in detail.

The image shows a person cutting a piece of food on a cutting board. The person is using a knife to cut the food, which appears to be a piece of bread. The knife is positioned towar

In [6]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [8]:
import re
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
nltk.download('punkt')
nltk.download('stopwords')

with open("/content/drive/MyDrive/intern output/Vid 2/captions_llava.txt", "r") as file:
    text = file.read()

descriptions = []
for line in text.splitlines():
    if re.match(r'^frame\d+\.jpg:', line) or "describe this image" in line.lower():
        continue
    line = line.strip()
    if len(line) > 20:
        descriptions.append(line)

full_text = " ".join(descriptions)

stop_words = set(stopwords.words("english"))
words = word_tokenize(full_text.lower())
filtered_words = [w for w in words if w.isalpha() and w not in stop_words]

freq = Counter(filtered_words)

top_keywords = [word for word, count in freq.most_common(30) if count > 1]

sentences = sent_tokenize(full_text)

important_sentences = [s for s in sentences if any(kw in s.lower() for kw in top_keywords)]
unique_sentences = list(dict.fromkeys(important_sentences))

summary_paragraph = " ".join(unique_sentences[:8])

print("=== Generalized Auto-Generated Summary ===\n")
print(summary_paragraph)


=== Generalized Auto-Generated Summary ===

The image shows a person cutting a piece of food, possibly a cake, using a knife. The person is standing at a table, and the knife is positioned in the middle of the food. The person is wearing a ring on their finger, which is visible as they cut the food. The scene appears to be a close-up of the person and the food, capturing the action of cutting the food. The image shows a person wearing a ring on their finger, using a spoon to scoop up some white substance, possibly ice cream or whipped cream. The person is standing in front of a table, and the spoon is positioned close to the center of the image. The scene appears to be a casual, everyday moment captured in the photo. The image shows a person cutting a piece of food on a cutting board.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
