In [1]:
import json
load_path = "/content/frame_descriptions.json"

with open(load_path, "r", encoding="utf-8") as f:
    frame_descriptions = json.load(f)

print(f"‚úÖ Loaded {len(frame_descriptions)} frame descriptions from:", load_path)
print("Example entry:", frame_descriptions[0])

‚úÖ Loaded 37 frame descriptions from: /content/frame_descriptions.json
Example entry: {'frame_index': 0, 'description': "In this office environment, I can identify several objects that you might retrieve easily:\n\n1. Laptops: There are two laptops visible on the desk. One is open with a white screen, and the other is closed.\n\n2. Keyboards: There are two keyboards on the desk, one in front of each laptop.\n\n3. Monitors: There are two computer monitors on the desk, one for each laptop.\n\n4. Computer mice: There are two computer mice on the desk, one near each keyboard.\n\n5. Papers: There are some papers visible on the desk, likely for reference or note-taking.\n\n6. Black bag: A black bag is visible on the floor to the right of the desk.\n\n7. Black chair: A black chair is positioned in front of the desk.\n\n8. Black backpack: A black backpack is visible on the floor to the right of the desk.\n\n9. Green plant: There's a green plant on the desk, adding a touch of nature to the wor

In [1]:
import requests
from PIL import Image
from io import BytesIO
from transformers import AutoProcessor, AutoModelForImageTextToText, TextStreamer

from moviepy.editor import VideoFileClip

video_path = "/content/IMG_5599.MOV"   # ‚Üê correct file path

try:
    video_clip = VideoFileClip(video_path)
    print("üéâ Loaded successfully!")
    print("Duration:", video_clip.duration, "seconds")
    print("FPS:", video_clip.fps)
except Exception as e:
    print("‚ùå ERROR:", e)

üéâ Loaded successfully!
Duration: 37.69 seconds
FPS: 29.97002997002997


In [2]:
from PIL import Image
from transformers import AutoProcessor, AutoModelForImageTextToText
import json

# Load vision model
model_id = "LiquidAI/LFM2-VL-1.6B"
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype="bfloat16",
    trust_remote_code=True
)
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

# Sample frames
sampled_frames = []
print(f"Sampling frames from video of duration: {video_clip.duration} seconds...")

for t in range(0, int(video_clip.duration), 1):
    try:
        frame_array = video_clip.get_frame(t)
        pil_image = Image.fromarray(frame_array)
        sampled_frames.append((t, pil_image))
    except Exception as e:
        print(f"Could not extract frame at time {t}s: {e}")

print(f"Total frames sampled: {len(sampled_frames)}")

# Extract simple descriptions
frame_descriptions = []

for i, (timestamp, frame) in enumerate(sampled_frames):
    print(f"Processing frame {i+1}/{len(sampled_frames)}...")

    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": frame},
                {"type": "text", "text": "List every object you see that someone might need to find later (phone, wallet, keys, glasses, bag, book, laptop, remote, etc.). For each object, describe its exact location. Be specific about position (left/right/center, on table/floor/couch, etc.)."},
            ],
        },
    ]

    # FIXED: Vision params go HERE in apply_chat_template
    inputs = processor.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
        tokenize=True,
        # Vision-specific parameters
        min_image_tokens=64,
        max_image_tokens=256,
        do_image_splitting=True
    ).to(model.device)

    # FIXED: Only generation params here
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=False  # Greedy decoding for consistent output
    )

    generated_text = processor.decode(
        outputs[0][inputs['input_ids'].shape[1]:],
        skip_special_tokens=True
    ).strip()

    frame_descriptions.append({
        "timestamp": timestamp,
        "description": generated_text
    })

    print(f"Frame {i+1}: {generated_text}\n")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.

`torch_dtype` is deprecated! Use `dtype` instead!


Sampling frames from video of duration: 37.69 seconds...
Total frames sampled: 37
Processing frame 1/37...
Frame 1: Here's a list of objects someone might need to find later, along with their exact locations:

1. Phone: Not visible in the image, but likely on the desk or in a nearby drawer.

2. Wallet: Not visible in the image, but likely in a wallet or purse on the desk or floor.

3. Keys: Not visible in the image, but likely in a key holder or on the desk.

4. Glasses: Not visible in the image, but likely in a desk organizer or on the desk.

5. Bag: Not visible in the image, but likely in a desk organizer or on the desk.

6. Laptop: On the desk in the foreground, with its screen facing the camera.

7. Remote: Not visible in the image, but likely on the desk or in a nearby drawer.

8. Papers: Scattered on the desk, possibly in a folder or on a notepad.

9. Computer mouse: On the desk, next to the keyboard.

10. Computer keyboard: On the desk, next to the mouse.

11. Computer monitor: 

In [3]:
frame_descriptions = frame_descriptions[:36]
# Save descriptions
with open('vision_observations.json', 'w') as f:
    json.dump(frame_descriptions, f, indent=2)

print("‚úì Saved vision observations to vision_observations.json")

‚úì Saved vision observations to vision_observations.json


In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import json
import re

# Load text model (LFM2-2.6B)
model_id = "LiquidAI/LFM2-2.6B"
text_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype="bfloat16"
)
text_tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load vision observations
with open('vision_observations.json', 'r') as f:
    observations = json.load(f)

# Calculate video length (last frame timestamp)
video_length = max(obs['timestamp'] for obs in observations)

def format_time_ago(seconds):
    """Convert seconds to human-friendly 'X minutes ago' or 'X seconds ago'"""
    if seconds < 60:
        return f"{seconds} seconds ago"
    elif seconds < 120:
        return "about 1 minute ago"
    else:
        minutes = seconds // 60
        return f"about {minutes} minutes ago"

def find_object(query):
    """Find an object using RAG approach with timestamps"""

    # Extract object name from query
    object_name = query.lower()
    for phrase in ["where is my ", "where did i put my ", "where's my ", "find my ", "where are my "]:
        object_name = object_name.replace(phrase, "")
    object_name = object_name.replace("?", "").strip()

    # Build context from ALL observations that mention the object
    relevant_observations = []
    for obs in observations:
        desc = obs['description'].lower()
        # Check if this observation mentions the object
        if object_name in desc or object_name[:-1] in desc:  # handles singular/plural
            # Calculate how long ago this was
            time_ago_seconds = video_length - obs['timestamp']
            time_ago_text = format_time_ago(time_ago_seconds)

            relevant_observations.append({
                'timestamp': obs['timestamp'],
                'time_ago_seconds': time_ago_seconds,
                'time_ago_text': time_ago_text,
                'description': obs['description']
            })

    if not relevant_observations:
        return {
            "object": object_name,
            "found": False,
            "response": f"I couldn't find your {object_name} in any of the camera footage."
        }

    # Sort by most recent (closest to end of video)
    relevant_observations.sort(key=lambda x: x['time_ago_seconds'])

    # Get the most recent observation
    most_recent = relevant_observations[0]

    # Build context with timestamps
    context_parts = []
    for i, obs in enumerate(relevant_observations[:5]):  # Max 5 most relevant
        context_parts.append(
            f"Observation from {obs['time_ago_text']}: {obs['description']}"
        )
    context = "\n\n".join(context_parts)

    # Simple prompt
    conversation = [
        {
            "role": "system",
            "content": "You are a helpful assistant that answers questions about object locations. Be concise and specific."
        },
        {
            "role": "user",
            "content": f"""Question: Where is the {object_name}?

Recent camera observations:
{context}

Based on the most recent observation, tell me in ONE sentence where the {object_name} is located."""
        }
    ]

    # Generate
    input_ids = text_tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt",
        tokenize=True
    ).to(text_model.device)

    outputs = text_model.generate(
        input_ids,
        do_sample=True,
        temperature=0.3,
        max_new_tokens=100,
        eos_token_id=text_tokenizer.eos_token_id
    )

    response = text_tokenizer.decode(
        outputs[0][input_ids.shape[1]:],
        skip_special_tokens=True
    ).strip()

    # Clean up response - take first sentence
    response = response.split('.')[0] + '.'
    response = response.replace('\n', ' ')

    # Add time information to response
    final_response = f"{response} (Last seen {most_recent['time_ago_text']})"

    return {
        "object": object_name,
        "found": True,
        "location": response,
        "last_seen": most_recent['time_ago_text'],
        "response": final_response,
        "num_observations": len(relevant_observations)
    }

# Test it
print("\n=== Object Finder with Timestamps ===\n")

queries = [
    "Where is my wallet?",
    "Where did I put my keys?",
    "Where is my phone?",
    "Where are my glasses?",
    "Where is my bag?"
]

for query in queries:
    result = find_object(query)

    print(f"Q: {query}")
    if result['found']:
        print(f"A: {result['response']}")
        print(f"   (Seen in {result['num_observations']} different moments)\n")
    else:
        print(f"A: {result['response']}\n")

# Voice-friendly version
def speak_result(result):
    """Convert to natural speech for blind users"""
    if result['found']:
        return f"Your {result['object']} is {result['location']}. I last saw it {result['last_seen']}."
    else:
        return f"I couldn't find your {result['object']} in the footage."

print("\n=== Voice Mode (Natural Speech) ===\n")

for query in queries:
    result = find_object(query)
    speech = speak_result(result)
    print(f"Q: {query}")
    print(f"üîä {speech}\n")


=== Object Finder with Timestamps ===

Q: Where is my wallet?
A: The wallet is located on the left side of the room on a white table. (Last seen 0 seconds ago)
   (Seen in 35 different moments)

Q: Where did I put my keys?
A: According to the most recent observation, the keys are located on a white table on the left side of the room. (Last seen 0 seconds ago)
   (Seen in 35 different moments)

Q: Where is my phone?
A: The phone is located on the left side of the room, on a white table. (Last seen 0 seconds ago)
   (Seen in 35 different moments)

Q: Where are my glasses?
A: The glasses are located on a white table in the center of the room. (Last seen 0 seconds ago)
   (Seen in 34 different moments)

Q: Where is my bag?
A: The bag is located on the floor under the table, to the left of the image. (Last seen 0 seconds ago)
   (Seen in 35 different moments)


=== Voice Mode (Natural Speech) ===

Q: Where is my wallet?
üîä Your wallet is The wallet is located on the left side of the room

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import json

# --- Load text model (LFM2-2.6B) ---
model_id = "LiquidAI/LFM2-2.6B"
text_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype="bfloat16"
)
text_tokenizer = AutoTokenizer.from_pretrained(model_id)

# --- Load vision observations (frames, in time order) ---
with open('vision_observations.json', 'r') as f:
    observations = json.load(f)

TOTAL_FRAMES = len(observations)  # frames 0 .. TOTAL_FRAMES-1


def format_time_ago(seconds: int) -> str:
    """Convert seconds to human-friendly 'X seconds ago' / 'about Y minutes ago'."""
    if seconds <= 0:
        return "just now"
    if seconds < 60:
        return f"{int(seconds)} seconds ago"
    if seconds < 120:
        return "about 1 minute ago"
    minutes = int(seconds // 60)
    return f"about {minutes} minutes ago"


def clean_object_name(query: str) -> str:
    """Strip 'where is my', question marks, etc."""
    object_name = query.lower()
    for phrase in [
        "where is my ",
        "where did i put my ",
        "where's my ",
        "find my ",
        "where are my "
    ]:
        object_name = object_name.replace(phrase, "")
    object_name = object_name.replace("?", "").strip()
    return object_name


def find_object(query: str):
    """Find an object using ONLY frame indices (no timestamp shenanigans)."""

    object_name = clean_object_name(query)

    relevant_observations = []

    # Iterate with frame index so we know EXACTLY when this was in the video
    for frame_idx, obs in enumerate(observations):
        desc = obs["description"].lower()

        # Skip negatives like "not visible", "not found", "not seen"
        if (object_name in desc or object_name[:-1] in desc) and \
           "not visible" not in desc and \
           "not found" not in desc and \
           "not seen" not in desc:

            frames_ago = (TOTAL_FRAMES - 1) - frame_idx       # 0 = last frame
            seconds_ago = frames_ago                          # 1 frame per second
            time_ago_text = format_time_ago(seconds_ago)

            relevant_observations.append({
                "frame_index": frame_idx,
                "seconds_ago": seconds_ago,
                "time_ago_text": time_ago_text,
                "description": obs["description"],
            })

    if not relevant_observations:
        return {
            "object": object_name,
            "found": False,
            "response": f"I couldn't find your {object_name} in any of the camera footage."
        }

    # Sort by frame index descending ‚Üí latest frame where object was ACTUALLY seen
    relevant_observations.sort(key=lambda x: x["frame_index"], reverse=True)

    most_recent = relevant_observations[0]

    # Recompute time_ago_text in case you ever change TOTAL_FRAMES or sampling rate
    frames_ago = (TOTAL_FRAMES - 1) - most_recent["frame_index"]
    seconds_ago = frames_ago
    last_seen_text = format_time_ago(seconds_ago)

    # Build context using top few most recent sightings
    context_parts = []
    for obs in relevant_observations[:5]:
        context_parts.append(
            f"{obs['time_ago_text']}: {obs['description']}"
        )
    context = "\n\n".join(context_parts)

    # Prompt to LFM2-2.6B
    conversation = [
        {
            "role": "system",
            "content": (
                "You are a helpful assistant that answers questions about "
                "where objects are located based on camera observations. "
                "Be concise and specific."
            ),
        },
        {
            "role": "user",
            "content": f"""Question: Where is the {object_name}?

Most recent camera observations (newest first):
{context}

Based on the most recent observation, tell me in ONE clear sentence where the {object_name} is located."""
        },
    ]

    # Generate with chat template
    input_ids = text_tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt",
        tokenize=True,
    ).to(text_model.device)

    outputs = text_model.generate(
        input_ids,
        do_sample=True,
        temperature=0.3,
        max_new_tokens=100,
        eos_token_id=text_tokenizer.eos_token_id,
    )

    raw = text_tokenizer.decode(
        outputs[0][input_ids.shape[1]:],
        skip_special_tokens=True,
    ).strip()

    # Take first sentence, clean up
    response = raw.split(".")[0].strip() + "."
    response = response.replace("\n", " ")

    # Strip dumb prefixes if they appear
    for prefix in [
        "According to the most recent observation, ",
        "According to ",
        "The most recent observation shows that ",
        "Based on the most recent observation, ",
        "likely "
    ]:
        if response.lower().startswith(prefix.lower()):
            response = response[len(prefix):]
            response = response[0].upper() + response[1:]

    # Ensure it starts nicely
    if not (response.startswith("The ") or response.startswith("Your ")):
        response = f"The {object_name} is {response[0].lower()}{response[1:]}"

    final_response = f"{response} (Last seen {last_seen_text})"

    return {
        "object": object_name,
        "found": True,
        "location": response,
        "last_seen": last_seen_text,
        "last_frame": most_recent["frame_index"] + 1,  # human-readable frame number
        "response": final_response,
        "num_observations": len(relevant_observations),
    }


def speak_result(result):
    """Natural speech for blind users."""
    if result["found"]:
        return f"{result['location']} I last saw it {result['last_seen']}."
    else:
        return f"I couldn't find your {result['object']} in the footage."


# --- Quick test harness (same as before, but now with correct times) ---
print("\n=== Object Finder with Correct Frame-Based Timing ===\n")

queries = [
    "Where is my wallet?",
    "Where did I put my keys?",
    "Where is my phone?",
    "Where are my glasses?",
    "Where is my bag?",
]

for query in queries:
    result = find_object(query)
    print(f"Q: {query}")
    if result["found"]:
        print(f"A: {result['response']}")
        print(
            f"   (Last seen in frame {result['last_frame']}, "
            f"seen in {result['num_observations']} different moments)\n"
        )
    else:
        print(f"A: {result['response']}\n")

print("\n=== Voice Mode (Natural Speech) ===\n")
for query in queries:
    result = find_object(query)
    print(f"Q: {query}")
    print(f"üîä {speak_result(result)}\n")


=== Object Finder with Correct Frame-Based Timing ===

Q: Where is my wallet?
A: The black wallet is located on the floor to the left of the black backpack, near the center of the image. (Last seen 4 seconds ago)
   (Last seen in frame 32, seen in 23 different moments)

Q: Where did I put my keys?
A: The keys are on the floor to the right of the wallet. (Last seen 4 seconds ago)
   (Last seen in frame 32, seen in 23 different moments)

Q: Where is my phone?
A: The phone is located on the desk of the person in the red shirt, slightly to the left. (Last seen 4 seconds ago)
   (Last seen in frame 32, seen in 23 different moments)

Q: Where are my glasses?
A: The glasses are located on the desk of the person in the blue shirt, near the center of the room. (Last seen 4 seconds ago)
   (Last seen in frame 32, seen in 22 different moments)

Q: Where is my bag?
A: The black bag is located on the floor in front of the couch, slightly to the left of the center. (Last seen 4 seconds ago)
   (Las

In [8]:
frame_descriptions

[{'timestamp': 0,
  'description': "Here's a list of objects someone might need to find later, along with their exact locations:\n\n1. Phone: Not visible in the image, but likely on the desk or in a nearby drawer.\n\n2. Wallet: Not visible in the image, but likely in a wallet or purse on the desk or floor.\n\n3. Keys: Not visible in the image, but likely in a key holder or on the desk.\n\n4. Glasses: Not visible in the image, but likely in a desk organizer or on the desk.\n\n5. Bag: Not visible in the image, but likely in a desk organizer or on the desk.\n\n6. Laptop: On the desk in the foreground, with its screen facing the camera.\n\n7. Remote: Not visible in the image, but likely on the desk or in a nearby drawer.\n\n8. Papers: Scattered on the desk, possibly in a folder or on a notepad.\n\n9. Computer mouse: On the desk, next to the keyboard.\n\n10. Computer keyboard: On the desk, next to the mouse.\n\n11. Computer monitor: On the desk, to the left of the keyboard.\n\n12. Computer 