In [17]:
import cv2
import os
import time
import torch
import torchvision.transforms as transforms
from PIL import Image
import clip
import numpy as np
import replicate

# Run CLIP on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Create a directory to store the images
image_directory = "captured_images"
os.makedirs(image_directory, exist_ok=True)

# Open the camera (device index 0) and fail fast if it cannot be opened:
# an unopened capture makes every later `cap.read()` report failure, which
# would otherwise leave the capture loop waiting for frames that never come.
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    raise RuntimeError("Could not open camera 0; check that a camera is connected and not in use.")

def _quit_requested_within(seconds):
    """Poll the OpenCV window for up to `seconds`; return True once 'q' is pressed.

    Polling with short `cv2.waitKey` calls keeps the preview window responsive
    and lets a key press be noticed at any point during the wait, unlike a
    blocking `time.sleep`. The keyboard is always polled at least once.
    """
    deadline = time.monotonic() + seconds
    while True:
        if cv2.waitKey(50) & 0xFF == ord('q'):
            return True
        if time.monotonic() >= deadline:
            return False


def _save_embeddings(embeddings):
    """Write the collected per-image feature rows to "embeddings.npy".

    Each list entry has a leading batch axis of size 1, so concatenating along
    axis 0 yields a 2-D array with one row per captured image. An empty 2-D
    array is written when nothing was captured.
    """
    if embeddings:
        stacked = np.concatenate(embeddings, axis=0)
    else:
        stacked = np.empty((0, 0), dtype=np.float32)
    np.save("embeddings.npy", stacked)


def capture_and_store_images(interval_seconds=10.0, max_read_failures=30):
    """Capture camera frames periodically, storing each image and its CLIP embedding.

    Every captured frame is shown in a window named 'Camera', encoded with the
    module-level CLIP `model`, and written to `image_directory` as
    ``image<N>.jpg`` (N starting at 1). Row ``N - 1`` of the saved embeddings
    array corresponds to ``image<N>.jpg``. Capturing stops when 'q' is pressed
    in the preview window or when the camera repeatedly fails to deliver
    frames. The camera is released and the embeddings are saved to
    "embeddings.npy" even if the loop is interrupted by an exception.

    Args:
        interval_seconds: Time to wait between two captures.
        max_read_failures: Number of consecutive failed frame reads after
            which capturing is abandoned instead of retrying forever.
    """
    image_count = 1
    embeddings = []
    consecutive_failures = 0

    try:
        while True:
            # Capture frame-by-frame
            ret, frame = cap.read()

            if not ret:
                # Retry a bounded number of times, pausing briefly so a dead
                # camera does not turn this into a busy loop.
                consecutive_failures += 1
                if consecutive_failures >= max_read_failures:
                    print("Camera stopped returning frames; ending capture.")
                    break
                time.sleep(0.1)
                continue
            consecutive_failures = 0

            # Display the resulting frame (optional)
            cv2.imshow('Camera', frame)

            # OpenCV delivers BGR; convert to RGB before handing to PIL/CLIP.
            image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            image_pil = Image.fromarray(image)
            preprocessed_image = preprocess(image_pil).unsqueeze(0).to(device)

            # Encode the image features
            with torch.no_grad():
                image_features = model.encode_image(preprocessed_image)

            # Record the embedding only once the image is safely on disk, so
            # embedding row i always matches file image<i + 1>.jpg.
            image_path = os.path.join(image_directory, f"image{image_count}.jpg")
            if cv2.imwrite(image_path, frame):
                embeddings.append(image_features.cpu().numpy())
                image_count += 1
            else:
                print("Could not write", image_path)

            # Wait for the next capture; stop if 'q' is pressed meanwhile.
            if _quit_requested_within(interval_seconds):
                break
    finally:
        # Release the camera
        cap.release()
        cv2.destroyAllWindows()
        # Persist whatever was captured so far, even on interruption.
        _save_embeddings(embeddings)

# Call the capture_and_store_images() function to start capturing images
capture_and_store_images()

# `device`, `model` and `preprocess` set up at the top of the script are
# reused here; loading the same CLIP model a second time only costs time
# and memory.

text = "a man "  # Replace with your textual description

# Preprocess the text
text_input = clip.tokenize([text]).to(device)

# Load the stored image embeddings
embeddings = np.load("embeddings.npy")

# Directory containing the captured images
image_directory = "captured_images"

most_similar_image_filename = None
if embeddings.size:
    # The embeddings may have been stacked with an extra singleton axis
    # (one (1, dim) array per image). Flatten to one row per image;
    # otherwise cosine_similarity reduces over the singleton axis and
    # argmax returns a feature position instead of an image index.
    image_features = torch.from_numpy(
        embeddings.reshape(embeddings.shape[0], -1)
    ).to(device)

    # Calculate similarity between the text and every image embedding.
    # Both sides are cast to float32 so stored and freshly computed
    # features agree in dtype.
    with torch.no_grad():
        text_features = model.encode_text(text_input)
        similarities = torch.nn.functional.cosine_similarity(
            text_features.float(), image_features.float(), dim=1
        )

    # Retrieve the index of the most similar image
    most_similar_index = torch.argmax(similarities).item()

    # Images are written as image1.jpg, image2.jpg, ... in capture order, so
    # embedding row i belongs to image<i + 1>.jpg. Building the name directly
    # avoids relying on os.listdir(), whose order is arbitrary and which may
    # also list unrelated or stale files.
    most_similar_image_filename = os.path.join(
        image_directory, f"image{most_similar_index + 1}.jpg"
    )

# cv2.imread returns None when the file is missing or unreadable.
most_similar_image = (
    cv2.imread(most_similar_image_filename)
    if most_similar_image_filename is not None
    else None
)

if most_similar_image is not None:
    # Display the most similar image until a key is pressed
    cv2.imshow("Most Similar Image", most_similar_image)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

    print("image", most_similar_image_filename)

    # Process the image using replicate for captioning.
    # SECURITY: the API token must come from the environment and never be
    # committed to source; any token that was previously hard-coded here
    # should be revoked.
    if os.environ.get("REPLICATE_API_TOKEN"):
        # The context manager closes the image file once the request is done.
        with open(most_similar_image_filename, "rb") as image_file:
            output = replicate.run(
                "rmokady/clip_prefix_caption:9a34a6339872a03f45236f114321fb51fc7aa8269d38ae0ce5334969981e4cd8",
                input={"image": image_file}
            )

        # Retrieve the caption from the output
        caption = output
        print("Image Caption:", caption)
    else:
        print("REPLICATE_API_TOKEN is not set; skipping captioning.")
else:
    print("No image found.")

image captured_images\image1.jpg
Image Caption: A man with a beard and glasses looking at the camera.
