In [None]:
#pip install datasets

In [1]:
import numpy as np
import pandas as pd
from datasets import Dataset
import faiss
# Load CSV file
df = pd.read_csv('captions.csv')

embeddings_dataset = Dataset.load_from_disk('embeddings_dataset')
# Load FAISS index
index = faiss.read_index('my_index.faiss')

image_paths = df['image'].tolist()
captions = df['caption'].tolist()

In [53]:
from transformers import AutoTokenizer, AutoModel

# Specify the path to your saved model directory
model_path = "D:\\5th_year\\Project\\project\\models"

# Load the tokenizer from the local directory
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load the model from the local directory
model = AutoModel.from_pretrained(model_path,from_tf=False, use_safetensors=True)

In [2]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

In [3]:
def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    #encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    encoded_input = {k: v for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [5]:
# embedding = get_embeddings(df["caption"][0])
# embedding.shape

In [6]:
embeddings_dataset.add_faiss_index(column="embeddings")

  0%|          | 0/41 [00:00<?, ?it/s]

Dataset({
    features: ['image', 'caption', 'embeddings'],
    num_rows: 40455
})

In [32]:
query = "dog and cat"
query_embedding = get_embeddings([query]).cpu().detach().numpy()
query_embedding.shape

(1, 768)

In [34]:
def my_get_nearest_examples():
    k=2
    scores = []
    samples = []
    difference = 1
    while difference > 0.6 and k<19:
        del scores, samples
        k= k+1
        scores, samples = embeddings_dataset.get_nearest_examples(
        "embeddings", query_embedding, k)
        difference = scores[k-1]/scores[0]
        
    return scores,samples


In [35]:
scores, samples = my_get_nearest_examples()

In [36]:
samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)

In [37]:
for _, row in samples_df.iterrows():
    print(f"Caption: {row.caption}")
    print(f"SCORE: {row.scores}")
    print(f"Image: {row.image}")
    print("=" * 50)
    print()

Caption: dogs playing
SCORE: 19.443740844726562
Image: 2714703706_d21c5cb8df.jpg

Caption: Two dogs are playing out in a yard .
SCORE: 19.360389709472656
Image: 3534046564_4f8546e364.jpg

Caption: Two dogs playing on carpet .
SCORE: 19.354385375976562
Image: 2714699748_c9270dd5aa.jpg

Caption: Two dogs groom each other .
SCORE: 19.30721664428711
Image: 3502563726_30d1ce29c8.jpg

Caption: Two dogs are playing together .
SCORE: 19.159297943115234
Image: 2584412512_6767593f24.jpg

Caption: A black and a tan dog .
SCORE: 19.05617904663086
Image: 2466171100_5e60cfcc11.jpg

Caption: Two dogs are playing .
SCORE: 19.024368286132812
Image: 2410399168_1462c422d4.jpg

Caption: Two dogs sharing a toy .
SCORE: 18.76192855834961
Image: 2428797297_7fc3c862db.jpg

Caption: Two dogs are staring at a grey and white cat .
SCORE: 18.735706329345703
Image: 2098646162_e3b3bbf14c.jpg

Caption: Two dogs in a yard .
SCORE: 18.360015869140625
Image: 3151492269_28d8edaa68.jpg

Caption: A dog and a cat are swimm

In [49]:
import pandas as pd
from PIL import Image
import os
import shutil


images_folder = 'images'
retrieved_folder = 'retrieved_faiss'

# Ensure the retrieved_faiss directory exists
os.makedirs(retrieved_folder, exist_ok=True)

for _, row in samples_df.iterrows():
    print(f"Caption: {row.caption}")
    print(f"Score: {row.scores}")
    print(f"Image: {row.image}")
    print("=" * 50)

    # Construct the full path to the image file
    image_path = os.path.join(images_folder, row.image)

    # Open and save the image to the retrieved_faiss folder
    if os.path.exists(image_path):
        image = Image.open(image_path)
        image.show()
        # Construct the path to save the image in the retrieved_faiss folder
        save_path = os.path.join(retrieved_folder, row.image)
        image.save(save_path)
        print(f"Image saved to {save_path}")
    else:
        print(f"Image file {image_path} not found.")
    print()  # Add a blank line for better readability


Caption: dogs playing
Score: 19.443740844726562
Image: 2714703706_d21c5cb8df.jpg
Image saved to retrieved_faiss\2714703706_d21c5cb8df.jpg

Caption: Two dogs are playing out in a yard .
Score: 19.360389709472656
Image: 3534046564_4f8546e364.jpg
Image saved to retrieved_faiss\3534046564_4f8546e364.jpg

Caption: Two dogs playing on carpet .
Score: 19.354385375976562
Image: 2714699748_c9270dd5aa.jpg
Image saved to retrieved_faiss\2714699748_c9270dd5aa.jpg

Caption: Two dogs groom each other .
Score: 19.30721664428711
Image: 3502563726_30d1ce29c8.jpg
Image saved to retrieved_faiss\3502563726_30d1ce29c8.jpg

Caption: Two dogs are playing together .
Score: 19.159297943115234
Image: 2584412512_6767593f24.jpg
Image saved to retrieved_faiss\2584412512_6767593f24.jpg

Caption: A black and a tan dog .
Score: 19.05617904663086
Image: 2466171100_5e60cfcc11.jpg
Image saved to retrieved_faiss\2466171100_5e60cfcc11.jpg

Caption: Two dogs are playing .
Score: 19.024368286132812
Image: 2410399168_1462c42

In [39]:
# import psutil

# # Print the current memory usage
# print(f"Memory usage before training: {psutil.virtual_memory().percent}%")
# # Assuming samples['embeddings'] is your list of embeddings
# # Convert embeddings to float32 if not already done
# retrieved_embeddings = np.array(samples['embeddings']).astype('float32')

# # Check the shape of the array
# print("Shape of retrieved_embeddings:", retrieved_embeddings.shape)

# # Verify no NaNs or infinite values are present
# if np.any(np.isnan(retrieved_embeddings)):
#     raise ValueError("Data contains NaNs")
# if np.any(np.isinf(retrieved_embeddings)):
#     raise ValueError("Data contains infinite values")

# # Initialize FAISS KMeans
# n_clusters = 3 
# d = retrieved_embeddings.shape[1]  # Dimension of embeddings

# kmeans = faiss.Kmeans(d=d, k=n_clusters, niter=20, verbose=True)

# # Train FAISS KMeans
# try:
#     kmeans.train(retrieved_embeddings)
#     print("KMeans training completed successfully.")
# except Exception as e:
#     print(f"An error occurred during KMeans training: {e}")

# # Retrieve the cluster centers and labels
# cluster_centers = kmeans.centroids
# _, labels = kmeans.index.search(retrieved_embeddings, 1)

# # Convert labels from 2D to 1D
# labels = labels.flatten()

# print("Cluster centers:", cluster_centers)
# print("Labels:", labels)

# print(f"Memory usage after training: {psutil.virtual_memory().percent}%")

In [40]:
# # Extract embeddings of the retrieved images
# retrieved_embeddings = np.array(samples['embeddings']).astype('float32')

# # Normalize the embeddings
# faiss.normalize_L2(retrieved_embeddings)

# # Set the number of clusters
# num_clusters = 3  

# # Initialize and train K-means
# kmeans = faiss.Kmeans(d=retrieved_embeddings.shape[1], k=num_clusters, niter=20, verbose=True)
# kmeans.train(retrieved_embeddings)

# # Assign cluster labels to the retrieved embeddings
# _, cluster_labels = kmeans.index.search(retrieved_embeddings, 1)
# cluster_labels = cluster_labels.flatten()

# # Add cluster labels to samples_df
# samples_df['cluster_label'] = cluster_labels

In [41]:
# Check the shape of the array
# print("Shape of retrieved_embeddings:", retrieved_embeddings.shape)

In [42]:
def batch_generator(data, batch_size):
    """Yield successive batches from data."""
    for i in range(0, len(data), batch_size):
        yield data[i:i + batch_size]


In [43]:
def incremental_kmeans(data, batch_size, n_clusters, niter=20):
    """Train FAISS KMeans incrementally using batches."""
    d = data.shape[1]
    all_centroids = []

    for batch in batch_generator(data, batch_size):
        if len(batch) < n_clusters:
            print(f"Skipping batch of size {len(batch)} as it's smaller than the number of clusters {n_clusters}")
            continue
        
        # Initialize FAISS KMeans for the batch
        kmeans = faiss.Kmeans(d=d, k=n_clusters, niter=niter, verbose=True)
        kmeans.train(batch)
        all_centroids.append(kmeans.centroids)

    if not all_centroids:
        raise ValueError("No batches were large enough to train on.")

    # Combine centroids from all batches
    combined_centroids = np.vstack(all_centroids)

    # Final KMeans on combined centroids to get final clusters
    final_kmeans = faiss.Kmeans(d=d, k=n_clusters, niter=niter, verbose=True)
    final_kmeans.train(combined_centroids)
    
    return final_kmeans

In [44]:
# Retrieve scores and samples using the function
scores, samples = my_get_nearest_examples()

import psutil
import numpy as np
import faiss
import tracemalloc

# Print the current memory usage
print(f"Memory usage before training: {psutil.virtual_memory().percent}%")

retrieved_embeddings = np.array(samples['embeddings']).astype('float32')
# Normalize the embeddings
faiss.normalize_L2(retrieved_embeddings)
# Check the shape of the array
print("Shape of retrieved_embeddings:", retrieved_embeddings.shape)

# Verify no NaNs or infinite values are present
if np.any(np.isnan(retrieved_embeddings)):
    raise ValueError("Data contains NaNs")
if np.any(np.isinf(retrieved_embeddings)):
    raise ValueError("Data contains infinite values")

# Initialize FAISS KMeans
n_clusters = 3 
batch_size = 19



Memory usage before training: 67.1%
Shape of retrieved_embeddings: (19, 768)


In [45]:
# Start memory tracing
tracemalloc.start()

# Train KMeans incrementally
final_kmeans = incremental_kmeans(retrieved_embeddings, batch_size, n_clusters)

In [46]:
# Retrieve the final cluster centers and labels
cluster_centers = final_kmeans.centroids
_, labels = final_kmeans.index.search(retrieved_embeddings, 1)

In [47]:
# Convert labels from 2D to 1D
labels = labels.flatten()
samples_df['cluster_label'] = labels
print("Cluster centers:", cluster_centers)
print("Labels:", labels)

# Print memory usage snapshot
snapshot = tracemalloc.take_snapshot()
top_stats = snapshot.statistics('lineno')

print("[ Top 10 memory consuming lines ]")
for stat in top_stats[:10]:
    print(stat)

print(f"Memory usage after training: {psutil.virtual_memory().percent}%")

Cluster centers: [[ 0.01654609 -0.07860383 -0.08407672 ...  0.00525103  0.07108025
  -0.01747063]
 [ 0.0036962  -0.07159897 -0.07580122 ... -0.01412799  0.0725638
  -0.02900064]
 [-0.00175342 -0.07461578 -0.07971981 ...  0.00392738  0.05894775
  -0.02776729]]
Labels: [2 1 0 0 0 0 2 0 1 1 2 1 2 1 2 1 2 2 0]
[ Top 10 memory consuming lines ]
d:\5th_year\Advanced_AI\reid_env\lib\ast.py:47: size=10.1 MiB, count=181077, average=58 B
<string>:1: size=7528 KiB, count=64242, average=120 B
d:\5th_year\Advanced_AI\reid_env\lib\tokenize.py:528: size=6254 KiB, count=114351, average=56 B
d:\5th_year\Advanced_AI\reid_env\lib\site-packages\asttokens\line_numbers.py:60: size=3502 KiB, count=128078, average=28 B
d:\5th_year\Advanced_AI\reid_env\lib\site-packages\asttokens\line_numbers.py:44: size=2595 KiB, count=13134, average=202 B
d:\5th_year\Advanced_AI\reid_env\lib\site-packages\executing\executing.py:241: size=2098 KiB, count=31977, average=67 B
d:\5th_year\Advanced_AI\reid_env\lib\site-packages\a

In [52]:
import os
from PIL import Image

images_folder = 'images'
clustered_folder = 'clustered_retrieved_faiss'
os.makedirs(retrieved_folder, exist_ok=True)

# Display images grouped by cluster
for cluster in range(n_clusters):
    print(f"Cluster {cluster}")
    cluster_df = samples_df[samples_df['cluster_label'] == cluster]
    for _, row in cluster_df.iterrows():
        print(f"Caption: {row.caption}")
        print(f"Score: {row.scores}")
        print(f"Image: {row.image}")
        print("=" * 50)

        image_path = os.path.join(images_folder, row.image)
        if os.path.exists(image_path):
            image = Image.open(image_path)
            image.show()
            save_path = os.path.join(clustered_folder, f"cluster_{cluster}_{row.image}")
            image.save(save_path)
            print(f"Image saved to {save_path}")
        else:
            print(f"Image file {image_path} not found.")
        print()


Cluster 0
Caption: Two dogs playing on carpet .
Score: 19.354385375976562
Image: 2714699748_c9270dd5aa.jpg
Image saved to clustered_retrieved_faiss\cluster_0_2714699748_c9270dd5aa.jpg

Caption: Two dogs groom each other .
Score: 19.30721664428711
Image: 3502563726_30d1ce29c8.jpg
Image saved to clustered_retrieved_faiss\cluster_0_3502563726_30d1ce29c8.jpg

Caption: Two dogs are playing together .
Score: 19.159297943115234
Image: 2584412512_6767593f24.jpg
Image saved to clustered_retrieved_faiss\cluster_0_2584412512_6767593f24.jpg

Caption: A black and a tan dog .
Score: 19.05617904663086
Image: 2466171100_5e60cfcc11.jpg
Image saved to clustered_retrieved_faiss\cluster_0_2466171100_5e60cfcc11.jpg

Caption: Two dogs sharing a toy .
Score: 18.76192855834961
Image: 2428797297_7fc3c862db.jpg
Image saved to clustered_retrieved_faiss\cluster_0_2428797297_7fc3c862db.jpg

Caption: Two dogs .
Score: 15.053194046020508
Image: 3340857141_85d97a7466.jpg
Image saved to clustered_retrieved_faiss\clust