In [33]:
import torch
from datasets import load_dataset
from PIL import Image
from torchmultimodal.models.flava.model import flava_model
from torchmultimodal.transforms.flava_transform import FLAVAImageTransform
from transformers import BertTokenizer

[{'id': [152628], 'area': [85550.11445000001], 'bbox': [[39.90999984741211, 97.08000183105469, 511.2799987792969, 326.8299865722656]], 'label': [4], 'iscrowd': [False]}, {'id': [341560, 425599, 1982651], 'area': [632.9258500000003, 82680.72434999997, 12868.869650000002], 'bbox': [[265.05999755859375, 126.02999877929688, 33.86000061035156, 66.26000213623047], [20.600000381469727, 1.0700000524520874, 270.4100036621094, 382.42999267578125], [268.6099853515625, 69.66000366210938, 222.67999267578125, 88.9000015258789]], 'label': [90, 1, 81], 'iscrowd': [False, False, False]}, {'id': [435260, 474294, 517321, 562804, 565540, 1042342, 1044205, 1045269, 1045368, 1046442, 1046668, 1245662, 1370458, 1542652, 1899986, 2110381, 2110889, 2110962, 2160368, 2186237], 'area': [12333.025599999999, 11494.345149999997, 4143.89425, 15237.131249999999, 3487.6596, 1287.7461999999991, 3870.9556000000002, 612.23305, 3296.4238500000006, 546.5966499999997, 1696.7556000000002, 2122.1392499999997, 2127.91509999999

In [21]:
import os
from datasets import load_dataset
# Workspace root: two directory levels above the current working directory,
# kept with a trailing "/" so suffixes can be appended directly.
exeSpace = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir)) + "/"
# Folder under the workspace root where the COCO files are expected.
targetPathBase = f"{exeSpace}datasets/coco"
# Base folder that create_full_path() prepends to each row's file name.
PATH_TO_IMAGE_FOLDER = targetPathBase

def create_full_path(example):
    """Add an `image_path` entry to a dataset row.

    The path is `PATH_TO_IMAGE_FOLDER` joined with the row's `file_name`.
    The row is modified in place and also returned, which is the shape
    `dataset.map` is called with in this notebook.
    """
    relative_name = example["file_name"]
    example["image_path"] = os.path.join(PATH_TO_IMAGE_FOLDER, relative_name)
    return example

# Load the COCO 2017 dataset from the Hub and give every row an `image_path` column.
dataset = load_dataset("phiyodr/coco2017").map(create_full_path)

Map: 100%|██████████| 118287/118287 [00:10<00:00, 11101.77 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 12396.77 examples/s]


In [40]:
def encode_images(image_paths, N):
    """Encode the first N images with FLAVA in a single batch.

    Args:
        image_paths: Sliceable sequence of image file paths.
        N: Number of leading paths to encode (`image_paths[:N]`).

    Returns:
        The second element of `model.encode_image(..., projection=True)`
        for the whole batch, left on the selected device.
    """
    # Use CUDA device 0 when CUDA is available, otherwise run on the CPU.
    gpu_index = 0  # Change this to the desired GPU index
    device = torch.device(f"cuda:{gpu_index}" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Pretrained FLAVA model in eval mode, plus its non-training image transform.
    model = flava_model(pretrained=True).to(device)
    model.eval()
    image_transform = FLAVAImageTransform(is_train=False)

    def _load_as_batch_of_one(path):
        # Force RGB, apply the transform, and add a leading batch dimension.
        rgb_image = Image.open(path).convert("RGB")
        return image_transform(rgb_image)["image"].unsqueeze(0)

    # Concatenate the per-image tensors along the batch dimension, then move to the device.
    per_image = [_load_as_batch_of_one(path) for path in image_paths[:N]]
    batch = torch.cat(per_image, dim=0).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        _, image_embeddings = model.encode_image(batch, projection=True)

    return image_embeddings
import numpy as np
def append_to_fvecs(file_path, vectors):
    """Append vectors to an .fvecs file.

    Each vector is written as one record: an int32 holding the vector
    length, followed by that many float32 values.

    Args:
        file_path: Path of the file to append to (created if missing).
        vectors: A 2-D torch tensor or NumPy array (one vector per row), or
            any iterable of 1-D tensors / arrays / number sequences.

    Raises:
        ValueError: If any individual vector is not 1-dimensional. This is
            checked before the file is opened, so nothing is written.
    """
    def _to_float32(obj):
        # Tensor-like objects are detached and copied to host memory first.
        if hasattr(obj, "detach"):
            obj = obj.detach().cpu().numpy()
        return np.asarray(obj, dtype=np.float32)

    if hasattr(vectors, "detach") or isinstance(vectors, np.ndarray):
        # Convert the whole batch once instead of once per row.
        rows = list(_to_float32(vectors))
    else:
        rows = [_to_float32(vec) for vec in vectors]

    # Validate up front so a bad row cannot leave a half-written record.
    for row in rows:
        if row.ndim != 1:
            raise ValueError(
                f"each vector must be 1-dimensional, got shape {row.shape}"
            )

    with open(file_path, 'ab') as f:
        for row in rows:
            np.array([row.shape[0]], dtype=np.int32).tofile(f)  # Dimension header
            np.ascontiguousarray(row).tofile(f)                 # Vector values
# Micro-batched image encoder with flushing after every batch
def encode_images_to_fvecs(image_paths, N, batch_size, output_file):
    """Encode up to N images with FLAVA and append the embeddings to an .fvecs file.

    Images are processed in micro-batches of at most `batch_size`, and each
    batch's embeddings are handed to `append_to_fvecs` before the next batch
    is loaded. Because `append_to_fvecs` opens the file in append mode,
    calling this again with the same `output_file` adds to the existing
    content instead of replacing it.

    Args:
        image_paths: Sliceable sequence of image file paths.
        N: Maximum number of images to encode, taken from the front.
        batch_size: Number of images per micro-batch; must be positive.
        output_file: Path of the .fvecs file to append to.

    Returns:
        None. The embeddings are only written to `output_file`.

    Raises:
        ValueError: If `batch_size` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Never exceed the available images; a negative N means "nothing to do".
    total_images = max(0, min(N, len(image_paths)))

    # Specify the GPU by index (e.g., use GPU 1)
    gpu_index = 0  # Change this to the desired GPU index
    device = torch.device(f"cuda:{gpu_index}" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load pretrained FLAVA model and move it to the correct device
    model = flava_model(pretrained=True).to(device)
    model.eval()
    # Define the image transform using FLAVA's image preprocessing
    image_transform = FLAVAImageTransform(is_train=False)

    # Process images in micro-batches and flush after every batch
    for i in range(0, total_images, batch_size):
        # Clamp the slice end to total_images so the last batch does not
        # run past N when N is not a multiple of batch_size.
        batch_paths = image_paths[i:min(i + batch_size, total_images)]

        per_image_tensors = []
        for image_path in batch_paths:
            # Open the image file and make sure it is in RGB mode
            image = Image.open(image_path).convert("RGB")
            # Apply the FLAVA image transform and add a batch dimension
            per_image_tensors.append(image_transform(image)["image"].unsqueeze(0))

        # Concatenate into one batch tensor and move it to the device
        batch_tensor = torch.cat(per_image_tensors, dim=0).to(device)

        # Encode the images using FLAVA's image encoder (inference only)
        with torch.no_grad():
            _, image_embeddings = model.encode_image(batch_tensor, projection=True)

        # Append the encoded embeddings to the .fvecs file
        append_to_fvecs(output_file, image_embeddings)

        print(f"Processed batch {i // batch_size + 1}, flushed to {output_file}")

    print(f"Finished encoding {total_images} images and saved to {output_file}")
# Micro-batched caption encoder with flushing after every batch
def encode_captions(captions, N, batch_size, output_file):
    """Encode up to N captions with FLAVA and append the embeddings to an .fvecs file.

    Captions are tokenized with the `bert-base-uncased` tokenizer (padded
    per batch, truncated to 128 tokens) and encoded in micro-batches of at
    most `batch_size`. Each batch's embeddings are handed to
    `append_to_fvecs` before the next batch is processed. Because
    `append_to_fvecs` opens the file in append mode, calling this again with
    the same `output_file` adds to the existing content.

    Args:
        captions: Sliceable sequence of caption strings.
        N: Maximum number of captions to encode, taken from the front.
        batch_size: Number of captions per micro-batch; must be positive.
        output_file: Path of the .fvecs file to append to.

    Returns:
        The embeddings of the LAST processed batch only (not of all
        batches), or None when no caption was processed.

    Raises:
        ValueError: If `batch_size` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Never exceed the available captions; a negative N means "nothing to do".
    total_captions = max(0, min(N, len(captions)))

    # Load BERT tokenizer from Hugging Face for text tokenization
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # Specify the GPU by index (e.g., use GPU 1)
    gpu_index = 0  # Change this to the desired GPU index
    device = torch.device(f"cuda:{gpu_index}" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load pretrained FLAVA model and move it to the correct device
    model = flava_model(pretrained=True).to(device)
    model.eval()

    # Defined before the loop so the final return cannot hit an unbound name
    # when there is nothing to encode.
    text_embeddings = None

    # Process captions in micro-batches and flush after every batch
    for i in range(0, total_captions, batch_size):
        # Clamp the slice end to total_captions so the last batch does not
        # run past N when N is not a multiple of batch_size.
        batch_captions = captions[i:min(i + batch_size, total_captions)]

        # Tokenize captions and convert to tensors
        inputs = tokenizer(batch_captions, return_tensors="pt", padding=True, truncation=True, max_length=128)
        text_tensors = inputs['input_ids'].to(device)  # Move to GPU if available

        # Encode the texts using FLAVA's text encoder (inference only)
        with torch.no_grad():
            _, text_embeddings = model.encode_text(text_tensors, projection=True)

        # Append the encoded embeddings to the .fvecs file
        append_to_fvecs(output_file, text_embeddings)

        print(f"Processed batch {i // batch_size + 1}, flushed to {output_file}")

    print(f"Finished encoding {total_captions} captions and saved to {output_file}")
    return text_embeddings

In [None]:
import numpy as np
import random
# Function to read vectors from an *.fvecs file
def read_fvecs(file_path):
    """Read every vector stored in an .fvecs file.

    The file is parsed as a sequence of records, each an int32 dimension
    followed by that many float32 values.

    Args:
        file_path: Path of the .fvecs file to read.

    Returns:
        A list of 1-D float32 NumPy arrays, one per record, in file order.
        The arrays are views over the bytes read and are read-only.

    Raises:
        ValueError: If the file ends in the middle of a record or a record
            declares a negative dimension.
    """
    vectors = []
    with open(file_path, 'rb') as f:
        while True:
            # Read the dimension (first 4 bytes of the record)
            dim_bytes = f.read(4)
            if not dim_bytes:
                break  # Clean end of file
            if len(dim_bytes) != 4:
                raise ValueError(
                    f"{file_path}: truncated dimension header after {len(vectors)} vectors"
                )
            # Plain int so the byte count below is ordinary Python arithmetic.
            dim = int(np.frombuffer(dim_bytes, dtype=np.int32)[0])
            if dim < 0:
                # A negative size passed to read() would consume the rest of the file.
                raise ValueError(
                    f"{file_path}: negative dimension {dim} in record {len(vectors)}"
                )

            # Read the vector based on the dimension
            payload = f.read(4 * dim)
            if len(payload) != 4 * dim:
                raise ValueError(
                    f"{file_path}: record {len(vectors)} declares {dim} values "
                    f"but only {len(payload)} bytes remain"
                )
            vectors.append(np.frombuffer(payload, dtype=np.float32))
    return vectors

# Concatenate two *.fvecs files and save the result into a new file
def append_fvecs(file1, file2, output_file):
    """Write the vectors of `file1` followed by those of `file2` to `output_file`.

    Both inputs are read fully via `read_fvecs` before the output is opened
    for writing. `output_file` is overwritten. Each vector is stored as an
    int32 length followed by float32 values.
    """
    first_vectors = read_fvecs(file1)
    second_vectors = read_fvecs(file2)

    with open(output_file, 'wb') as out:
        # file1's vectors come first, then file2's, each in original order.
        for vec in (*first_vectors, *second_vectors):
            np.array([vec.shape[0]], dtype=np.int32).tofile(out)  # Dimension header
            vec.astype(np.float32).tofile(out)                    # Vector values

    print(f"Appended {len(second_vectors)} vectors from {file2} to {file1}, saved to {output_file}")

# Function to shuffle and save combined vectors into a new *.fvecs file
def shuffle_and_save_fvecs(file1, file2, output_file, seed=None):
    """Combine two .fvecs files, shuffle the vectors, and write them to `output_file`.

    Args:
        file1: Path of the first input .fvecs file.
        file2: Path of the second input .fvecs file.
        output_file: Path of the .fvecs file to write (overwritten).
        seed: Optional seed for a reproducible shuffle. When None (the
            default) the module-level `random` generator is used, so the
            order depends on the global random state.
    """
    # Read and combine the vectors from both files
    combined_vectors = read_fvecs(file1) + read_fvecs(file2)

    # Shuffle in place; a private generator keeps a seeded run from
    # disturbing the global random state.
    if seed is None:
        random.shuffle(combined_vectors)
    else:
        random.Random(seed).shuffle(combined_vectors)

    # Save the shuffled vectors to a new .fvecs file
    with open(output_file, 'wb') as f:
        for vec in combined_vectors:
            dim = np.array([vec.shape[0]], dtype=np.int32)  # Write dimension
            dim.tofile(f)
            vec.astype(np.float32).tofile(f)  # Write vector values

    print(f"Shuffled {len(combined_vectors)} vectors from {file1} and {file2}, saved to {output_file}")

In [45]:
# Each training row holds a list of captions; show the first row's list.
captions = dataset['train']['captions']
print(captions[0])
# Keep only the first caption of every row.
captionFirst = [per_image[0] for per_image in captions]
print(captionFirst[1])
# Encode the first 128 captions and the first 128 images, each as a single
# batch of 128, appending the embeddings to the two .fvecs files.
caps = encode_captions(captionFirst, 128, 128, 'captions.fvecs')
images = encode_images_to_fvecs(dataset['train']['image_path'], 128, 128, 'image.fvecs')

['A man with a red helmet on a small moped on a dirt road. ', 'Man riding a motor bike on a dirt road on the countryside.', 'A man riding on the back of a motorcycle.', 'A dirt path with a young person on a motor bike rests to the foreground of a verdant area with a bridge and a background of cloud-wreathed mountains. ', 'A man in a red shirt and a red hat is on a motorcycle on a hill side.']
A woman wearing a net on her head cutting a cake. 




Using device: cuda:0


  state_dict = torch.load(local_path)


Processed batch 1, flushed to captions.fvecs
Finished encoding 128 captions and saved to captions.fvecs
Using device: cuda:0
Processed batch 1, flushed to image.fvecs
Finished encoding 128 images and saved to image.fvecs


In [48]:
# Show the validation split's summary (its feature names and row count).
print(dataset['validation'])

Dataset({
    features: ['license', 'file_name', 'coco_url', 'height', 'width', 'date_captured', 'flickr_url', 'image_id', 'ids', 'captions', 'image_path'],
    num_rows: 5000
})


print