In [2]:
# Authenticate this notebook session with the Hugging Face Hub; the call renders a login widget.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# Upgrade the Hugging Face data/model stack used below.
%pip install -qq -U datasets transformers pyarrow
# NOTE(review): transformers is upgraded a second time here; the installs could be merged and version-pinned.
%pip install -qq --upgrade transformers ftfy accelerate regex tqdm
# NOTE(review): the OpenAI CLIP package is installed from source, but the visible cells load CLIP through transformers — confirm it is still needed.
%pip install git+https://github.com/openai/CLIP.git

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-1f1_yadz
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-1f1_yadz
  Resolved https://github.com/openai/CLIP.git to commit a1d071733d7111c9c014f024669f959182114e33
  Preparing metadata (setup.py) ... [?25ldone
Note: you may need to restart the kernel to use updated packages.


Let's import all the libraries we need

In [7]:
import json
import os
import pickle
import random
from pathlib import Path
from random import shuffle

import requests
import torch
from datasets import load_dataset
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from transformers import AutoProcessor, CLIPVisionModel


In [8]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device set to {device}")

Device set to cuda


**Download the CLIP model to encode the image**

In [9]:
# Load the CLIP ViT-B/32 vision model and the processor published under the same checkpoint name.
# NOTE(review): the model is not moved to `device` in this cell.
clip_model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
clip_preprocess = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")


config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/568 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

**Let's write a function that encodes an image with the CLIP model**

In [10]:
def calc_image_emb(img, model, preprocess, device):
    """
    Compute the CLIP vision-encoder hidden states for an image.

    Parameters:
    - img: Image (or list of images) accepted by `preprocess`.
    - model: Vision model that supports `.to(device)` and, when called with the
      processed inputs, returns an object with a `last_hidden_state` attribute.
    - preprocess: Callable invoked as `preprocess(images=img, return_tensors="pt")`
      that returns a mapping of model inputs.
    - device: Device on which the forward pass runs (e.g. "cuda" or "cpu").

    Returns:
    - torch.Tensor: `last_hidden_state` with every size-1 dimension squeezed
      out, moved back to the CPU.
    """
    # Honour the `device` argument: the weights and the inputs must live on
    # the same device for the forward pass.
    model = model.to(device)
    processed_image = preprocess(images=img, return_tensors="pt")
    model_inputs = {
        name: value.to(device) if hasattr(value, "to") else value
        for name, value in processed_image.items()
    }
    with torch.no_grad():
        outputs = model(**model_inputs)
    # Hand back a CPU tensor so callers can serialise it without keeping GPU memory alive.
    return outputs.last_hidden_state.squeeze().cpu()
    


**Let's test the CLIP embeddings on a sample image from COCO**

In [11]:
# Smoke test: encode one COCO validation image and inspect the pooled feature shape.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# Fail fast on network problems instead of hanging or trying to decode an error page.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)
# Keep the inputs on whatever device the model currently lives on.
processed_image = clip_preprocess(images=image, return_tensors="pt").to(clip_model.device)
with torch.no_grad():
    outputs = clip_model(**processed_image)
    last_hidden_state = outputs.last_hidden_state
print(outputs.pooler_output.shape)
img_feat = outputs.pooler_output

torch.Size([1, 768])


**We first need to download the COCO 2017 images, compute their CLIP embeddings, and store them**

In [12]:
def prepare_image_embeddings(img_path_list, root_dir = None):
    """
    Compute CLIP image embeddings for every image in `img_path_list`.

    Parameters:
    - img_path_list: Iterable of image file paths (or bare file names when
      `root_dir` is given).
    - root_dir: Optional directory joined in front of every entry.

    Returns:
    - dict: Maps each file's stem (name without directory or extension) to its
      embedding as nested Python lists. Files sharing a stem overwrite each other.

    Relies on the notebook-level `clip_model`, `clip_preprocess` and `device`.
    """
    embeddings_dict = {}
    for f_name in tqdm(img_path_list):
        if root_dir is not None:
            f_name = os.path.join(root_dir, f_name)
        # Close each file as soon as it has been encoded so a long run does
        # not accumulate open file handles.
        with Image.open(f_name) as img:
            img_embd = calc_image_emb(img, clip_model, clip_preprocess, device)
        embeddings_dict[Path(f_name).stem] = img_embd.squeeze().tolist()
    return embeddings_dict

def get_absolute_paths(directory_path, max_files = None):
    """
    Recursively list the absolute paths of all files under a directory.

    Parameters:
    - directory_path (str): Directory to walk.
    - max_files (int, optional): Upper bound on the number of paths returned.
      `None` means no limit.

    Returns:
    - list: Absolute file paths in `os.walk` order; empty when
      `directory_path` is not a directory.
    """
    absolute_paths = []

    # An invalid directory yields an empty list rather than an error.
    if not os.path.isdir(directory_path):
        return absolute_paths

    for root, _, files in tqdm(os.walk(directory_path)):
        for file in files:
            # Stop the whole walk once the cap is reached; a bare `break`
            # would only leave the current directory and keep collecting.
            if max_files is not None and len(absolute_paths) >= max_files:
                return absolute_paths
            absolute_paths.append(os.path.abspath(os.path.join(root, file)))
    return absolute_paths

def prepare_files_list(dataset_path, dirs = None):
    """
    Collect absolute file paths from sub-directories of a dataset.

    Parameters:
    - dataset_path (str): Root directory of the dataset.
    - dirs (iterable of str or str, optional): Sub-directories of
      `dataset_path` to scan. When omitted, `dataset_path` itself is scanned.

    Returns:
    - list: Paths gathered by `get_absolute_paths`, in the order the
      directories were given.
    """
    if dirs is None:
        # Iterating over the None default would raise TypeError; scan the root instead.
        return get_absolute_paths(dataset_path)
    if isinstance(dirs, str):
        # A lone directory name must not be iterated character by character.
        dirs = [dirs]
    files_list = []
    for each_dir in dirs:
        files_list.extend(get_absolute_paths(os.path.join(dataset_path, each_dir)))
    return files_list


def write_dict_to_json(data_dict, file_path):
    """
    Serialise a dictionary as pretty-printed JSON on disk.

    Parameters:
    - data_dict: The JSON-serialisable dictionary to store.
    - file_path: Destination path; an existing file is overwritten.
    """
    with open(file_path, mode='w') as sink:
        json.dump(obj=data_dict, fp=sink, indent=4)
        

def read_json_file(file_path):
    """
    Load a JSON document from disk.

    Parameters:
    - file_path (str): Location of the JSON file.

    Returns:
    - The decoded JSON content, or None when the file is missing or does not
      contain valid JSON (a message is printed in both cases).
    """
    try:
        with open(file_path, 'r') as source:
            parsed = json.load(source)
    except FileNotFoundError:
        print(f"Error: File not found - {file_path}")
        return None
    except json.JSONDecodeError:
        print(f"Error: Unable to decode JSON in file - {file_path}")
        return None
    return parsed

        
def list_of_dicts_to_dict_of_dicts(list_of_dicts, key):
    """
    Index a list of dictionaries by the value each one stores under `key`.

    Parameters:
    - list_of_dicts (list): Records to index.
    - key (str): Field whose value becomes the record's identifier.

    Returns:
    - dict: Identifier -> record. Records without `key` (or with a None value)
      are skipped; when identifiers repeat, the last record wins.
    """
    keyed_records = ((record.get(key), record) for record in list_of_dicts)
    return {
        identifier: record
        for identifier, record in keyed_records
        if identifier is not None
    }

def parse_metadata(metadata_path, dict_key = 'id' ):
    """
    Load a JSON metadata file and index its records by `dict_key`.

    Parameters:
    - metadata_path (str): Path to a JSON file holding a list of dictionaries.
    - dict_key (str): Field of each record used as the key of the result.

    Returns:
    - dict: Maps each record's `dict_key` value to the record itself.

    Raises:
    - ValueError: If the file could not be read or decoded.
    """
    metadata = read_json_file(metadata_path)
    if metadata is None:
        # read_json_file reports the failure and yields None; stop here with a
        # clear message instead of failing later while iterating over None.
        raise ValueError(f"Could not load metadata from {metadata_path}")
    return list_of_dicts_to_dict_of_dicts(metadata, key = dict_key)

**Load the embeddings for the COCO dataset. If the embeddings are not available, compute them.**

In [13]:
train_dataset_path = '/kaggle/input/coco-2017-dataset/coco2017/train2017'
# The directory scan below is disabled; a previously pickled shard of the file list is loaded instead.
#files_list = get_absolute_paths(train_dataset_path)
# NOTE(review): pickle.load can execute arbitrary code from the file — only load pickles this notebook produced.
with open('/kaggle/working/files_list_3.pkl','rb') as fh:
    files_list_3 = pickle.load(fh)

In [None]:
# Disabled one-off step: shuffle the full file list, split it into three shards
# and pickle each shard.
# NOTE(review): the slice bounds below skip the items at indices 50000 and 100000.
#shuffle(files_list)
#files_list_1 = files_list[:50000]
#files_list_2 = files_list[50001:100000]
#files_list_3 = files_list[100001:]
# Write files list to pkl files
#with open('files_list_1.pkl','wb') as fh:
#    pickle.dump(files_list_1, fh)
#with open('files_list_2.pkl','wb') as fh:
#    pickle.dump(files_list_2, fh)
#with open('files_list_3.pkl','wb') as fh:
#    pickle.dump(files_list_3, fh)

In [17]:
# Remove a stale embeddings file if present; a missing file is not an error, so the cell can be re-run safely.
Path('/kaggle/working/coco_embeddings.json').unlink(missing_ok=True)

In [None]:
# Output locations for the embeddings computed from the third COCO file-list shard.
pkl_file_path = '/kaggle/working/coco_embeddings_3.pkl'
json_file_path = '/kaggle/working/coco_embeddings_3.json'

# Only the first 15000 paths of the shard are embedded in this run.
embeddings_dict = prepare_image_embeddings(files_list_3[:15000])

# Write to pkl file
with open(pkl_file_path,'wb') as fh:
    pickle.dump(embeddings_dict, fh)

# Write to JSON (same content as the pickle, in a text format)
write_dict_to_json(embeddings_dict, json_file_path)

  0%|          | 0/15000 [00:00<?, ?it/s]

In [None]:
import numpy as np
# Spot-check one stored embedding by converting the nested list back into an array.
# NOTE(review): the key is hard-coded and must be one of the images embedded above — confirm.
c = embeddings_dict['000000501175']
c2 = np.asarray(c)


**Compute CLIP embeddings for CC3M dataset (https://huggingface.co/datasets/liuhaotian/LLaVA-CC3M-Pretrain-595K)**

**First load the metadata file for captions**

In [None]:
# Load the CC3M captions metadata and index the records by their 'id' field.
# NOTE(review): json_file_path is reused here; it now names the metadata input, not the COCO embeddings output.
json_file_path = '/kaggle/input/cc3m-captions/metadata.json'
metadata_key = 'id'
cc3m_metadata = parse_metadata(json_file_path, dict_key = metadata_key)

**Gather the list of images in our dataset**

In [None]:
cc3m_path = '/kaggle/input/cc3m-pretrain'
max_images = 175000
# Sort first: os.listdir returns entries in arbitrary order, so a seeded
# shuffle is only repeatable when it starts from a fixed ordering.
files_list = sorted(os.listdir(cc3m_path))
# A dedicated seeded generator selects the same subset on every run without
# touching the global random state.
random.Random(42).shuffle(files_list)
files_list = files_list[:max_images]


**Calculate CLIP embeddings for all the images**

In [None]:
# Embed the sampled CC3M images; files_list holds bare file names, so cc3m_path is joined in front of each.
embedding_dict = prepare_image_embeddings(files_list, root_dir = cc3m_path)

In [None]:
# Show only how many images were embedded; echoing the whole dict would render every embedding value in the cell output.
len(embedding_dict)

In [None]:
def update_captions_metadata_with_embeddings(cc3m_metadata, embedding_dict):
    """
    Build caption records enriched with their CLIP embeddings.

    Parameters:
    - cc3m_metadata (dict): Maps an image id to a record containing at least
      'caption' and 'blip_caption'.
    - embedding_dict (dict): Maps an image id to its embedding.

    Returns:
    - dict: One entry per key of `embedding_dict`, holding 'clip_embeddings',
      'id', 'caption' and 'blip_caption'.

    Raises:
    - KeyError: If an id in `embedding_dict` has no entry in `cc3m_metadata`.
    """
    def _merged_record(image_id, clip_vector):
        # Key order mirrors the layout written to disk later on.
        return {
            'clip_embeddings': clip_vector,
            'id': image_id,
            'caption': cc3m_metadata[image_id]['caption'],
            'blip_caption': cc3m_metadata[image_id]['blip_caption']
        }

    return {
        image_id: _merged_record(image_id, clip_vector)
        for image_id, clip_vector in tqdm(embedding_dict.items())
    }


In [None]:
# Peek at one metadata record to check its structure. The parsed captions live
# in `cc3m_metadata`; no notebook-level variable named `metadata` exists.
all_keys = list(cc3m_metadata.keys())
print(all_keys[0])
cc3m_metadata[all_keys[0]]

In [None]:
# Display a single CC3M image as a visual sanity check.
im = Image.open('/kaggle/input/cc3m-pretrain/GCC_train_002582585.jpg')
im

In [None]:
# Attach each image's CLIP embedding to its caption record.
updated_metadata = update_captions_metadata_with_embeddings(cc3m_metadata, embedding_dict)

**Write updated metadata to JSON**

In [None]:
# Write to JSON
cc3m_metadata_path = 'cc3m_captions_and_clip_embeddings.json'
write_dict_to_json(updated_metadata, cc3m_metadata_path)

**Write updated metadata to pickle**

In [None]:
# Write to pkl file
pkl_file_path = 'cc3m_captions_and_clip_embeddings.pkl'
with open(pkl_file_path,'wb') as fh:
    pickle.dump(updated_metadata, fh)