In [1]:
import pandas as pd
import os
import torch
import torch.nn as nn
import json
import clip_.clip as clip
from clip_.clip import _transform
from clip_.model import CLIPGeneral
from tqdm.notebook import tqdm
from PIL import Image



# Checkpoints that can be embedded; `model_to_use` selects one of them by index.
checkpoint_paths = [
    '/data/checkpoints/checkpoints_icml22/yfcc/clip_rn50_yfcc_epoch_28.pt',
    '/data/checkpoints/checkpoints_icml22/yfcc/cloob_rn50_yfcc_epoch_28.pt',
]

# Table of images to embed; later cells read its 'jpg_path' column.
df_jpg = pd.read_csv('/data/csv/jpg_path_fingings.csv')
# Drop the last column of the CSV.
# NOTE(review): which column this removes is not visible from the notebook - confirm it is intended.
df_jpg = df_jpg.iloc[:, :-1]

# Model names are the checkpoint file names without directory and extension.
# Deriving them keeps this list aligned with `checkpoint_paths` (same order, same length).
model_names = [os.path.splitext(os.path.basename(path))[0] for path in checkpoint_paths]

In [2]:
# Parameters
model_to_use = 0      # position of the model in model_names / checkpoint_paths
number_of_gpu = "0"   # CUDA device index, kept as a string because it is appended to "cuda:"

# Name of the selected model; used to name the embedding directory and the output column.
model_to_embed = model_names[model_to_use]
print(model_to_embed)

clip_rn50_yfcc_epoch_28


In [3]:
# One directory per model; every image embedding is stored there as its own file.
embedding_directory = os.path.join("/data/embbeding/embeddings/rn50_yfcc/", model_to_embed)
# exist_ok=True makes this cell safe to re-run when the directory is already present.
os.makedirs(embedding_directory, exist_ok=True)

In [4]:
def load_checkpoint(checkpoint_path):
    """Build the model described by a checkpoint and load its weights for inference.

    The checkpoint must contain a 'model_config_file' entry (file name of a JSON
    config located in '/data/model_config') and a 'state_dict' entry.

    Args:
        checkpoint_path: Path to the checkpoint file readable by ``torch.load``.

    Returns:
        Tuple ``(model, preprocess, device)``: the model in eval mode on ``device``,
        the evaluation-time image transform, and the ``torch.device`` in use
        (``cuda:<number_of_gpu>`` if CUDA is available, otherwise CPU).

    Raises:
        AssertionError: If the referenced model config file does not exist.
    """
    # NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
    # map_location="cpu" makes loading independent of the GPU the checkpoint was saved
    # on; the model is moved to the target device further below.
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    model_config_file = os.path.join('/data/model_config', checkpoint['model_config_file'])

    device = torch.device("cuda:"+number_of_gpu if torch.cuda.is_available() else "cpu")
    print("Device is", device)

    # Load model config
    assert os.path.exists(model_config_file), f"Model config not found: {model_config_file}"
    with open(model_config_file, 'r') as f:
        model_info = json.load(f)
    model = CLIPGeneral(**model_info)
    preprocess = _transform(model.visual.input_resolution, is_train=False)

    # Load model state dictionary.
    # Strip the 'module.' prefix (added by DataParallel) only from keys that carry it,
    # so keys saved without the wrapper are not truncated.
    prefix = 'module.'
    sd = {
        (key[len(prefix):] if key.startswith(prefix) else key): value
        for key, value in checkpoint["state_dict"].items()
    }
    # NOTE(review): presumably this entry has no counterpart in CLIPGeneral and would
    # make load_state_dict fail - confirm.
    sd.pop('logit_scale_hopfield', None)
    model.load_state_dict(sd)

    model = model.to(device)
    model.eval()

    return model, preprocess, device




In [5]:
# Add a 'jpg_name' column: the file name of 'jpg_path' without directory and extension.
# splitext removes whatever extension is present instead of blindly cutting the last
# four characters, so names stay intact for e.g. '.jpeg' files or files without extension.
df_jpg['jpg_name'] = df_jpg['jpg_path'].map(
    lambda path: os.path.splitext(os.path.basename(path))[0]
)
df_jpg.head()


Unnamed: 0,study_id,jpg_path,jpg_name
0,50414267,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014
1,50414267,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962
2,53189527,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab
3,53189527,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c
4,53911762,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714


In [7]:
os.makedirs(embedding_directory, exist_ok=True)  # Create the embedding directory if it doesn't exist

# Load the checkpoint and model.
# load_checkpoint already selects the device and moves the model onto it, so its
# return values are used directly instead of deriving the device a second time.
model, preprocess, device = load_checkpoint(checkpoint_paths[model_to_use])

def save_embedding(image_path, embedding_path):
    """Encode one image with the loaded model and save the embedding to disk.

    Uses the notebook globals ``model``, ``preprocess`` and ``device``.

    Args:
        image_path: Path of the image file to embed.
        embedding_path: Destination file for the embedding (written with ``torch.save``).
    """
    # The context manager closes the image file as soon as the tensor has been built;
    # without it one handle per image stays open until garbage collection.
    with Image.open(image_path) as img:
        # .to(device) works for CPU and CUDA alike, whereas .cuda(device=...) raises
        # when the CPU fallback is active.
        image = preprocess(img).unsqueeze(0).to(device).type(model.dtype)
    with torch.no_grad():
        embedding = model.encode_image(image)
    # The tensor is saved on the device it was computed on.
    torch.save(embedding, embedding_path)

# Iterate over the DataFrame and save embeddings.
# Images whose embedding file already exists are skipped, so an interrupted run can be resumed.
embeddings = []  # never filled in this cell; kept so the name stays defined as before
embedding_paths = []
skipped = 0
image_rows = zip(df_jpg['jpg_path'], df_jpg['jpg_name'])
# Wrapping the iterable advances the bar for every row, including skipped ones,
# so the progress display is also correct on a resumed run.
for image_path, jpg_name in tqdm(image_rows, total=len(df_jpg)):
    embedding_path = os.path.join(embedding_directory, jpg_name + '.pt')
    if os.path.exists(embedding_path):
        skipped += 1
    else:
        save_embedding(image_path, embedding_path)
    embedding_paths.append(embedding_path)

# Assign the column once (in row order) instead of growing it cell by cell.
df_jpg['embedding_path_' + model_to_embed] = embedding_paths

# Clear cache
torch.cuda.empty_cache()

print(f"Skipped {skipped} embeddings.")  # Print the number of skipped embeddings.


Device is cuda:0


  0%|          | 0/377095 [00:00<?, ?it/s]

Skipped 0 embeddings.


In [8]:
# Display the frame to inspect the embedding-path column written by the embedding cell.
df_jpg

Unnamed: 0,study_id,jpg_path,jpg_name,embedding_path_clip_rn50_yfcc_epoch_28
0,50414267,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,/system/user/publicdata/MIMIC_CXR/hageneder/em...
1,50414267,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,/system/user/publicdata/MIMIC_CXR/hageneder/em...
2,53189527,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,/system/user/publicdata/MIMIC_CXR/hageneder/em...
3,53189527,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,/system/user/publicdata/MIMIC_CXR/hageneder/em...
4,53911762,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,/system/user/publicdata/MIMIC_CXR/hageneder/em...
...,...,...,...,...
377090,57132437,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,428e2c18-5721d8f3-35a05001-36f3d080-9053b83c,/system/user/publicdata/MIMIC_CXR/hageneder/em...
377091,57132437,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,58c403aa-35ff8bd9-73e39f54-8dc9cc5d-e0ec3fa9,/system/user/publicdata/MIMIC_CXR/hageneder/em...
377092,55368167,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,58766883-376a15ce-3b323a28-6af950a0-16b793bd,/system/user/publicdata/MIMIC_CXR/hageneder/em...
377093,58621812,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08,/system/user/publicdata/MIMIC_CXR/hageneder/em...


In [9]:
# Count the entries in the embedding directory (one file is written per image).
embedding_files = os.listdir(embedding_directory)
print(len(embedding_files))

377095


In [10]:
# Save the image table, including the embedding-path column, as <model_to_embed>.csv
df_embedding = df_jpg.copy()  # create a copy of df_jpg
# NOTE(review): this root differs from the one used for `embedding_directory` - confirm that is intended.
csv_path = '/system/user/publicdata/MIMIC_CXR/hageneder/embeddings/rn50_yfcc/' + model_to_embed + '.csv'
# Make sure the target directory exists; to_csv does not create missing directories.
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
df_embedding.to_csv(csv_path, index=False)
