In [1]:
from PIL import Image
import os
import pandas as pd
import numpy as np
import torch

In [2]:
import open_clip
# get the model and tokenizer 
model, preprocess_train, preprocess_val = open_clip.create_model_and_transforms('hf-hub:microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224')
tokenizer = open_clip.get_tokenizer('hf-hub:microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224')

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import torch
from urllib.request import urlopen
from PIL import Image

template = 'this is a photo of '
labels = [
    'genomics visualization',
    'gene schematic',
    'lollipop plot',
    'manhattan plot',
    'umap',
    'pca',
    'pie chart',
    'heatmap',
    'line graph',
    'histogram',
    'box plot',
    'violin plot',
    'scatter plot',
    'bar plot',
    'stacked bar chart',
    'dot plot',
    'bubble chart',
    'network diagram',
    'phylogenetic tree',
    'flow chart',
    'venn diagram',
    'upset plot',
    'sankey diagram',
    'circos plot',
    'aligned peaks',
    'mutation signature',
    'plant',
    'western blot',
    'sashimi plot',
    'volcano plot',
    'flow cytometry visualization',
    'sanger trace',
    'dose-response curve',
    'Kaplan–Meier curves',
    'stained cells',
    'gel electrophoresis',
    'gene enrichment',
    'sequence logos',
    'multiple seqeunce alignment',
    'circular genomic visualization',
    'bam pileup',
    'treatment conditions',
    'protein structure',
    'table'
]

# set the directory where the images are
image_folder = "screenshots"
test_imgs = [
    os.path.join(image_folder, file)
    for file in os.listdir(image_folder)
    if file.endswith((".png", ".jpg", ".gif"))
]
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.eval()

context_length = 256
batch_size = 50

# this will store the top label for each image
top_label = []
# this will store the embeddings of each image
clip_embeddings = torch.Tensor()


for i in range(0, len(test_imgs), batch_size):
    print(f"Processing {i} to {i+batch_size} of {len(test_imgs)}")
    if i + batch_size > len(test_imgs):
        batch_size = len(test_imgs) - i
    batch = test_imgs[i:i+batch_size]
    images = torch.stack([preprocess_val(Image.open(img)) for img in batch]).to(device)
    texts = tokenizer([template + l for l in labels], context_length=context_length).to(device)

    with torch.no_grad():
        image_features, text_features, logit_scale = model(images, texts)

        logits = (logit_scale * image_features @ text_features.t()).detach().softmax(dim=-1)
        sorted_indices = torch.argsort(logits, dim=-1, descending=True)
        clip_embeddings = torch.cat((clip_embeddings, image_features / image_features.norm(dim=-1, keepdim=True)))

        logits = logits.cpu().numpy()
        sorted_indices = sorted_indices.cpu().numpy()
        top = [labels[i] for i in sorted_indices[:, 0]]
        top_label += top

Processing 0 to 50 of 7296
Processing 50 to 100 of 7296
Processing 100 to 150 of 7296
Processing 150 to 200 of 7296
Processing 200 to 250 of 7296
Processing 250 to 300 of 7296
Processing 300 to 350 of 7296
Processing 350 to 400 of 7296
Processing 400 to 450 of 7296
Processing 450 to 500 of 7296
Processing 500 to 550 of 7296
Processing 550 to 600 of 7296
Processing 600 to 650 of 7296
Processing 650 to 700 of 7296
Processing 700 to 750 of 7296
Processing 750 to 800 of 7296
Processing 800 to 850 of 7296
Processing 850 to 900 of 7296
Processing 900 to 950 of 7296
Processing 950 to 1000 of 7296
Processing 1000 to 1050 of 7296
Processing 1050 to 1100 of 7296
Processing 1100 to 1150 of 7296
Processing 1150 to 1200 of 7296
Processing 1200 to 1250 of 7296
Processing 1250 to 1300 of 7296
Processing 1300 to 1350 of 7296
Processing 1350 to 1400 of 7296
Processing 1400 to 1450 of 7296
Processing 1450 to 1500 of 7296
Processing 1500 to 1550 of 7296
Processing 1550 to 1600 of 7296
Processing 1600 to 

In [10]:
data  = {
    "image": test_imgs,
    "top_label": top_label,
    "clip_embeddings": list(clip_embeddings.cpu().numpy())
}
df = pd.DataFrame(data)
df

Unnamed: 0,image,top_label,clip_embeddings
0,screenshots/heatmap_sw_1_2_s_1_0.png,heatmap,"[0.022331731, 0.007818277, -0.16542432, 0.0080..."
1,screenshots/two_by_two_p_4_m_10_sw_0_7_s_1_2.png,genomics visualization,"[0.019216416, 0.03744839, -0.12684523, -0.0058..."
2,screenshots/multiple_view_p_2_m_12_sw_0_7_s_0_...,circular genomic visualization,"[-0.0016217616, -0.051115554, -0.07445832, -0...."
3,screenshots/two_by_two_uneven_w_m_20_sw_0_7_s_...,genomics visualization,"[-0.0015807527, 0.038776863, -0.13082261, 0.05..."
4,screenshots/multiple_view_p_1_m_8_sw_1_2_s_1_0...,circular genomic visualization,"[-0.002367256, -0.0421267, -0.04676023, -0.033..."
...,...,...,...
7291,screenshots/three_composite_v_p_0_m_0_sw_1_2_s...,dot plot,"[0.039080445, 0.014055151, -0.1260775, -0.0130..."
7292,screenshots/two_by_two_uneven_h_p_0_sw_0_7_s_1...,genomics visualization,"[0.025236, 0.035443608, -0.09266704, -0.011160..."
7293,screenshots/three_composite_m_9_sw_0_7_s_1_2.png,genomics visualization,"[0.0100230975, -0.0006275425, -0.14095412, -0...."
7294,screenshots/multi_view_link_p_0_m_11_sw_1_0_s_...,circular genomic visualization,"[-0.026482897, -0.019669285, -0.04968637, -0.0..."


In [13]:
df.to_parquet("embeddings/biomedclip_embeddings.parquet")

# Upload embeddings to Nomic Atlas 

This is a visualization tool for embeddings. 

In [29]:
from nomic import atlas

paper_ids = [name.split("/")[-1].split("_")[0] for name in test_imgs]
dataset = atlas.map_data(
    data=[{"name": name, "paper": paper, "classification": label} for name, paper, label in zip(test_imgs, paper_ids, top_label)], id_field="name", embeddings=np.array(clip_embeddings)
)

[32m2024-04-04 17:40:54.042[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36m_create_project[0m:[36m897[0m - [1mCreating dataset `analytical-ride`[0m
[32m2024-04-04 17:40:54.411[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_data[0m:[36m108[0m - [1mUploading data to Atlas.[0m
1it [00:06,  6.94s/it]
[32m2024-04-04 17:41:01.402[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36m_add_data[0m:[36m1567[0m - [1mUpload succeeded.[0m
[32m2024-04-04 17:41:01.407[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_data[0m:[36m123[0m - [1m`quackmires/analytical-ride`: Data upload succeeded to dataset`[0m
[32m2024-04-04 17:41:02.959[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36mcreate_index[0m:[36m1276[0m - [1mCreated map `analytical-ride` in dataset `quackmires/analytical-ride`: https://atlas.nomic.ai/data/quackmires/analytical-ride/map[0m


In [35]:
paper_ids = [name.split("/")[-1].split("_")[0] for name in names]

In [30]:
embeddings.shape

NameError: name 'embeddings' is not defined