## Reading from file

In [2]:
import pandas as pd
import umap
import jscatter

In [3]:
# Read the precomputed embeddings table from disk
df = pd.read_parquet('embeddings/biomedclip_embeddings.parquet')
# Keep only the file name: drop everything up to and including the last '/'
df['image'] = df['image'].map(lambda path: path.rsplit('/', 1)[-1])

def attach_image(s, base_url="https://raw.githubusercontent.com/huyen-nguyen/spec-image-embeddings/main/screenshots/"):
    """Build the URL of a screenshot from its file name.

    Parameters
    ----------
    s : str
        File name of the screenshot, e.g. ``"heatmap_sw_1_2_s_1_0.png"``.
    base_url : str, optional
        Prefix that is prepended to ``s`` verbatim (no separator is added,
        so it should end with ``/``). Defaults to the screenshots folder
        used throughout this notebook.

    Returns
    -------
    str
        ``base_url`` immediately followed by ``s``.
    """
    return base_url + s

# Add a 'url' column holding the full image URL for every file name
df["url"] = df['image'].map(attach_image)

#  --------- Prepare labels ---------
import re
def extract_substring(s):
    """Return the label prefix of a screenshot file name.

    The label is everything before the first parameter marker, i.e. the first
    occurrence of ``_p``, ``_m``, ``_sw`` or ``_s`` that is directly followed
    by ``_<digit>`` (as in ``heatmap_sw_1_2_s_1_0.png`` -> ``heatmap``).

    Requiring the ``_<digit>`` part keeps label words that merely start with
    one of those letters intact (e.g. ``bar_stacked_p_1.png`` yields
    ``bar_stacked`` rather than ``bar``).

    Parameters
    ----------
    s : str
        File name to extract the label from.

    Returns
    -------
    str
        The part of ``s`` before the first parameter marker, or ``s``
        unchanged when no marker is present.
    """
    # Non-capturing group: only the match position is needed
    match = re.search(r'_(?:p|m|sw|s)_\d', s)
    if match:
        return s[:match.start()]
    return s  # No marker found: return the whole string


# Derive the 'Label' column from the file names in the 'image' column
df['Label'] = df['image'].map(extract_substring)

# Show the resulting table
df

Unnamed: 0,image,top_label,clip_embeddings,url,Label
0,heatmap_sw_1_2_s_1_0.png,heatmap,"[0.022331731, 0.007818277, -0.16542432, 0.0080...",https://raw.githubusercontent.com/huyen-nguyen...,heatmap
1,two_by_two_p_4_m_10_sw_0_7_s_1_2.png,genomics visualization,"[0.019216416, 0.03744839, -0.12684523, -0.0058...",https://raw.githubusercontent.com/huyen-nguyen...,two_by_two
2,multiple_view_p_2_m_12_sw_0_7_s_0_7.png,circular genomic visualization,"[-0.0016217616, -0.051115554, -0.07445832, -0....",https://raw.githubusercontent.com/huyen-nguyen...,multiple_view
3,two_by_two_uneven_w_m_20_sw_0_7_s_0_7.png,genomics visualization,"[-0.0015807527, 0.038776863, -0.13082261, 0.05...",https://raw.githubusercontent.com/huyen-nguyen...,two_by_two_uneven_w
4,multiple_view_p_1_m_8_sw_1_2_s_1_0.png,circular genomic visualization,"[-0.002367256, -0.0421267, -0.04676023, -0.033...",https://raw.githubusercontent.com/huyen-nguyen...,multiple_view
...,...,...,...,...,...
7291,three_composite_v_p_0_m_0_sw_1_2_s_2_0.png,dot plot,"[0.039080445, 0.014055151, -0.1260775, -0.0130...",https://raw.githubusercontent.com/huyen-nguyen...,three_composite_v
7292,two_by_two_uneven_h_p_0_sw_0_7_s_1_2.png,genomics visualization,"[0.025236, 0.035443608, -0.09266704, -0.011160...",https://raw.githubusercontent.com/huyen-nguyen...,two_by_two_uneven_h
7293,three_composite_m_9_sw_0_7_s_1_2.png,genomics visualization,"[0.0100230975, -0.0006275425, -0.14095412, -0....",https://raw.githubusercontent.com/huyen-nguyen...,three_composite
7294,multi_view_link_p_0_m_11_sw_1_0_s_0_7.png,circular genomic visualization,"[-0.026482897, -0.019669285, -0.04968637, -0.0...",https://raw.githubusercontent.com/huyen-nguyen...,multi_view_link


## Initialize UMAP

In [4]:
# Collect the per-row embedding vectors stored in the 'clip_embeddings' column
embeddings = list(df['clip_embeddings'])

# UMAP is stochastic; a fixed seed makes the layout reproducible across
# "Restart & Run All". NOTE(review): per the UMAP docs a fixed random_state
# disables parallelism, so fitting may be slower — confirm this is acceptable.
UMAP_RANDOM_STATE = 42

# Create a UMAP instance with custom parameters.
# Reduce dimensionality to 2D for easy visualization.
reducer = umap.UMAP(
    n_neighbors=50,
    n_components=2,
    metric='euclidean',
    min_dist=0.5,
    spread=0.5,
    learning_rate=1.0,
    n_epochs=200,
    init='spectral',
    random_state=UMAP_RANDOM_STATE,
)
umap_embeddings = reducer.fit_transform(embeddings)

# Create a DataFrame with the UMAP embeddings. It reuses df's index so that the
# column assignments below line up row-for-row even when df does not carry a
# default 0..n-1 index (index-aligned assignment would otherwise yield NaNs).
embedding_df = pd.DataFrame(
    umap_embeddings,
    columns=['UMAP_1', 'UMAP_2'],
    index=df.index,
)

df['UMAP_1'] = embedding_df['UMAP_1']
df['UMAP_2'] = embedding_df['UMAP_2']

## Embeddings from Images with CLIP

In [5]:
# ------- Visualize (points colored by 'Label') --------
# API Reference: https://github.com/flekschas/jupyter-scatter
# and also https://github.com/flekschas/regl-scatterplot/#properties

# Appearance options forwarded to the scatter plot as keyword arguments.
# NOTE(review): confirm that "background" is a keyword jscatter.Scatter honors.
config = dict(
    size=7,
    axes_labels=True,
    height=800,
    background="dark",
    legend=True,
    opacity=0.8,
    axes_grid=True,
)

# Tooltip options, also forwarded as keyword arguments
tooltip_options = dict(
    tooltip=True,
    tooltip_preview="url",
    tooltip_preview_type="image",
    tooltip_preview_image_background_color="white",
    tooltip_properties=["color"],
)

scatter = jscatter.Scatter(
    data=df,
    x="UMAP_1",
    y="UMAP_2",
    color_by="Label",
    **config,
    **tooltip_options,
)
scatter.show()

HBox(children=(VBox(children=(Button(button_style='primary', icon='arrows', layout=Layout(width='36px'), style…

In [6]:
# ------- Visualize (points colored by 'top_label') --------
# API Reference: https://github.com/flekschas/jupyter-scatter
# and also https://github.com/flekschas/regl-scatterplot/#properties

# Appearance options forwarded to the scatter plot as keyword arguments.
# NOTE(review): confirm that "background" is a keyword jscatter.Scatter honors.
config = dict(
    size=7,
    axes_labels=True,
    height=800,
    background="dark",
    legend=True,
    opacity=0.8,
    axes_grid=True,
)

# Tooltip options, also forwarded as keyword arguments
tooltip_options = dict(
    tooltip=True,
    tooltip_preview="url",
    tooltip_preview_type="image",
    tooltip_preview_image_background_color="white",
    tooltip_properties=["color"],
)

scatter = jscatter.Scatter(
    data=df,
    x="UMAP_1",
    y="UMAP_2",
    color_by="top_label",
    **config,
    **tooltip_options,
)
scatter.show()

HBox(children=(VBox(children=(Button(button_style='primary', icon='arrows', layout=Layout(width='36px'), style…