In [81]:
import pandas as pd
import umap
import re
import jscatter
import ast  # ast.literal_eval safely evaluates a string containing a Python literal expression

# Shared UMAP settings: every embedding set is reduced to 2D so it can be
# drawn as a scatter plot.
UMAP_PARAMS = dict(
    n_neighbors=120,
    n_components=2,     # 2D output for plotting
    metric='euclidean',
    min_dist=5.5,
    spread=6.5,
    learning_rate=1.0,
    n_epochs=200,
    init='spectral',
    random_state=29,    # fixed seed; UMAP warns that this forces n_jobs to 1
)

# One reducer instance, created from the settings above and used by the cells below.
reducer = umap.UMAP(**UMAP_PARAMS)

def remove_slash(s):
    """Return ``s`` without its leading ``/``, if it has one.

    Only a single leading slash is removed. A string that does not start with
    ``/`` is returned unchanged, so an already clean name does not lose its
    first character.
    """
    return s[1:] if s.startswith("/") else s

def extract_substring(s):
    """Return the part of ``s`` that precedes the first ``_p``, ``_m``, ``_sw`` or ``_s``.

    If none of those markers occurs, ``s`` is returned unchanged.

    NOTE: the pattern does not require anything specific after the marker, so
    any segment that merely starts with one of those letters also ends the
    prefix, e.g. ``single_cell_epi_simple_p_0`` yields ``single_cell_epi``.
    """
    # maxsplit=1 cuts at the first match only; element 0 is the text before it,
    # or the whole string when there is no match.
    return re.split(r'_(p|m|sw|s)', s, maxsplit=1)[0]

def attach_image(s):
    """Return the screenshot URL for a spec file name.

    The file extension (everything after the last ``.``) is replaced by
    ``.png``; a name without any extension simply gets ``.png`` appended.
    This gives the same result as the former ``s[:-4] + "png"`` for names with
    a four-character extension, without depending on the extension length.
    """
    stem = s.rsplit(".", 1)[0]
    return "https://raw.githubusercontent.com/huyen-nguyen/spec-image-embeddings/main/screenshots/" + stem + ".png"

def attach_image_from_non_spec(s):
    """Return the screenshot URL for ``s``; the name is appended to the base URL unchanged."""
    base_url = "https://raw.githubusercontent.com/huyen-nguyen/spec-image-embeddings/main/screenshots/"
    return "".join((base_url, s))

## Embeddings from Specs, LiT, and CLIP

In [82]:
# --------- Labels ---------
df_label = pd.read_csv('labels.csv')


# --------- Spec embeddings -> UMAP ----------

df_spec = pd.read_csv('embeddings/spec_embeddings.csv')

# 'filename' identifies the row; every other column is treated as an embedding dimension.
filename_spec = df_spec['filename']
embeddings_spec = df_spec.drop(columns='filename')

umap_embeddings_spec = reducer.fit_transform(embeddings_spec)

# --------- LiT image embeddings -> UMAP ----------

# The TSV is read with these two column names supplied explicitly.
column_names = ['Filename', 'Embeddings']
df_lit = pd.read_csv('embeddings/image_lit_embeddings.tsv', sep='\t', names=column_names)

# Each 'Embeddings' cell is a string holding a Python literal; parse it into a real object.
df_lit['embeddings'] = df_lit['Embeddings'].map(ast.literal_eval)
# Normalised file name (first character removed by remove_slash).
df_lit['filename'] = df_lit['Filename'].map(remove_slash)

# Embedding of the first row; not used again in the cells shown here.
first_embedding = df_lit.at[0, 'embeddings']

# UMAP is fed a plain list with one parsed embedding per row.
embeddings = df_lit['embeddings'].tolist()
umap_embeddings_lit = reducer.fit_transform(embeddings)

# --------- CLIP image embeddings -> UMAP ----------

df_clip = pd.read_csv('embeddings/image_clip_embeddings.csv')
df_clip['embeddings'] = df_clip['Embeddings'].map(ast.literal_eval)

# Same first-row peek and list conversion as for LiT (both names are overwritten here).
first_embedding = df_clip.at[0, 'embeddings']
embeddings = df_clip['embeddings'].tolist()
umap_embeddings_clip = reducer.fit_transform(embeddings)

# --------- Prepare labels ---------

# Derive a 'label' column from each frame's file-name column.
# Note that the CLIP frame keeps the capitalised 'Filename' column.
for frame, name_col in ((df_spec, 'filename'), (df_lit, 'filename'), (df_clip, 'Filename')):
    frame['label'] = frame[name_col].map(extract_substring)



  return 1.0 / (1.0 + a * x ** (2 * b))
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  return 1.0 / (1.0 + a * x ** (2 * b))
  return 1.0 / (1.0 + a * x ** (2 * b))


In [83]:
# ----------- Apply labels -------------

def _label_umap_frame(coords, source, name_col, to_url):
    """Build the labelled frame for one embedding source.

    coords   -- 2-column UMAP output for that source
    source   -- frame the embeddings were read from; must have 'label' and ``name_col``
    name_col -- name of the file-name column in ``source``
    to_url   -- function mapping a file name to its screenshot URL

    Returns UMAP_1, UMAP_2, Label, url and identifier (file name up to the
    first '.'), inner-joined with ``df_label`` on 'identifier'.
    """
    names = source[name_col]
    frame = pd.DataFrame(coords, columns=['UMAP_1', 'UMAP_2']).assign(
        Label=source['label'],
        url=names.apply(to_url),
        identifier=names.str.split('.').str[0],
    )
    return frame.merge(df_label, on='identifier', how='inner')


# NOTE: each name is rebound from the raw UMAP output to the labelled frame, so this
# cell expects the raw output as input; re-run the previous cell before re-running this one.
umap_embeddings_spec = _label_umap_frame(umap_embeddings_spec, df_spec, 'filename', attach_image)
umap_embeddings_lit = _label_umap_frame(umap_embeddings_lit, df_lit, 'filename', attach_image_from_non_spec)
umap_embeddings_clip = _label_umap_frame(umap_embeddings_clip, df_clip, 'Filename', attach_image_from_non_spec)


# ----------- Combine -------------

# Left-join LiT and then CLIP onto the spec frame.
merge_keys = ['Label', 'url']
combined = umap_embeddings_spec.merge(umap_embeddings_lit, on=merge_keys, how='left')
combined = combined.merge(umap_embeddings_clip, on=merge_keys, how='left')

# The first merge tags clashing columns with pandas' default suffixes (_x = spec, _y = lit);
# in the second merge nothing clashes any more, so the CLIP columns keep their plain names.
combined = combined.rename(columns={
    "UMAP_1_x": "UMAP_1_spec",
    "UMAP_2_x": "UMAP_2_spec",
    "UMAP_1_y": "UMAP_1_lit",
    "UMAP_2_y": "UMAP_2_lit",
    "UMAP_1": "UMAP_1_clip",
    "UMAP_2": "UMAP_2_clip",
})
combined

Unnamed: 0,UMAP_1_spec,UMAP_2_spec,Label,url,identifier_x,data_label_x,layout_label_x,arrangement_label_x,mark_label_x,alignment_label_x,...,orientation_label_y,UMAP_1_clip,UMAP_2_clip,identifier,data_label,layout_label,arrangement_label,mark_label,alignment_label,orientation_label
0,19.217770,-4.205412,two_by_two,https://raw.githubusercontent.com/huyen-nguyen...,two_by_two_p_4_m_11_sw_1_0_s_1_0,contiguous,linear,multi_arrangements,multi_marks,stack,...,horizontal_orientation,-25.246695,33.593987,two_by_two_p_4_m_11_sw_1_0_s_1_0,contiguous,linear,multi_arrangements,multi_marks,stack,horizontal_orientation
1,59.128807,79.548988,multi_view_link,https://raw.githubusercontent.com/huyen-nguyen...,multi_view_link_p_0_m_12_sw_2_0_s_1_0,contiguous,multi_layouts,multi_arrangements,line_point,stack,...,horizontal_orientation,4.098275,-17.190825,multi_view_link_p_0_m_12_sw_2_0_s_1_0,contiguous,multi_layouts,multi_arrangements,line_point,stack,horizontal_orientation
2,-40.014015,-20.404404,single_cell_epi,https://raw.githubusercontent.com/huyen-nguyen...,single_cell_epi_simple_p_0_m_18_sw_0_7_s_0_7,multi_data_types,linear,vertical,multi_marks,overlay,...,horizontal_orientation,19.557581,-19.629503,single_cell_epi_simple_p_0_m_18_sw_0_7_s_0_7,multi_data_types,linear,vertical,multi_marks,overlay,horizontal_orientation
3,-2.285565,-64.963257,two_horizontal,https://raw.githubusercontent.com/huyen-nguyen...,two_horizontal_m_9_sw_1_2_s_1_2,contiguous,linear,serial,area_line,stack,...,horizontal_orientation,27.260645,33.901482,two_horizontal_m_9_sw_1_2_s_1_2,contiguous,linear,serial,area_line,stack,horizontal_orientation
4,23.618654,50.159332,basic_two_horizontal_orient,https://raw.githubusercontent.com/huyen-nguyen...,basic_two_horizontal_orient_m_6_sw_1_0_s_1_0,contiguous,linear,no_arrangements,area_line,stack,...,vertical_orientation,-7.054295,-5.569937,basic_two_horizontal_orient_m_6_sw_1_0_s_1_0,contiguous,linear,no_arrangements,area_line,stack,vertical_orientation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7291,40.007118,56.111099,two_vertical,https://raw.githubusercontent.com/huyen-nguyen...,two_vertical_m_2_sw_0_5_s_0_5,contiguous,linear,no_arrangements,area_point,stack,...,horizontal_orientation,5.385641,4.252075,two_vertical_m_2_sw_0_5_s_0_5,contiguous,linear,no_arrangements,area_point,stack,horizontal_orientation
7292,82.385056,-2.569696,gene_annotation,https://raw.githubusercontent.com/huyen-nguyen...,gene_annotation_simple_p_3_sw_0_5_s_2_0,sparse,linear,multi_arrangements,rect_triangle,overlay,...,horizontal_orientation,58.950657,14.595228,gene_annotation_simple_p_3_sw_0_5_s_2_0,sparse,linear,multi_arrangements,rect_triangle,overlay,horizontal_orientation
7293,49.994808,74.584877,multi_view_link,https://raw.githubusercontent.com/huyen-nguyen...,multi_view_link_p_0_m_19_sw_1_2_s_1_2,contiguous,multi_layouts,multi_arrangements,bar_point,stack,...,horizontal_orientation,-3.431203,-19.666573,multi_view_link_p_0_m_19_sw_1_2_s_1_2,contiguous,multi_layouts,multi_arrangements,bar_point,stack,horizontal_orientation
7294,-1.072334,-74.933830,two_horizontal,https://raw.githubusercontent.com/huyen-nguyen...,two_horizontal_m_2_sw_2_0_s_1_0,contiguous,linear,serial,area_point,stack,...,horizontal_orientation,-4.118951,7.704493,two_horizontal_m_2_sw_2_0_s_1_0,contiguous,linear,serial,area_point,stack,horizontal_orientation


In [84]:
# ----------- Config -------------

# Keyword arguments shared by all three scatter plots.
# API Reference: https://github.com/flekschas/jupyter-scatter
# and also https://github.com/flekschas/regl-scatterplot/#properties
config = dict(
    color_by='data_label',
    size=5,
    axes_labels=True,
    height=1000,
    background="dark",
    legend=True,
    # aspectRatio=1 is left disabled
    opacity=0.3,
    axes_grid=True,
    tooltip=True,
    tooltip_preview="url",
    tooltip_preview_type="image",
    tooltip_preview_image_background_color="white",
    tooltip_properties=["color"],
    data=combined,
)

# ----------- Plotting the results using jupyter scatter -----------

# One panel per embedding source, in the order spec, LiT, CLIP.
axis_pairs = [
    ("UMAP_1_spec", "UMAP_2_spec"),
    ("UMAP_1_lit", "UMAP_2_lit"),
    ("UMAP_1_clip", "UMAP_2_clip"),
]

jscatter.compose(
    [jscatter.Scatter(x=x_col, y=y_col, **config) for x_col, y_col in axis_pairs],
    sync_selection=True,
    sync_hover=True,
    rows=1,
)

GridBox(children=(HBox(children=(VBox(children=(Button(button_style='primary', icon='arrows', layout=Layout(wi…

In [85]:
# NOTE(review): `umap_embeddings_bio` is not assigned in any cell shown here, so on a
# fresh kernel this line presumably raises NameError unless another cell defines it
# first — TODO confirm where it comes from, or remove this cell.
umap_embeddings_bio

Unnamed: 0,UMAP_1,UMAP_2,url,identifier,data_label,layout_label,arrangement_label,mark_label,alignment_label,orientation_label
0,-23.259916,-14.080674,https://raw.githubusercontent.com/huyen-nguyen...,heatmap_sw_1_2_s_1_0,contiguous,linear,no_arrangements,rect,stack,horizontal_orientation
1,23.363602,50.739025,https://raw.githubusercontent.com/huyen-nguyen...,two_by_two_p_4_m_10_sw_0_7_s_1_2,contiguous,linear,multi_arrangements,area_line,stack,horizontal_orientation
2,46.658939,-22.275047,https://raw.githubusercontent.com/huyen-nguyen...,multiple_view_p_2_m_12_sw_0_7_s_0_7,contiguous,multi_layouts,multi_arrangements,multi_marks,overlay,horizontal_orientation
3,35.035782,15.762679,https://raw.githubusercontent.com/huyen-nguyen...,two_by_two_uneven_w_m_20_sw_0_7_s_0_7,contiguous,linear,multi_arrangements,multi_marks,stack,horizontal_orientation
4,16.445215,-20.306992,https://raw.githubusercontent.com/huyen-nguyen...,multiple_view_p_1_m_8_sw_1_2_s_1_0,contiguous,multi_layouts,multi_arrangements,multi_marks,overlay,horizontal_orientation
...,...,...,...,...,...,...,...,...,...,...
7291,4.570871,26.685818,https://raw.githubusercontent.com/huyen-nguyen...,three_composite_v_p_0_m_0_sw_1_2_s_2_0,contiguous,linear,multi_arrangements,line_point,stack,horizontal_orientation
7292,21.749504,22.989798,https://raw.githubusercontent.com/huyen-nguyen...,two_by_two_uneven_h_p_0_sw_0_7_s_1_2,contiguous,linear,multi_arrangements,multi_marks,stack,horizontal_orientation
7293,0.739522,22.906290,https://raw.githubusercontent.com/huyen-nguyen...,three_composite_m_9_sw_0_7_s_1_2,contiguous,linear,multi_arrangements,multi_marks,stack,horizontal_orientation
7294,-1.398568,-15.254950,https://raw.githubusercontent.com/huyen-nguyen...,multi_view_link_p_0_m_11_sw_1_0_s_0_7,contiguous,multi_layouts,multi_arrangements,multi_marks,stack,horizontal_orientation


# Comparison 

In [89]:
import pandas as pd
from cev.widgets import Embedding, EmbeddingComparisonWidget


def _as_cev_frame(frame):
    """Return a renamed copy of ``frame`` for the comparison widget.

    UMAP_1 / UMAP_2 become x / y, and 'arrangement_label' becomes a categorical
    'label' column. The input frame is not modified.
    NOTE(review): presumably these are the column names Embedding.from_df
    expects — confirm against the cev documentation.
    """
    renamed = frame.rename(columns={'UMAP_1': 'x', 'UMAP_2': 'y', 'arrangement_label': 'label'})
    renamed['label'] = renamed['label'].astype('category')
    return renamed


spec_cmp = _as_cev_frame(umap_embeddings_spec)
clip_cmp = _as_cev_frame(umap_embeddings_clip)
lit_cmp = _as_cev_frame(umap_embeddings_lit)

# Left panel = LiT, right panel = CLIP. The first title used to read
# "Embeddings from Spec" although the left embedding is lit_cmp.
# To compare against the spec embedding instead, pass spec_cmp as
# left_embedding and change the first title accordingly.
widget = EmbeddingComparisonWidget(
    left_embedding=Embedding.from_df(lit_cmp),
    right_embedding=Embedding.from_df(clip_cmp),
    titles=["Embeddings from Images with LiT", "Embeddings from Images with CLIP"],
    metric="confusion",  # alternatives noted by the author: 'neighborhood', 'abundance'
    selection="synced",
    auto_zoom=True,
    row_height=500,
)
widget

EmbeddingComparisonWidget(children=(VBox(children=(HBox(children=(WidthOptimizer(), Dropdown(description='Metr…