In [1]:
import pandas as pd
import umap
import re
import jscatter
# Assuming embeddings are in the second column and are stored as strings
# Example of embedding: "[0.23, 0.35, 0.11]"
import ast  # ast.literal_eval safely evaluates a string containing a Python literal expression

# Initialize UMAP: reduce dimensionality to 2D for easy visualization.
# This single instance is reused for every embedding source in the cells below.
reducer = umap.UMAP(
    n_neighbors=50,
    n_components=2,
    metric='euclidean',
    min_dist=0.5,
    spread=0.5,
    learning_rate=1.0,
    n_epochs=200,
    init='spectral',
    # Fixed seed so that Restart & Run All reproduces the same layouts.
    # NOTE(review): per the UMAP docs a fixed seed trades some multi-threaded
    # speed for determinism -- confirm that is acceptable for this data size.
    random_state=42,
)

def remove_slash(s):
    """Return ``s`` without its first character.

    The cut is unconditional: the first character is dropped whether or not
    it is actually a slash, and an empty string stays empty.
    """
    remainder = s[1:]
    return remainder

def extract_substring(s):
    """Return the part of ``s`` that precedes the first ``_p``, ``_m``, ``_sw`` or ``_s``.

    When none of those markers occurs, ``s`` is returned unchanged.
    """
    # Splitting once at the first marker leaves the prefix in slot 0; with no
    # marker present the split yields the original string on its own.
    head = re.split(r'_(p|m|sw|s)', s, maxsplit=1)[0]
    return head

def attach_image(s):
    """Build the screenshot URL for a spec filename.

    The last four characters of ``s`` are replaced by ``png`` and the result
    is appended to the screenshots base URL.
    """
    base_url = "https://raw.githubusercontent.com/huyen-nguyen/spec-image-embeddings/main/screenshots/"
    # Everything except the trailing four characters (e.g. "foo.json" -> "foo.").
    kept = s[:-4]
    return base_url + kept + "png"

def attach_image_from_non_spec(s):
    """Build the screenshot URL for ``s`` by appending it, unchanged, to the base URL."""
    base_url = "https://raw.githubusercontent.com/huyen-nguyen/spec-image-embeddings/main/screenshots/"
    return "".join((base_url, s))

## Embeddings from Specs, LiT, and CLIP

In [5]:
# --------- Load Spec Data and Apply UMAP ----------

# Spec embeddings: a 'filename' column plus one column per embedding dimension.
df_spec = pd.read_csv('embeddings/spec_embeddings.csv')

filename_spec = df_spec['filename']

# Every column except 'filename' is treated as an embedding dimension.
embeddings_spec = df_spec.drop('filename', axis=1)

umap_embeddings_spec = reducer.fit_transform(embeddings_spec)

# --------- Load LiT Data and Apply UMAP ----------

# Column names are supplied explicitly via `names=` instead of being read from the file.
column_names = ['Filename', 'Embeddings']
df_lit = pd.read_csv('embeddings/image_lit_embeddings.tsv', delimiter='\t', names=column_names)

# Embeddings are stored as strings (e.g. "[0.23, 0.35, 0.11]"); parse them into Python lists.
df_lit['embeddings'] = df_lit['Embeddings'].apply(ast.literal_eval)
# Drop the first character of each LiT filename (see remove_slash).
df_lit['filename'] = df_lit['Filename'].apply(remove_slash)

embeddings = list(df_lit['embeddings'])
umap_embeddings_lit = reducer.fit_transform(embeddings)

# --------- Load CLIP Data and Apply UMAP ----------

df_clip = pd.read_csv('embeddings/image_clip_embeddings.csv')

# Same string-to-list parsing as for LiT.
df_clip['embeddings'] = df_clip['Embeddings'].apply(ast.literal_eval)

embeddings = list(df_clip['embeddings'])
umap_embeddings_clip = reducer.fit_transform(embeddings)

# --------- Prepare labels ---------

# The label is the filename prefix before the first _p/_m/_sw/_s marker.
df_spec['label'] = df_spec['filename'].apply(extract_substring)
df_lit['label'] = df_lit['filename'].apply(extract_substring)
df_clip['label'] = df_clip['Filename'].apply(extract_substring)

# ----------- Apply labels -------------

# Wrap each 2D projection in a DataFrame with named axes.
umap_embeddings_spec = pd.DataFrame(umap_embeddings_spec, columns=['UMAP_1', 'UMAP_2'])
umap_embeddings_lit = pd.DataFrame(umap_embeddings_lit, columns=['UMAP_1', 'UMAP_2'])
umap_embeddings_clip = pd.DataFrame(umap_embeddings_clip, columns=['UMAP_1', 'UMAP_2'])

# Attach the labels (rows line up by position with the source frames).
umap_embeddings_spec['Label'] = df_spec['label']
umap_embeddings_lit['Label'] = df_lit['label']
umap_embeddings_clip['Label'] = df_clip['label']

# Attach the screenshot URL; spec filenames need their extension swapped for "png".
umap_embeddings_spec["url"] = df_spec['filename'].apply(attach_image)
umap_embeddings_lit["url"] = df_lit['filename'].apply(attach_image_from_non_spec)
umap_embeddings_clip["url"] = df_clip['Filename'].apply(attach_image_from_non_spec)

# ----------- Combine -------------


def _tag_axes(frame, source):
    """Return a copy of `frame` whose UMAP_1/UMAP_2 columns carry a `_<source>` suffix."""
    return frame.rename(columns={
        "UMAP_1": "UMAP_1_" + source,
        "UMAP_2": "UMAP_2_" + source,
    })


# Tag the axes per source *before* merging so the final column names do not
# depend on pandas' implicit _x/_y merge suffixes. The per-source frames
# themselves keep their plain UMAP_1/UMAP_2 columns.
# Left joins keep every spec row; rows without a LiT/CLIP match get NaN coordinates.
combined = (
    _tag_axes(umap_embeddings_spec, "spec")
    .merge(_tag_axes(umap_embeddings_lit, "lit"), on=['Label', 'url'], how='left')
    .merge(_tag_axes(umap_embeddings_clip, "clip"), on=['Label', 'url'], how='left')
)

# ----------- Config -------------

# API Reference: https://github.com/flekschas/jupyter-scatter
# and also https://github.com/flekschas/regl-scatterplot/#properties
# Settings shared by all three scatter plots; only the x/y columns differ.
config = {
    "color_by": 'Label',
    "size": 7,
    "axes_labels": True,
    "height": 1000,
    "background": "dark",
    "legend": True,
    "opacity": 0.8,
    "axes_grid": True,
    "tooltip": True,
    "tooltip_preview": "url",
    "tooltip_preview_type": "image",
    "tooltip_preview_image_background_color": "white",
    "tooltip_properties": ["color"],
    "data": combined
}

# ----------- Plotting the results using jupyter scatter -----------
# One linked panel per embedding source (spec, LiT, CLIP), side by side.
# This must stay the last expression of the cell so the widget is displayed.
jscatter.compose(
    [
        jscatter.Scatter(
            x="UMAP_1_spec", y="UMAP_2_spec", **config,
        ),
        jscatter.Scatter(
            x="UMAP_1_lit", y="UMAP_2_lit", **config,
        ),
        jscatter.Scatter(
            x="UMAP_1_clip", y="UMAP_2_clip", **config
        )
    ],
    sync_selection=True,
    sync_hover=True,
    rows=1
)

GridBox(children=(HBox(children=(VBox(children=(Button(button_style='primary', icon='arrows', layout=Layout(wi…

In [None]:
# Number of rows in the LiT projection table.
umap_embeddings_lit.shape[0]