In [200]:
import pandas as pd
import umap

# Initialize UMAP. Reduce dimensionality to 2D for easy visualization.
#
# UMAP is a stochastic algorithm and this instance additionally starts from a
# random layout (init='random'), so without a fixed seed every run of the
# notebook produces a different embedding. random_state pins the result so that
# "Restart Kernel -> Run All" reproduces the same plot.
# NOTE(review): per the umap-learn docs, fixing the seed trades away some
# multi-threaded speed — drop random_state if raw speed matters more.
reducer = umap.UMAP(
    n_neighbors=120,      # neighbours considered per point when building the graph
    n_components=2,       # dimensionality of the output embedding
    metric='euclidean',   # distance used in the input space
    min_dist=5.5,         # kept below spread, as UMAP requires min_dist <= spread
    spread=6.0,
    learning_rate=1.0,
    n_epochs=200,
    init='random',
    random_state=42,      # fixed seed for a reproducible layout
)

## Embeddings from Specs

In [201]:

# --------- Load Spec Data and Apply UMAP ----------

# Read the spec embeddings and the label table from their CSV files
df = pd.read_csv('embeddings/spec_embeddings_no_track_no_view.csv')
df_label = pd.read_csv('labels.csv')

# 'filename' identifies each row and is not an embedding dimension:
# keep it aside and treat every remaining column as part of the embedding
filename_spec = df['filename']
embeddings_spec = df.drop(columns=['filename'])

# Fit the reducer on the embeddings and project them in one step
umap_embeddings_spec = reducer.fit_transform(embeddings_spec)


# --------- Prepare labels ---------

import re
def extract_substring(s):
    """Return the prefix of ``s`` that precedes the first suffix marker.

    A marker is an underscore immediately followed by ``p``, ``m``, ``sw`` or
    ``s``. Only the position of the first such marker matters, so it is also
    found when those letters begin a longer word. When ``s`` contains no
    marker it is returned unchanged.
    """
    marker = re.search(r'_(p|m|sw|s)', s)
    return s if marker is None else s[:marker.start()]

def attach_image(s):
    """Build the screenshot URL for a spec file name.

    The file extension (everything after the last dot) is replaced by ``png``
    and the result is appended to the screenshots folder URL. A name without
    any dot simply gets ``.png`` appended.

    Previously the last four characters were chopped off unconditionally,
    which only produced a valid URL for names ending in a four-letter
    extension such as ``.json``.

    Args:
        s: File name of the spec, e.g. ``"example.json"``.

    Returns:
        The URL string of the matching ``.png`` screenshot.
    """
    stem, dot, _extension = s.rpartition('.')
    if not dot:
        # rpartition puts the whole string in the last slot when there is no dot
        stem = s
    return "https://raw.githubusercontent.com/huyen-nguyen/spec-image-embeddings/main/screenshots/" + stem + ".png"

def attach_image_from_non_spec(s):
    """Return the screenshot URL for ``s``, which is appended to the folder URL as-is."""
    screenshots_url = "https://raw.githubusercontent.com/huyen-nguyen/spec-image-embeddings/main/screenshots/"
    return screenshots_url + s

# Derive the seed name of every row from its file name
df['seed'] = df['filename'].map(extract_substring)

# ----------- Apply labels -------------

# Wrap the 2-D projection in a DataFrame and attach the per-file metadata:
#   seed       - file name truncated by extract_substring
#   identifier - file name up to its first dot (join key for the labels)
#   url        - screenshot location built by attach_image
umap_embeddings_spec = pd.DataFrame(
    umap_embeddings_spec, columns=['UMAP_1', 'UMAP_2']
).assign(
    seed=df['seed'],
    identifier=df['filename'].str.split('.').str[0],
    url=df['filename'].map(attach_image),
)

# ------------ Merging DataFrames ------------
# Inner join: rows whose identifier has no entry in the label table are dropped
merged_df = umap_embeddings_spec.merge(df_label, on='identifier', how='inner')

# merged_df contains UMAP_1, UMAP_2, seed, identifier, url plus the label
# columns supplied by labels.csv (previously noted as data_label, layout_label,
# arrangement_label, mark_label; the plot colors by 'alignment_label' —
# TODO confirm the exact column set against labels.csv)
# To inspect the first few rows, run: merged_df.head()

# ----------- Config -------------

# API Reference: https://github.com/flekschas/jupyter-scatter
# and also https://github.com/flekschas/regl-scatterplot/#properties
# Keyword arguments shared by the scatter plot, forwarded via **config
config = dict(
    size=7,
    axes_labels=True,
    height=800,
    background="dark",
    legend=True,
    # aspectRatio=1,  (currently disabled)
    opacity=0.5,
    axes_grid=True,
)

# ----------- Plotting the results using jupyter scatter -----------
import jscatter

# Interactive scatter of the 2-D projection; the widget returned by .show()
# is the cell's output.
# NOTE(review): 'alignment_label' must be a column of merged_df (presumably
# supplied by labels.csv) — confirm.
jscatter.Scatter(
    data=merged_df,
    x='UMAP_1',
    y='UMAP_2',
    color_by='alignment_label',
    **config,
    # Tooltip: show the image referenced by the 'url' column on a white
    # background, together with the color and size properties
    tooltip=True,
    tooltip_preview="url",
    tooltip_preview_type="image",
    tooltip_preview_image_background_color="white",
    tooltip_properties=["color", "size"],
    tooltip_size="medium",
).show()

  return 1.0 / (1.0 + a * x ** (2 * b))


HBox(children=(VBox(children=(Button(button_style='primary', icon='arrows', layout=Layout(width='36px'), style…