In [2]:
import pandas as pd
import umap
import re

# Shared UMAP reducer: projects the embeddings down to 2 components so the
# result can be drawn as a scatter plot.
#
# An earlier configuration used n_neighbors=50, min_dist=0.5 and spread=0.5,
# with every other parameter identical to the values below.
#
# random_state is fixed so repeated runs give the same layout; per the warning
# UMAP emits, doing so forces n_jobs to 1 (no parallelism).
umap_params = dict(
    n_components=2,
    n_neighbors=120,
    metric='euclidean',
    init='spectral',
    min_dist=5.5,
    spread=6.5,
    n_epochs=200,
    learning_rate=1.0,
    random_state=29,
)

reducer = umap.UMAP(**umap_params)

def extract_substring(s):
    """Return the part of ``s`` that precedes the first ``_p``, ``_m``, ``_sw`` or ``_s``.

    When none of those markers occurs in ``s``, the string is returned unchanged.
    """
    hit = re.search(r'_(p|m|sw|s)', s)
    # Cut just before the underscore that starts the first marker found.
    return s if hit is None else s[:hit.start()]

def attach_image(s):
    """Build the raw-GitHub URL of the PNG screenshot that matches a spec filename.

    Whatever follows the last '.' in ``s`` is replaced with ``png``; if ``s``
    contains no '.', ``.png`` is appended.

    Parameters
    ----------
    s : str
        Spec filename, e.g. ``"chart.json"``.

    Returns
    -------
    str
        URL of ``<stem>.png`` inside the repository's ``screenshots/`` folder.
    """
    base_url = "https://raw.githubusercontent.com/huyen-nguyen/spec-image-embeddings/main/screenshots/"
    # Split on the last dot instead of slicing off a fixed 4 characters, so the
    # URL is correct for extensions of any length (the old slice only worked
    # for 4-character extensions such as "json").
    stem = s.rsplit('.', 1)[0]
    return base_url + stem + ".png"

def attach_image_from_non_spec(s):
    """Prefix ``s`` with the URL of the repository's ``screenshots/`` folder.

    ``s`` is appended as-is, so it should already be the screenshot's filename.
    """
    screenshots_root = "https://raw.githubusercontent.com/huyen-nguyen/spec-image-embeddings/main/screenshots/"
    return screenshots_root + s


## Embeddings from Specs

In [3]:

# --------- Load Spec Data and Apply UMAP ----------

# Read the spec embeddings table from disk.
df = pd.read_csv('embeddings/spec_embeddings.csv')

# 'filename' names each row; it is set aside and not fed to UMAP.
filename_spec = df['filename']

# Every remaining column is treated as part of the embedding vector.
embeddings_spec = df.drop(columns='filename')

# Fit the shared reducer and project the embeddings to two components.
umap_coords_spec = reducer.fit_transform(embeddings_spec)

# ----------- Apply labels -------------

# Wrap the coordinates in a DataFrame and attach, per row:
#   identifier - the filename up to its first '.'
#   url        - the screenshot URL derived from the filename
# (A 'seed' column via extract_substring was used previously and is disabled.)
umap_embeddings_spec = pd.DataFrame(
    umap_coords_spec, columns=['UMAP_1', 'UMAP_2']
).assign(
    identifier=filename_spec.str.split('.').str[0],
    url=filename_spec.apply(attach_image),
)

  return 1.0 / (1.0 + a * x ** (2 * b))
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [8]:
# ------------ Merging DataFrames ------------
# Read the label table; it is joined to the UMAP projection on 'identifier'.
df_label = pd.read_csv('labels_uniform.csv')

# Inner join: only identifiers present in both frames are kept.
merged_df = umap_embeddings_spec.merge(df_label, on='identifier', how='inner')

# merged_df holds the projection columns (UMAP_1, UMAP_2, identifier, url)
# plus the label columns from labels_uniform.csv. The original note listed
# data_label, layout_label, arrangement_label and mark_label, while the plot
# colours by 'orientation_label' -- TODO confirm the exact set against the CSV.

In [12]:
# ----------- Config -------------

# API Reference: https://github.com/flekschas/jupyter-scatter
# and also https://github.com/flekschas/regl-scatterplot/#properties
config = dict(
    size=7,
    axes_labels=True,
    height=800,
    background="dark",
    legend=True,
    # aspectRatio=1,
    opacity=0.5,
    axes_grid=False,
)

# One palette covering the category names of every label type, so the same
# mapping can be reused whichever label column is chosen for colouring.
label_colors = {
    # data
    "sparse": '#009e73',              # green
    "contiguous": '#56b4e9',          # blue
    "multi_data_types": '#e69f00',    # orange

    # alignment
    "stack": '#009e73',               # green
    "overlay": '#56b4e9',             # blue
    "multi_alignments": '#e69f00',    # orange

    # arrangement
    "vertical": '#009e73',            # green
    "horizontal": '#56b4e9',          # blue
    "parallel": '#cc79a7',            # pink
    "serial": '#9467bd',              # purple
    "multi_arrangements": '#e69f00',  # orange
    "no_arrangements": "#cccccc",     # gray

    # layout
    "linear": '#009e73',              # green
    "circular": '#56b4e9',            # blue
    "multi_layouts": '#e69f00',       # orange

    # mark
    "area": '#56b4e9',
    "area_bar": '#aec7e8',
    "area_brush": '#ff7f0e',
    "area_line": '#ffbb78',
    "area_point": '#2ca02c',
    "area_rect": '#98df8a',
    "bar": '#d62728',
    "bar_line": '#ff9896',
    "bar_point": '#bcbd22',
    "bar_rect": '#dbdb8d',
    "brush_line": '#8c564b',
    "brush_point": '#c49c94',
    "line": '#e377c2',
    "line_point": '#f7b6d2',
    "line_rect": '#7f7f7f',
    "link_rect": '#c7c7c7',
    "multi_marks": '#e69f00',
    "point": '#009e73',
    "point_rect": '#1f77b4',
    "rect": '#9467bd',
    "rect_triangle": '#c5b0d5',

    # orientation
    "vertical_orientation": '#009e73',    # green
    "horizontal_orientation": '#56b4e9',  # blue
    "multi_orientations": '#e69f00',      # orange
}

# ----------- Plotting the results using jupyter scatter -----------
import jscatter

# Scatter of the 2-D projection coloured by orientation label; the tooltip
# previews each point's screenshot via its 'url' column.
scatter = jscatter.Scatter(
    data=merged_df,
    x='UMAP_1',
    y='UMAP_2',
    color_by='orientation_label',
    color_map=label_colors,
    tooltip=True,
    tooltip_size="medium",
    tooltip_properties=["color", "size"],
    tooltip_preview="url",
    tooltip_preview_type="image",
    tooltip_preview_image_background_color="white",
    **config,
)
scatter.show()

HBox(children=(VBox(children=(Button(button_style='primary', icon='arrows', layout=Layout(width='36px'), style…