## Reading from file

In [62]:
import pandas as pd
import umap

# Read the CLIP image embeddings and the per-image label table from CSV.
df = pd.read_csv('embeddings/image_clip_embeddings.csv')
df_label = pd.read_csv('labels.csv')

# Each cell of the 'Embeddings' column is expected to hold a vector serialized
# as text, e.g. "[0.23, 0.35, 0.11]". ast.literal_eval parses such a Python
# literal without executing arbitrary code.
import ast

df['embeddings'] = df['Embeddings'].apply(ast.literal_eval)

# Keep the parsed vector of the row labelled 0 around for quick inspection,
# e.g. len(first_embedding) gives the embedding dimensionality.
first_embedding = df.at[0, 'embeddings']

## Initialize UMAP and project the embeddings

In [65]:
# Collect the parsed embedding vectors from the 'embeddings' column of `df`
# into a plain list, one entry per row, which is what fit_transform receives.
embeddings = df['embeddings'].tolist()

# Reduce the embeddings to 2-D for visualization.
# UMAP is stochastic: without a fixed seed every run yields a different layout,
# so the plot cannot be reproduced after a kernel restart. Pinning
# `random_state` makes the projection repeatable (UMAP's documentation notes
# this comes at the cost of running single-threaded).
# An earlier experiment used n_neighbors=120, min_dist=5.5, spread=6.5 with the
# remaining parameters unchanged.
reducer = umap.UMAP(
    n_neighbors=50,
    n_components=2,
    metric='euclidean',
    min_dist=0.5,
    spread=0.5,
    learning_rate=1.0,
    n_epochs=200,
    init='spectral',
    random_state=42,
)

# Fit the reducer and obtain the low-dimensional coordinates, one row per embedding.
umap_embeddings = reducer.fit_transform(embeddings)

## Embeddings from Images with CLIP

In [66]:

#  --------- Prepare labels ---------
import re

def extract_substring(s):
    """Return the part of `s` that precedes the first `_p`, `_m`, `_sw` or `_s`.

    If none of those markers occurs in `s`, the string is returned unchanged.
    """
    # Splitting once on the marker leaves the wanted prefix as the first piece;
    # when the marker is absent, the single piece is the whole string.
    pieces = re.split(r'_(p|m|sw|s)', s, maxsplit=1)
    return pieces[0]

def attach_image(s):
    """Return the raw GitHub URL of a screenshot by prefixing `s` with the screenshots folder URL."""
    screenshots_root = "https://raw.githubusercontent.com/huyen-nguyen/spec-image-embeddings/main/screenshots/"
    return screenshots_root + s

# Derive a label for every row by trimming its filename with extract_substring.
df['Label'] = df['Filename'].apply(extract_substring)

# --------- Apply Labels ---------
# Build the plotting table: the two UMAP coordinates plus, per row of `df`,
# its label, the URL produced by attach_image, and the filename text before
# the first '.'. The added columns are aligned with the UMAP rows by index.
embedding_df = pd.DataFrame(umap_embeddings, columns=['UMAP_1', 'UMAP_2']).assign(
    Label=df['Label'],
    url=df['Filename'].apply(attach_image),
    identifier=df['Filename'].str.split('.').str[0],
)

# ------------ Merging DataFrames ------------
# Inner join on 'identifier': rows without a counterpart in df_label are dropped.
merged_df = embedding_df.merge(df_label, on='identifier', how='inner')

# ------- Visualize --------
# Interactive scatter plot of the 2-D UMAP projection using jupyter-scatter.
# API Reference: https://github.com/flekschas/jupyter-scatter
# and also https://github.com/flekschas/regl-scatterplot/#properties
import jscatter

# Appearance options forwarded to jscatter.Scatter as keyword arguments.
# NOTE(review): jupyter-scatter appears to document `background_color` as the
# constructor keyword — confirm that "background" is actually honoured.
config = dict(
    size=7,
    axes_labels=True,
    height=800,
    background="dark",
    legend=True,
    opacity=0.5,
    axes_grid=False,
)

# Points are colored by the 'alignment_label' column of merged_df; the tooltip
# previews the image referenced by each row's 'url' column.
jscatter.Scatter(
    data=merged_df,
    x='UMAP_1',
    y='UMAP_2',
    color_by='alignment_label',
    tooltip=True,
    tooltip_preview="url",
    tooltip_preview_type="image",
    tooltip_preview_image_background_color="white",
    tooltip_properties=["color"],
    tooltip_size="medium",
    **config,
).show()

HBox(children=(VBox(children=(Button(button_style='primary', icon='arrows', layout=Layout(width='36px'), style…