## Reading from file

In [29]:
def remove_slash(s):
    """Return ``s`` without its leading slash.

    Only a single leading ``/`` is stripped. A value that does not start with
    ``/`` (including the empty string) is returned untouched, so it never
    loses a meaningful first character.

    Args:
        s: A file name or path string, e.g. ``"/chart.png"``.

    Returns:
        ``s`` with one leading ``/`` removed, or ``s`` itself when it has none.
    """
    return s[1:] if s.startswith('/') else s

In [30]:
import pandas as pd
import umap

import ast  # literal_eval parses a Python-literal string without executing code

# Column names are supplied explicitly, so pandas treats every line of the
# TSV as data rather than looking for a header row.
column_names = ['Filename', 'Embeddings']

# Read the tab-separated embeddings table and the separate label table.
df = pd.read_csv(
    'embeddings/image_lit_embeddings.tsv',
    sep='\t',
    names=column_names,
)
df_label = pd.read_csv('labels.csv')

# Each embedding is stored as text such as "[0.23, 0.35, 0.11]". Parse it into
# a Python list in a new lowercase 'embeddings' column; the raw text column
# 'Embeddings' is kept as-is.
df['embeddings'] = df['Embeddings'].map(ast.literal_eval)

# Normalise the file names with the remove_slash helper.
df['Filename'] = df['Filename'].map(remove_slash)

# Sanity check: show the first parsed embedding followed by its length.
first_embedding = df.loc[0, 'embeddings']
print(first_embedding, len(first_embedding), sep='\n')

[-0.03186718374490738, 0.04076051712036133, 0.027007782831788063, -0.010296647436916828, -0.03782013803720474, -0.011049133725464344, -0.03176485747098923, -0.03279342129826546, 0.046583134680986404, 0.03393515199422836, 0.05943172052502632, -0.07693754136562347, -0.0250400323420763, 0.030652889981865883, 0.061933547258377075, 0.018267914652824402, -0.004612755961716175, -0.028527548536658287, 0.030901411548256874, -0.032142605632543564, -0.008723047561943531, -0.002863694680854678, 0.07334578037261963, 0.013453567400574684, -0.023300861939787865, 0.006036183796823025, 0.020306995138525963, 0.014918248169124126, 0.021185200661420822, -0.009498847648501396, 0.012467663735151291, -0.04465914145112038, -0.037682775408029556, 0.018106741830706596, -0.023325452581048012, -0.0010629609460011125, 0.00416818680241704, -0.00789603404700756, 0.03416997194290161, -0.04217441380023956, -0.011883726343512535, -0.030346350744366646, -0.021483438089489937, -0.044344354420900345, -0.04094111919403076,

## Initialize UMAP

In [31]:
# Collect the parsed embeddings as a plain list (one entry per row of df's
# 'embeddings' column) to hand to UMAP.
embeddings = list(df['embeddings'])

# Project the embeddings down to 2D for easy visualization.
# random_state pins UMAP's stochastic optimisation so that a fresh
# Restart & Run All reproduces the same layout (umap-learn runs
# single-threaded when a seed is set).
# An alternative setting that was also tried:
#   n_neighbors=120, min_dist=5.5, spread=6.5 (other parameters unchanged).
reducer = umap.UMAP(
    n_neighbors=50,
    n_components=2,
    metric='euclidean',
    min_dist=0.5,
    spread=0.5,
    learning_rate=1.0,
    n_epochs=200,
    init='spectral',
    random_state=42,
)
umap_embeddings = reducer.fit_transform(embeddings)

## Embeddings from Images with LiT

In [32]:

#  --------- Prepare labels ---------
import re

def extract_substring(s):
    """Return the part of ``s`` that precedes the first suffix marker.

    A marker is an underscore immediately followed by ``p``, ``m``, ``sw`` or
    ``s``. When ``s`` contains no marker, it is returned unchanged.
    """
    # Splitting once on the marker leaves the leading part at index 0; if the
    # pattern never matches, that first element is the whole string.
    head, *_ = re.split(r'_(p|m|sw|s)', s, maxsplit=1)
    return head

def attach_image(s):
    """Return the raw.githubusercontent.com screenshot URL for file name ``s``.

    ``s`` is appended verbatim to the repository's ``screenshots/`` base URL.
    """
    base_url = "https://raw.githubusercontent.com/huyen-nguyen/spec-image-embeddings/main/screenshots/"
    return base_url + s

# Derive a label for every row from its file name (the text before the first
# suffix marker, see extract_substring).
df['Label'] = df['Filename'].apply(extract_substring)

# --------- Apply Labels ---------
# One row per embedded image: the 2D UMAP coordinates plus the label, the
# preview-image URL, and a join key (the file name up to its first '.').
embedding_df = pd.DataFrame(umap_embeddings, columns=['UMAP_1', 'UMAP_2']).assign(
    Label=df['Label'],
    url=df['Filename'].apply(attach_image),
    identifier=df['Filename'].str.split('.').str[0],
)

# ------------ Merging DataFrames ------------
# Inner join: only points whose identifier also appears in df_label are kept.
merged_df = embedding_df.merge(df_label, on='identifier', how='inner')

# ------- Visualize --------
# API Reference: https://github.com/flekschas/jupyter-scatter
# and also https://github.com/flekschas/regl-scatterplot/#properties
config = dict(
    size=7,
    axes_labels=True,
    height=800,
    background="dark",
    legend=True,
    # aspectRatio=1,  (currently disabled)
    opacity=0.5,
    axes_grid=False,
)

# Plot the merged points with jupyter-scatter.
import jscatter

# Points are coloured by 'alignment_label' (presumably a column contributed by
# labels.csv -- verify), and the tooltip previews the image behind 'url'.
jscatter.Scatter(
    data=merged_df,
    x='UMAP_1',
    y='UMAP_2',
    color_by='alignment_label',
    tooltip=True,
    tooltip_properties=["color"],
    tooltip_preview="url",
    tooltip_preview_type="image",
    tooltip_preview_image_background_color="white",
    **config,
).show()

HBox(children=(VBox(children=(Button(button_style='primary', icon='arrows', layout=Layout(width='36px'), style…