In [1]:
# !pip install pyarrow umap-learn seaborn altair
import os
import pandas as pd
import umap.umap_ as umap
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt

In [2]:
# Locations used throughout the notebook, relative to the working directory:
#   data_dir  - folder holding the embedding parquet files
#   data_repo - root of the actual data repository
data_dir, data_repo = "..", "../../../data"

In [3]:
# Class-name prefixes, one per line. Blank lines are skipped: an empty prefix
# would match every file name through str.startswith.
with open("prefixes.txt", "r") as f:
    prefixes = [line.rstrip() for line in f if line.strip()]

# Lookup from chart class to its group, built from the "class" and "group"
# columns of class_group.csv. df_groups stays indexed by class.
df_groups = pd.read_csv("class_group.csv").set_index("class")
class_to_group = df_groups["group"].to_dict()

In [4]:
# Settings for the UMAP reducer that get_embeddings() is called with.
# NOTE(review): min_dist and spread look unusually large compared to typical
# UMAP settings - confirm they are intentional.
umap_params = dict(
    # output dimensionality and distance metric
    n_components=2,
    metric='euclidean',
    # neighbourhood size and layout tightness
    n_neighbors=120,
    min_dist=5.5,
    spread=6.5,
    # optimisation schedule
    init='spectral',
    n_epochs=200,
    learning_rate=1.0,
    # fixed seed
    random_state=29,
)

reducer = umap.UMAP(**umap_params)

In [None]:
def get_embeddings(parquet_file, reducer):
    """Project the embeddings stored in a parquet file to two dimensions.

    Parameters
    ----------
    parquet_file : str
        File name, resolved relative to the module-level ``data_dir``.
    reducer : object
        Object whose ``fit_transform`` is called on every column of the
        parquet file except ``file_name``.

    Returns
    -------
    pandas.DataFrame
        One row per input row with the columns ``UMAP_1``, ``UMAP_2``,
        ``file_name`` (the text before the first ``.``) and ``class`` (the
        first entry of the module-level ``prefixes`` that the file name
        starts with, or ``None`` when nothing matches).
    """
    df = pd.read_parquet(os.path.join(data_dir, parquet_file))
    features = df.drop('file_name', axis=1)

    # Not named `umap`, so the imported umap module alias is not shadowed.
    projected = reducer.fit_transform(features)
    df_umap = pd.DataFrame(projected, columns=['UMAP_1', 'UMAP_2'])

    # Assign by position: df_umap has a fresh 0..n-1 index, so assigning a
    # Series would align on index labels and misplace (or drop) names when
    # the parquet file carries a different index.
    df_umap['file_name'] = df['file_name'].str.split('.').str[0].to_numpy()

    def first_matching_prefix(name):
        # The order of `prefixes` decides which one wins when several match.
        return next((pre for pre in prefixes if name.startswith(pre)), None)

    df_umap['class'] = df_umap['file_name'].apply(first_matching_prefix)
    return df_umap

In [6]:
# df_umap_image = get_embeddings("image_embeddings.parquet", reducer)
# df_umap_image.to_csv("umap_image.csv")

In [7]:
# df_umap_text = get_embeddings("text_embeddings.parquet", reducer)
# df_umap_text.to_csv("umap_text.csv")

In [8]:
# df_umap_text = get_embeddings("text_0_2_4_embeddings.parquet", reducer)
# df_umap_text.to_csv("umap_text_0_2_4.csv")

In [9]:
# df_umap_text = get_embeddings("text_0_2_4_llm_fs_single_embeddings.parquet", reducer)
# df_umap_text.to_csv("umap_text_0_2_4_llm_fs_single.csv")

In [10]:
# df_umap_spec_freq = get_embeddings("spec_frequency.parquet", reducer)
# df_umap_spec_freq.to_csv("umap_spec_freq.csv")

In [11]:
# df_umap_spec_oh = get_embeddings("spec_onehot.parquet", reducer)
# df_umap_spec_oh.to_csv("umap_spec_oh.csv")

In [12]:
# Load the cached 2-D projections; the first CSV column is used as the index.
# NOTE(review): no cell visible in this notebook writes "umap_text_0_2_2.csv"
# (the commented-out cells write "umap_text.csv" and "umap_text_0_2_4.csv") -
# confirm this is the intended file.
(
    df_umap_image,
    df_umap_text,
    df_umap_text_llm,
    df_umap_spec_freq,
    df_umap_spec_oh,
) = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in (
        "umap_image.csv",
        "umap_text_0_2_2.csv",
        "umap_text_0_2_4_llm_fs_single.csv",
        "umap_spec_freq.csv",
        "umap_spec_oh.csv",
    )
)

In [13]:
# Projections to plot and the title of each one, in display order.
umaps = [df_umap_spec_freq, df_umap_spec_oh, df_umap_text, df_umap_text_llm, df_umap_image]
titles = ["Spec Frequency Embeddings", "Spec One-Hot Embeddings", "Text Embeddings", "Text+LLM Embeddings", "Image Embeddings"]

# Names of the entries in the indexed single-chart / multiple-chart folders,
# cut at ".png".
index_single = [f.split(".png")[0] for f in os.listdir(os.path.join(data_repo, "indexed", "single_chart"))]
index_multiple = [f.split(".png")[0] for f in os.listdir(os.path.join(data_repo, "indexed", "multiple_chart/imgs"))]

# Sets make the per-row membership tests below constant time.
single_ids = set(index_single)
multiple_ids = set(index_multiple)


def indexed_label(name):
    """Return 'single', 'multiple' or None depending on where `name` is indexed."""
    if name in single_ids:
        return 'single'
    if name in multiple_ids:
        return 'multiple'
    return None


# Legend order for the group colouring.
group_order = ['simple', 'linear', 'circular', 'matrix', 'ideogram', 'gene_annotation', 'chromoscope', 'complex']

# Colour and shape scales for the "indexed" charts; identical for every
# chart, so they are defined once outside the loop.
domain = [None, 'single', 'multiple']
range_ = ['lightgray', 'seagreen', 'blue']
range2_ = ['circle', 'square', 'cross']

gs1 = []  # charts coloured by group
gs2 = []  # charts coloured and shaped by indexed status

# The loop variable is deliberately not called `umap`: at notebook top level
# that would rebind the imported umap module alias and break any later
# re-run of the cell that builds the reducer.
for umap_df, title in zip(umaps, titles):
    # Fold the two overview-landing variants into a single class.
    umap_df.loc[umap_df['class'].isin(['overview-landing_cc_0', 'overview-landing_oc']), 'class'] = 'overview-landing'

    umap_df['group'] = umap_df['class'].map(class_to_group)
    umap_df['indexed'] = umap_df['file_name'].apply(indexed_label)

    g1 = alt.Chart(umap_df).mark_point(filled=True).encode(
        x=alt.X('UMAP_1', title='UMAP 1'),
        y=alt.Y('UMAP_2', title='UMAP 2'),
        color=alt.Color('group:N', sort=group_order),
        tooltip=['group:N']
    ).properties(
        width=400,
        height=400,
        title=title,
    )#.interactive()

    gs1.append(g1)

    g2 = alt.Chart(umap_df).mark_point(filled=True).encode(
        x=alt.X('UMAP_1', title='UMAP 1'),
        y=alt.Y('UMAP_2', title='UMAP 2'),
        color=alt.Color('indexed:N').scale(domain=domain, range=range_),
        shape=alt.Shape('indexed:N').scale(domain=domain, range=range2_),
        tooltip=['class']
    ).properties(
        width=400,
        height=400,
        title=title,
    )#.interactive()

    gs2.append(g2)


In [14]:
# Join the group-coloured charts with Altair's `|` operator, in list order.
combined = gs1[0]
for pos in range(1, len(gs1)):
    combined = combined | gs1[pos]

# Last expression of the cell, so the combined chart is displayed.
combined

In [15]:
# Join the indexed-status charts with Altair's `|` operator, in list order.
combined2 = gs2[0]
for pos in range(1, len(gs2)):
    combined2 = combined2 | gs2[pos]

# Last expression of the cell, so the combined chart is displayed.
combined2

In [16]:
# !pip install vl-convert-python
# Write both combined figures to SVG files in the working directory.
# NOTE(review): Altair's documented keyword for Chart.save is `scale_factor`;
# confirm that `scale=10` has the intended effect.
for figure, svg_path in ((combined, 'umap-1.svg'), (combined2, 'umap-2.svg')):
    figure.save(svg_path, scale=10)