In [1]:
import ast
import base64
import os

import pandas as pd
from plotly.subplots import make_subplots
from dotenv import load_dotenv, find_dotenv; _ = load_dotenv(find_dotenv())

In [2]:
# Name of the topic set being analysed; it selects the data/ and output/ sub-folders.
topic_type = 'foods'
# File name derived from the topic set name.
topic_file = topic_type + '.txt'

In [3]:
# Load the texts and their pre-computed embeddings for the selected topic set.
# Every column is read as a string; the first CSV column becomes the index.
df = pd.read_csv(f'data/{topic_type}/data.csv', index_col=0, dtype=str)

# The embedding columns hold stringified Python lists. ast.literal_eval parses
# literals only, so a malformed or malicious cell cannot execute code the way
# eval() would.
for embedding_column in ('embedding_2', 'embedding_3_small', 'embedding_3_large'):
    df[embedding_column] = df[embedding_column].map(ast.literal_eval)

print(len(df))
df.head(1)

450


Unnamed: 0,topic,lang,format,length,text,embedding_2,embedding_3_small,embedding_3_large
0,じゃがいも,en,prose,long_sum,Potatoes are a vegetable native to South Ameri...,"[0.017380426068302725, -0.018543428196079303, ...","[-0.006245624336255771, -0.04292074686214286, ...","[0.012517579007594287, -0.04335306299595459, -..."


In [4]:
import umap
import plotly.express as px

# Keyword arguments forwarded to umap.UMAP for every projection.
params = {
    'n_neighbors': 400,
    'min_dist': 0.2,
    'metric': 'cosine',  # alternative that was tried: 'euclidean'
    # UMAP is stochastic; a fixed seed makes the layouts reproducible across
    # Restart & Run All.
    'random_state': 42,
}

In [5]:
# Pull each embedding column out of the frame as a plain Python list,
# one list per embedding model.
embeddings_2, embeddings_3_small, embeddings_3_large = (
    df[column_name].tolist()
    for column_name in ('embedding_2', 'embedding_3_small', 'embedding_3_large')
)

In [6]:
# One UMAP reducer, re-fitted in turn on each model's embeddings to obtain
# a separate 2-D projection per embedding model.
umap_model = umap.UMAP(n_components=2, **params)
embeddings_2_2d, embeddings_3s_2d, embeddings_3l_2d = (
    umap_model.fit_transform(vectors)
    for vectors in (embeddings_2, embeddings_3_small, embeddings_3_large)
)

In [7]:
# Inspect the container type returned by fit_transform for the projected points.
type(embeddings_2_2d)

numpy.ndarray

In [8]:
# Attach the 2-D UMAP coordinates to the frame, one x/y column pair per
# embedding model. Whole columns are assigned positionally (row i of the
# projection belongs to row i of df), which avoids a per-cell .loc write,
# stays correct if index labels repeat, and does not shadow the builtin `id`.
for suffix, coords in (
    ('2', embeddings_2_2d),
    ('3s', embeddings_3s_2d),
    ('3l', embeddings_3l_2d),
):
    df[f'x_{suffix}'] = coords[:, 0]
    df[f'y_{suffix}'] = coords[:, 1]

# Persist the frame, including the projections, for later reuse.
df.to_csv('all_data.csv')

In [52]:
def save_combined(embedding_models, colors):
    """Draw every (color, embedding model) scatter in one grid figure and save it.

    The grid has one row per entry of ``colors`` and one column per entry of
    ``embedding_models``, preceded by one empty leading column. The function
    reads the module-level ``df`` and ``topic_type``.

    Args:
        embedding_models: Sequence of short model keys ('2', '3s', '3l'); each
            key selects the ``x_<key>`` / ``y_<key>`` columns of ``df``.
        colors: Sequence of ``df`` column names used to color the points.

    Raises:
        KeyError: If an entry of ``embedding_models`` is not a known key.

    Side effects:
        Writes ``output/{topic_type}/combined_scatter.html`` and
        ``output/{topic_type}/combined_scatter.png``, creating the folder if
        needed.
    """
    emb_model_dict = {
        '2': 'text-embedding-ada-002',
        '3s': 'text-embedding-3-small',
        '3l': 'text-embedding-3-large'
    }
    n_models = len(embedding_models)

    # Only the first row carries the model names as column titles; the leading
    # '' belongs to the extra (empty) first column.
    subplot_titles = [''] + [
        emb_model_dict[embedding_model] if row_idx == 0 else ''
        for row_idx in range(len(colors))
        for embedding_model in embedding_models
    ]
    combined_fig = make_subplots(
        rows=len(colors), cols=n_models + 1,  # one extra column on the left
        subplot_titles=subplot_titles,
    )

    for row, color in enumerate(colors, start=1):
        for offset, embedding_model in enumerate(embedding_models):
            col = offset + 2  # shift right past the empty first column
            fig = px.scatter(
                df, x=f'x_{embedding_model}', y=f'y_{embedding_model}',
                color=color,
                hover_data=['topic', 'lang', 'format', 'length'],
                opacity=0.5,
            )
            for trace in fig.data:
                combined_fig.add_trace(trace, row=row, col=col)
            # Show the legend only for the right-most subplot of each row so
            # the same categories are not listed once per model.
            combined_fig.update_traces(
                showlegend=(offset == n_models - 1), row=row, col=col
            )

    combined_fig.update_layout(height=300 * len(colors))  # scale height with the row count
    # 'showaxes' is not a layout property; axes are hidden via the axis objects.
    combined_fig.update_xaxes(visible=False)
    combined_fig.update_yaxes(visible=False)

    out_dir = f'output/{topic_type}'
    os.makedirs(out_dir, exist_ok=True)
    combined_fig.write_html(f'{out_dir}/combined_scatter.html')
    combined_fig.write_image(f'{out_dir}/combined_scatter.png')

In [64]:
def save(embedding_model, color):
    """Save one UMAP scatter plot for a single embedding model and color column.

    Reads the module-level ``df`` and ``topic_type``.

    Args:
        embedding_model: Short model key, one of '2', '3s' or '3l'; selects the
            ``x_<key>`` / ``y_<key>`` columns of ``df``.
        color: Name of the ``df`` column used to color the points.

    Raises:
        ValueError: If ``embedding_model`` is not one of the known keys.

    Side effects:
        Writes ``output/{topic_type}/{embedding_model}_{color}_scatter.html``
        and the matching ``.png``, creating the folder if needed.
    """
    known_models = ('2', '3s', '3l')
    if embedding_model not in known_models:
        # Fail here with a clear message instead of printing 'Error' and
        # letting the plot call fail on empty column names.
        raise ValueError(
            f'Unknown embedding_model {embedding_model!r}; expected one of {known_models}'
        )

    fig = px.scatter(
        df, x=f'x_{embedding_model}', y=f'y_{embedding_model}',
        color=color,
        hover_data=['topic', 'lang', 'format', 'length'],
        # opacity=0 rendered every marker fully transparent; 0.5 matches the
        # value used by save_combined.
        opacity=0.5,
    )

    out_dir = f'output/{topic_type}'
    os.makedirs(out_dir, exist_ok=True)
    fig.write_html(f'{out_dir}/{embedding_model}_{color}_scatter.html')
    fig.write_image(f'{out_dir}/{embedding_model}_{color}_scatter.png')

### ada-002

In [65]:
# ada-002 projection (x_2 / y_2), colored by topic.
save('2','topic')

ValueError: Value of 'size' is not the name of a column in 'data_frame'. Expected one of ['topic', 'lang', 'format', 'length', 'text', 'embedding_2', 'embedding_3_small', 'embedding_3_large', 'x_2', 'y_2', 'x_3s', 'y_3s', 'x_3l', 'y_3l'] but received: 8

In [55]:
# ada-002 projection (x_2 / y_2), colored by format.
save('2','format')

In [56]:
# ada-002 projection (x_2 / y_2), colored by language.
save('2','lang')

### 3-small

In [57]:
# 3-small projection (x_3s / y_3s), colored by topic.
save('3s','topic')

In [58]:
# 3-small projection (x_3s / y_3s), colored by format.
save('3s','format')

In [59]:
# 3-small projection (x_3s / y_3s), colored by language.
save('3s','lang')

### 3-large

In [60]:
# 3-large projection (x_3l / y_3l), colored by topic.
save('3l','topic')

In [61]:
# 3-large projection (x_3l / y_3l), colored by format.
save('3l','format')

In [62]:
# 3-large projection (x_3l / y_3l), colored by language.
save('3l','lang')

In [63]:
import matplotlib.pyplot as plt
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
from matplotlib.figure import Figure

def save_embeddings_grid(df, file_name):
    """Save a grid of UMAP scatter plots: one row per model, one column per aspect.

    Each panel plots the ``x_<model>`` / ``y_<model>`` columns of ``df`` and
    colors the points by the categories of the panel's aspect column.

    Args:
        df: DataFrame holding ``x_<model>`` / ``y_<model>`` columns for the
            models '2', '3s' and '3l', plus the aspect columns 'topic',
            'format', 'lang' and 'length'.
        file_name: Base name (no extension) of the files written under
            ``output/``.

    Side effects:
        Writes ``output/{file_name}.png`` and ``output/{file_name}.html``,
        creating the folder if needed.
    """
    models = ['2', '3s', '3l']
    aspects = ['topic', 'format', 'lang', 'length']

    # Integer category codes per aspect, computed once and reused in every row.
    aspect_codes = {aspect: pd.factorize(df[aspect])[0] for aspect in aspects}

    # Grid shape follows the lists above; squeeze=False keeps axs two-dimensional.
    fig, axs = plt.subplots(len(models), len(aspects), figsize=(20, 15), squeeze=False)
    try:
        for i, model in enumerate(models):
            for j, aspect in enumerate(aspects):
                ax = axs[i, j]
                # Color by the aspect so the four panels of a row differ.
                ax.scatter(
                    df[f'x_{model}'], df[f'y_{model}'],
                    c=aspect_codes[aspect], cmap='tab20', alpha=0.5,
                )
                ax.set_title(f'Model {model.upper()} - {aspect.capitalize()}')
                ax.set_xlabel('x')
                ax.set_ylabel('y')

        fig.tight_layout()

        os.makedirs('output', exist_ok=True)
        png_path = f'output/{file_name}.png'
        fig.savefig(png_path)  # Save the figure as a PNG image

        # matplotlib cannot write '.html', so the HTML file is a minimal page
        # that embeds the PNG as a data URI.
        with open(png_path, 'rb') as png_file:
            encoded_png = base64.b64encode(png_file.read()).decode('ascii')
        with open(f'output/{file_name}.html', 'w', encoding='utf-8') as html_file:
            html_file.write(
                '<!DOCTYPE html>\n'
                '<html><head><meta charset="utf-8">'
                f'<title>{file_name}</title></head>\n'
                f'<body><img alt="{file_name}" '
                f'src="data:image/png;base64,{encoded_png}"></body></html>\n'
            )
    finally:
        # Release the figure even if plotting or saving fails.
        plt.close(fig)

# Load the frame that contains the 2-D UMAP projections. They were written to
# 'all_data.csv'; 'data/<topic>/data.csv' holds only the raw embeddings and has
# no x_*/y_* columns. A separate name is used so the global `df` that save()
# and save_combined() read is not replaced.
projected_df = pd.read_csv('all_data.csv', index_col=0)

# Save the grid of scatter plots to files
save_embeddings_grid(projected_df, 'embeddings_grid')

ModuleNotFoundError: No module named 'matplotlib'