In [61]:
import json, glob
from dotenv import load_dotenv, find_dotenv

# Locate a .env file and load it; the result is bound to `_` so the cell
# produces no output.
_ = load_dotenv(find_dotenv())

In [79]:
from langchain_openai.embeddings import OpenAIEmbeddings
# One embeddings client per OpenAI model being compared:
# ada-002, text-embedding-3-small and text-embedding-3-large.
embedding_2 = OpenAIEmbeddings(model='text-embedding-ada-002')
embedding_3_small = OpenAIEmbeddings(model='text-embedding-3-small')
embedding_3_large = OpenAIEmbeddings(model='text-embedding-3-large')
# Optional sanity checks (uncomment to run against one of the clients above):
# print(embedding_2.embed_query("Hello world"))
# print(embedding_2.embed_documents(["Hello world", "Goodbye world"]))

In [64]:
# Topic category under study and the name of the text file derived from it.
topic_type = 'foods'
topic_file = topic_type + '.txt'

In [65]:
# Read one topic name per line.
# The encoding is pinned because the topic names are Japanese text and the
# platform default encoding is not guaranteed to be UTF-8 (e.g. on Windows).
# NOTE(review): assumes the config file is saved as UTF-8 — confirm.
with open(f'config/{topic_file}', 'r', encoding='utf-8') as f:
    topic_list = f.read().splitlines()
topic_list

['じゃがいも', '玉ねぎ', 'にんじん', 'なす', 'トマト', 'ピーマン']

In [66]:
# Collect every .txt document of every topic.
# glob.glob() returns matches in arbitrary, OS-dependent order, so each
# topic's matches are sorted to keep the record order stable between runs.
# Sorting happens per topic (not globally) so topics stay in topic_list order.
files = [
    path
    for topic in topic_list
    for path in sorted(glob.glob(f'./data/{topic_type}/{topic}/*.txt'))
]
len(files)

450

In [67]:
# Peek at how the first file name decomposes: drop the directories, drop
# everything from the first '.', then split the remainder on '-'.
files[0].rsplit('/', 1)[-1].partition('.')[0].split('-')

['en', 'prose', 'long_sum']

In [107]:
# Build one record per document.
# The parent directory name is the topic; the file stem is expected to be
# "<lang>-<format>-<length>" (the unpacking below fails otherwise).
data = []
for file in files:
    tree = file.split('/')
    topic = tree[-2]
    lang, format_type, length_type = tree[-1].split('.')[0].split('-')
    # Use a context manager so the handle is closed promptly, and pin the
    # encoding so reading does not depend on the platform default.
    # NOTE(review): assumes the documents are saved as UTF-8 — confirm.
    with open(file, 'r', encoding='utf-8') as fh:
        text = fh.read()
    data.append({
        'topic': topic,
        'lang': lang,
        'format': format_type,
        'length': length_type,
        'text': text
    })

In [108]:
# Gather the raw document texts in record order, then embed the full list
# once with each of the three models.
texts = [record['text'] for record in data]

embeddings_2 = embedding_2.embed_documents(texts)
embeddings_3_small = embedding_3_small.embed_documents(texts)
embeddings_3_large = embedding_3_large.embed_documents(texts)

In [109]:
# Store each model's embedding on the record at the same position.
for idx, record in enumerate(data):
    record.update({
        'embedding_2': embeddings_2[idx],
        'embedding_3_small': embeddings_3_small[idx],
        'embedding_3_large': embeddings_3_large[idx],
    })

In [110]:
# data[0]

In [290]:
import pandas as pd
from sklearn.datasets import make_blobs
import umap
import plotly.express as px

# Keyword arguments forwarded to umap.UMAP.
params = {
    'n_neighbors': 400,
    'min_dist': 0.2,
    'metric': 'cosine',
    # 'metric': 'euclidean',  # alternative distance metric
    # UMAP is stochastic; a fixed seed makes the projections (and therefore
    # the plots and the exported coordinates) reproducible between runs.
    'random_state': 42,
}

In [291]:
# Reduce each model's embeddings to two dimensions for plotting.
# A single estimator object is used; fit_transform is called on it once per
# embedding set, in the order ada-002, 3-small, 3-large.
umap_model = umap.UMAP(**params, n_components=2)

embeddings_2_2d = umap_model.fit_transform(embeddings_2)
embeddings_3s_2d = umap_model.fit_transform(embeddings_3_small)
embeddings_3l_2d = umap_model.fit_transform(embeddings_3_large)

In [292]:
# Check what container type fit_transform returned for the ada-002 projection.
type(embeddings_2_2d)

numpy.ndarray

In [293]:
# (x key, y key, projection) for each embedding model, in column order.
projections = (
    ('x_2', 'y_2', embeddings_2_2d),
    ('x_3s', 'y_3s', embeddings_3s_2d),
    ('x_3l', 'y_3l', embeddings_3l_2d),
)

# Copy the two coordinates of every projection onto the matching record.
for idx, record in enumerate(data):
    for x_key, y_key, coords in projections:
        record[x_key] = coords[idx][0]
        record[y_key] = coords[idx][1]

# Tabulate all records and write them to a CSV in the working directory.
df = pd.DataFrame(data)
df.to_csv('all_data.csv')

In [294]:
def display(embedding_model, color):
    """Show an interactive scatter plot of one model's 2-D projection.

    Reads the module-level DataFrame ``df`` and renders the figure with
    ``fig.show()``; nothing is returned.

    NOTE(review): this name shadows the ``display`` helper that IPython
    provides in notebooks; it is kept unchanged for the existing calls.

    Args:
        embedding_model: Which projection to plot: '2' (columns x_2/y_2),
            '3s' (columns x_3s/y_3s) or '3l' (columns x_3l/y_3l).
        color: Name of the ``df`` column used to color the points.

    Raises:
        ValueError: If ``embedding_model`` is not one of the keys above.
    """
    axis_columns = {
        '2': ('x_2', 'y_2'),
        '3s': ('x_3s', 'y_3s'),
        '3l': ('x_3l', 'y_3l'),
    }
    # Fail fast on an unknown key instead of printing a message and then
    # handing empty column names to plotly.
    try:
        x, y = axis_columns[embedding_model]
    except (KeyError, TypeError):
        raise ValueError(
            f"Unknown embedding_model {embedding_model!r}; "
            f"expected one of {sorted(axis_columns)}"
        ) from None

    fig = px.scatter(
        df, x=x, y=y,
        color=color,
        hover_data=['topic','lang','format','length'],
        opacity=0.5
    )
    fig.show()

### ada-002

#### 食材

In [295]:
# ada-002 projection, colored by topic.
display(embedding_model='2', color='topic')

#### 形式

In [296]:
# ada-002 projection, colored by format.
display(embedding_model='2', color='format')

#### 言語

In [297]:
# ada-002 projection, colored by language.
display(embedding_model='2', color='lang')

### 3-small

In [298]:
# 3-small projection, colored by topic.
display(embedding_model='3s', color='topic')

In [299]:
# 3-small projection, colored by format.
display(embedding_model='3s', color='format')

In [300]:
# 3-small projection, colored by language.
display(embedding_model='3s', color='lang')

### 3-large

In [301]:
# 3-large projection, colored by topic.
display(embedding_model='3l', color='topic')

In [302]:
# 3-large projection, colored by format.
display(embedding_model='3l', color='format')

In [303]:
# 3-large projection, colored by language.
display(embedding_model='3l', color='lang')