In [1]:
import pandas as pd

In [2]:
# Read the label manifest, then pull the file-name and label columns out as
# two parallel Python lists (same row order as the CSV).
df = pd.read_csv('labels.csv')

image_paths, labels = (df[column].tolist() for column in ('filename', 'label'))

In [3]:
# Sanity check: number of image paths read from the manifest.
len(image_paths)


1445

In [4]:
# Sanity check: number of labels; both lists come from the same frame, so
# this should equal the number of image paths.
len(labels)

1445

In [5]:
from collections import Counter

In [6]:
# Per-class counts. The recorded output shows a strong imbalance
# (1263 'non-flooded' vs 182 'flooded').
Counter(labels)

Counter({'non-flooded': 1263, 'flooded': 182})

In [7]:
# Rebuild `df` from the two extracted lists; the path column is now named
# 'image_path' instead of the manifest's 'filename'.
df = pd.DataFrame(dict(image_path=image_paths, label=labels))

In [8]:
# Row count of the rebuilt frame (one row per image path / label pair).
len(df)

1445

In [9]:
# Spot-check five random rows. No random_state is passed, so the rows shown
# differ from run to run.
df.sample(5)


Unnamed: 0,image_path,label
170,datasets\floodnet\FloodNet-Supervised_v1.0\tra...,non-flooded
48,datasets\floodnet\FloodNet-Supervised_v1.0\tra...,non-flooded
290,datasets\floodnet\FloodNet-Supervised_v1.0\tra...,non-flooded
463,datasets\floodnet\FloodNet-Supervised_v1.0\tra...,non-flooded
1406,datasets\floodnet\FloodNet-Supervised_v1.0\tra...,non-flooded


In [27]:
import cohere
import os
from dotenv import load_dotenv, find_dotenv
from PIL import Image
from io import BytesIO
import base64
from tqdm import tqdm

In [28]:
# Load environment variables from the .env file located by find_dotenv().
# The result is bound to `_` so the cell produces no output.
_= load_dotenv(find_dotenv())

In [29]:
# Cohere credentials and the embedding model name are read from the
# environment; either value is None when its variable is not set.
api_key = os.environ.get("COHERE_API_KEY")
model_id = os.environ.get("COHERE_EMBED_MODEL_ID")
# Display the model name so the run records which model was used.
model_id

'embed-english-v3.0'

In [33]:
def image_to_base64_data_url(image_path):
    """Encode an image file as a base64 JPEG data URL.

    The image is opened with Pillow, re-encoded as JPEG in memory, and
    returned as a ``data:image/jpeg;base64,...`` string.

    Args:
        image_path: Path (or file object) accepted by ``PIL.Image.open``.

    Returns:
        str: Data URL containing the JPEG-encoded image.

    Raises:
        OSError: Propagated from Pillow when the file cannot be opened or is
            not a readable image.
    """
    # Modes Pillow's JPEG writer can encode directly. Any other mode (e.g.
    # RGBA or palette-based PNGs) makes ``save(format="JPEG")`` raise
    # "cannot write mode ... as JPEG", so those are converted to RGB first
    # (which drops any alpha channel).
    jpeg_writable_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

    with Image.open(image_path) as img:
        jpeg_ready = img if img.mode in jpeg_writable_modes else img.convert("RGB")
        buffered = BytesIO()
        jpeg_ready.save(buffered, format="JPEG")

    img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
    return f"data:image/jpeg;base64,{img_base64}"

In [34]:
# Cohere API client used for the embedding requests below.
co = cohere.Client(api_key=api_key)

In [35]:
import pandas as pd

# Embedding every image is slow (the recorded run took ~25 minutes of API
# calls), so the output CSV doubles as a checkpoint: rows already on disk are
# kept and only images without a row are embedded. Delete the file to force a
# full recompute.
output_csv = 'image_embeddings.csv'

if os.path.exists(output_csv) and os.path.getsize(output_csv) > 0:
    # Resuming: collect the paths that already have an embedding saved.
    done_paths = set(
        pd.read_csv(output_csv, usecols=['image_path'], dtype=str)['image_path']
    )
else:
    # First run: write only the header row; data rows are appended below.
    pd.DataFrame(columns=['image_path', 'label', 'embedding']).to_csv(output_csv, index=False)
    done_paths = set()

# Pair each path with its label up front instead of indexing `labels` by
# position, and drop the pairs that were embedded in an earlier run.
pending = [
    (image_path, label)
    for image_path, label in zip(image_paths, labels)
    if str(image_path) not in done_paths
]

for image_path, label in tqdm(pending):
    data_url = image_to_base64_data_url(image_path)
    # One image per request.
    ret = co.embed(
        input_type="image",
        images=[data_url],
        model=model_id,
        embedding_types=["float"],
    )
    # Build a one-row DataFrame for this image.
    embedding_df = pd.DataFrame([{
        'image_path': str(image_path),
        'label': label,
        'embedding': ret.embeddings.float[0]
    }])

    # Append (mode='a', no header) right away so completed work survives an
    # interruption later in the loop.
    embedding_df.to_csv(output_csv, mode='a', header=False, index=False)



100%|██████████| 1445/1445 [25:08<00:00,  1.04s/it]
