In [1]:
import pandas as pd
import numpy as np
import ast

In [None]:
def _parse_embedding(serialized):
    """Turn one serialized embedding cell into a numpy array.

    The cell text is evaluated as a Python literal (presumably a list of
    floats -- confirm against the code that wrote the CSV).
    """
    return np.array(ast.literal_eval(serialized))


# Load the image embeddings from the CSV file
df_image_embeddings = pd.read_csv('image_embeddings.csv')

# The embedding vectors were stored as strings; convert each one to a numpy array
df_image_embeddings['embedding'] = df_image_embeddings['embedding'].apply(_parse_embedding)

# Sanity check: number of rows and length of the first embedding vector
first_embedding = df_image_embeddings['embedding'].iloc[0]
print(f"データ数: {len(df_image_embeddings)}")
print(f"埋め込みベクトルの次元: {len(first_embedding)}")

In [None]:
# Inspect the data: show the first rows as a left-aligned styled table
preview_header_rule = {
    'selector': 'th',
    'props': [('text-align', 'left'), ('font-weight', 'bold'), ('white-space', 'nowrap')],
}
# Hides the index/blank header cells so only the data columns are rendered
preview_index_rule = {'selector': '.row_heading, .blank', 'props': [('display', 'none')]}
preview_cell_rule = {'selector': 'td', 'props': [('padding', '5px')]}

preview_styler = df_image_embeddings.head().style
preview_styler = preview_styler.set_properties(**{'text-align': 'left'})
preview_styler = preview_styler.set_table_styles(
    [preview_header_rule, preview_index_rule, preview_cell_rule]
)
display(preview_styler)

In [None]:
# Check how the rows are distributed across the values of the 'label' column
label_counts = df_image_embeddings['label'].value_counts()
print(label_counts)

In [5]:
# Text prompts describing each class. The order matters: position i here must
# describe the same class as class_labels[i], because predictions are mapped
# back to labels by index.
class_captions = [
    'A satellite image of a non-flooded area of land.',
    'A satellite image of a flooded area of land.',
]


In [6]:
# Label names used in the output, index-aligned with class_captions
# (0: non-flooded, 1: flooded).
class_labels = [
    'non-flooded',
    'flooded',
]

In [7]:
import cohere
import os
from dotenv import load_dotenv, find_dotenv

In [8]:
# Locate a .env file and load its variables into the process environment.
# The return value is bound to `_`, presumably just to keep the cell from
# displaying it.
dotenv_path = find_dotenv()
_ = load_dotenv(dotenv_path)

In [None]:
# Read the Cohere API key and the embedding model ID from environment
# variables; either one is None when its variable is not set.
api_key = os.environ.get("COHERE_API_KEY")
model_id = os.environ.get("COHERE_EMBED_MODEL_ID")

# Show which model ID was picked up (the API key is deliberately not echoed)
model_id

In [10]:
# Create the Cohere client with the API key read from the environment above.
co = cohere.Client(api_key=api_key)

In [11]:
# Embed the class captions with the configured Cohere model, asking for
# float-typed embeddings.
ret = co.embed(
    model=model_id,
    texts=class_captions,
    embedding_types=["float"],
    input_type="classification",
)

# Collect the returned float embeddings into a numpy array
# (presumably one row per caption -- the next cell prints the shape).
class_embeddings = np.array(ret.embeddings.float)

In [None]:
# Inspect the shape of the caption embedding array.
class_embeddings.shape

In [13]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.special import softmax

In [14]:
# Stack the per-row image embedding vectors into a single 2-D array
image_embeddings = np.stack(df_image_embeddings['embedding'].to_numpy())

# Cosine similarity of every image embedding against every class caption embedding
similarities = cosine_similarity(image_embeddings, class_embeddings)

# Turn each row of similarities into probabilities with a softmax over the classes
probabilities = softmax(similarities, axis=1)

# Index of the most probable class per image (0: non-flooded, 1: flooded)
predicted_classes = probabilities.argmax(axis=1)

# Attach the predicted label name and its probability to the DataFrame
df_image_embeddings['predicted_label'] = list(map(class_labels.__getitem__, predicted_classes))
df_image_embeddings['confidence'] = probabilities.max(axis=1)

In [None]:
# Inspect the results: show the first rows, now including the prediction
# columns, as a left-aligned styled table
result_header_rule = {
    'selector': 'th',
    'props': [('text-align', 'left'), ('font-weight', 'bold'), ('white-space', 'nowrap')],
}
# Hides the index/blank header cells so only the data columns are rendered
result_index_rule = {'selector': '.row_heading, .blank', 'props': [('display', 'none')]}
result_cell_rule = {'selector': 'td', 'props': [('padding', '5px')]}

result_styler = df_image_embeddings.head().style
result_styler = result_styler.set_properties(**{'text-align': 'left'})
result_styler = result_styler.set_table_styles(
    [result_header_rule, result_index_rule, result_cell_rule]
)
display(result_styler)

In [None]:
# Accuracy: fraction of rows whose predicted label equals the 'label' column
is_correct = df_image_embeddings['label'].eq(df_image_embeddings['predicted_label'])
accuracy = is_correct.mean()
print(f"\n正解率: {accuracy:.3f}")

In [None]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero.

    Guards the metric formulas below against degenerate inputs (for example
    no row predicted as 'flooded'), which would otherwise produce nan and a
    RuntimeWarning.
    """
    return numerator / denominator if denominator else 0.0


# Boolean masks for the actual and predicted classes ('flooded' is the positive class).
# Both classes are matched explicitly rather than by negation, so rows whose label
# is neither value are not counted in any cell.
actual_flooded = df_image_embeddings['label'] == 'flooded'
actual_non_flooded = df_image_embeddings['label'] == 'non-flooded'
predicted_flooded = df_image_embeddings['predicted_label'] == 'flooded'
predicted_non_flooded = df_image_embeddings['predicted_label'] == 'non-flooded'

# Confusion-matrix cells
true_positives = (actual_flooded & predicted_flooded).sum()
true_negatives = (actual_non_flooded & predicted_non_flooded).sum()
false_positives = (actual_non_flooded & predicted_flooded).sum()
false_negatives = (actual_flooded & predicted_non_flooded).sum()

# Precision = TP / (TP + FP); 0.0 when nothing was predicted as flooded
precision = _safe_ratio(true_positives, true_positives + false_positives)

# Recall = TP / (TP + FN); 0.0 when there are no actual flooded rows
recall = _safe_ratio(true_positives, true_positives + false_negatives)

# F1 score = 2 * (precision * recall) / (precision + recall); 0.0 when both are zero
f1_score = _safe_ratio(2 * (precision * recall), precision + recall)

print(f"適合率（Precision）: {precision:.3f}")
print(f"再現率（Recall）: {recall:.3f}")
print(f"F1スコア: {f1_score:.3f}")
print("\n詳細:")
print(f"True Positives (正しく浸水と予測): {true_positives}")
print(f"True Negatives (正しく非浸水と予測): {true_negatives}")
print(f"False Positives (誤って浸水と予測): {false_positives}")
print(f"False Negatives (浸水を見逃した数): {false_negatives}")
print(f"実際の浸水画像の総数: {true_positives + false_negatives}")
print(f"実際の非浸水画像の総数: {true_negatives + false_positives}")
print(f"総数: {len(df_image_embeddings)}")

In [18]:
# Write the image path, true label, predicted label and confidence columns
# to classified.csv, leaving out the DataFrame index.
columns_to_save = ['image_path', 'label', 'predicted_label', 'confidence']
df_classified = df_image_embeddings.loc[:, columns_to_save]
df_classified.to_csv('classified.csv', index=False)