In [22]:
import os
from openai import AzureOpenAI  # Azure OpenAI のクライアントクラス
import numpy as np             # 数値計算ライブラリ（ベクトル演算など）
from dotenv import load_dotenv
import pandas as pd

In [8]:
# Read settings from the .env file into the process environment
load_dotenv()

# Azure OpenAI connection settings; each value is None when its variable is unset
AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
API_VERSION = os.environ.get("API_VERSION")

# Deployment names for the chat-completion and embedding models
DEPLOYMENT_ID_FOR_CHAT_COMPLETION = os.environ.get("DEPLOYMENT_ID_FOR_CHAT_COMPLETION")
DEPLOYMENT_ID_FOR_EMBEDDING = os.environ.get("DEPLOYMENT_ID_FOR_EMBEDDING")

# [1-3] Initialize the AzureOpenAI client from the settings above
client = AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_API_KEY,
    api_version=API_VERSION,
)


In [17]:
def get_filename_without_extension(file_path):
    """Return the last component of ``file_path`` with its final extension removed.

    Args:
        file_path: Path string; only the part after the last separator is used.

    Returns:
        The base name without its last extension (e.g. ``"a/b.tar.gz"`` -> ``"b.tar"``).
    """
    base_name = os.path.basename(file_path)
    stem, _extension = os.path.splitext(base_name)
    return stem

In [9]:
def get_file_paths(folder_path):
    """Return the paths of all files under ``folder_path``, searched recursively.

    Directories and file names are visited in sorted order so the result is
    the same on every run; ``os.walk`` by itself yields entries in an
    arbitrary order.

    Args:
        folder_path: Root folder to search.

    Returns:
        A list of file paths, each built by joining the containing directory
        (as reported by ``os.walk``) with the file name. The list is empty
        when the folder contains no files or does not exist.
    """
    file_paths = []
    for root, dirs, files in os.walk(folder_path):
        # Sorting in place makes os.walk descend into sub-folders in sorted order
        dirs.sort()
        file_paths.extend(os.path.join(root, name) for name in sorted(files))
    return file_paths

In [21]:
# Folder settings: every path keeps its trailing "/" so names can be appended directly
pdf_folder_path = "pdf/"
text_folder_path = "text/"
csv_folder_path = "csv/"

# Sub-folders located under the text folder
pdf_text_folder_path = f"{text_folder_path}pdf_text/"
pdf_summarize_text_folder_path = f"{text_folder_path}pdf_summarize_text/"

In [11]:
# Collect the path of every file under pdf_summarize_text_folder_path
text_paths = get_file_paths(folder_path=pdf_summarize_text_folder_path)

In [23]:
# Build one [file_path, embedding] row per text file
total_info = []
for text_path in text_paths:
    # Read the text to embed
    with open(text_path, "r", encoding="utf-8") as f:
        text = f.read()

    # The embeddings endpoint rejects an empty input string, so skip empty
    # files instead of letting one of them abort the whole run.
    if not text:
        print(f"Skipping empty file: {text_path}")
        continue

    # Call the Azure OpenAI Service embeddings API
    response = client.embeddings.create(
        input=text,                         # text to embed
        model=DEPLOYMENT_ID_FOR_EMBEDDING,  # embedding model to use (deployment name)
    )
    # A single input string was sent, so the vector is taken from the first result item
    embedding_vector = response.data[0].embedding

    total_info.append([text_path, embedding_vector])

embedding_vector_df = pd.DataFrame(total_info, columns=["file_path", "embedding"])


In [25]:
# to_csv does not create missing directories, so make sure the output folder exists
os.makedirs(csv_folder_path, exist_ok=True)
# Each embedding is a Python list, so the CSV stores it as its string representation
embedding_vector_df.to_csv(csv_folder_path + "vector.csv")