# load_data


In [1]:
import json
import os
from langchain_core.documents import Document
from datetime import datetime
from utils import load_yaml
from extract_graphstate import (
    extract_documents_for_multivectorstore,
    extract_documents_for_singlestore,
    extract_documents_for_multidocstore,
)

# Read the embedding pipeline settings once and unpack the values used below.
config = load_yaml("../config/embedding.yaml")
settings = config["settings"]

category_id = settings["category_id"]  # collection of category names (one sub-directory each)
filetype = settings["filetype"]
edit_path = settings["edit_path"]
output_path = settings["output_path"]

# Make sure the output directory exists before anything is written to it.
os.makedirs(output_path, exist_ok=True)


def load_json_files(path):
    """Load every ``*.json`` file located directly inside *path*.

    Files are read in sorted filename order so the returned list (and
    everything derived from it) is reproducible across runs and platforms;
    ``os.listdir`` on its own returns entries in arbitrary order.

    Args:
        path: Directory to scan. Sub-directories are not descended into.

    Returns:
        A list holding the parsed content of each JSON file, in filename order.
        Empty when the directory contains no ``.json`` file.

    Raises:
        OSError: If *path* cannot be listed or a file cannot be opened.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    data_list = []
    for filename in sorted(os.listdir(path)):
        if not filename.endswith(".json"):
            continue
        with open(os.path.join(path, filename), "r", encoding="utf-8") as file:
            data_list.append(json.load(file))
    return data_list


# Gather the parsed JSON payloads from the "json" folder of every configured category.
data_list = []
for category in category_id:
    data_list += load_json_files(os.path.join(edit_path, category, "json"))


# Flatten the documents extracted from each payload into a single list.
all_documents = []
for data in data_list:
    all_documents += extract_documents_for_singlestore(data)

In [None]:
# Quick sanity check: number of extracted documents and the first document's metadata.
len(all_documents), all_documents[0].metadata

In [None]:
# Every metadata key of every document, in document order (duplicates kept).
metakeys = [key for doc in all_documents for key in doc.metadata]

# Distinct metadata key names across the whole corpus.
metadata_keys = set(metakeys)
metadata_keys

In [None]:
from langchain_teddynote.community.pinecone import preprocess_documents

# Run the documents through `preprocess_documents`, which hands back the pair
# (contents, metadatas). Later cells index `metadatas` by metadata key
# (e.g. metadatas["Header 1"]), so it is presumably a dict of per-document
# value lists rather than a list of dicts — confirm against the library.
contents, metadatas = preprocess_documents(
    split_docs=all_documents,
    metadata_keys=metadata_keys,
)

In [10]:
# Replace missing (None) metadata values with empty strings.
# Iterate the distinct keys: `metakeys` repeats each key once per document that
# carries it, so looping over it redid the same column rewrite over and over.
for key in metadata_keys:
    metadatas[key] = ["" if value is None else value for value in metadatas[key]]

In [17]:
# Compare the number of texts with the number of values stored under the
# "Header 1" metadata key.
len(contents), len(metadatas["Header 1"])

(2012, 2012)

# Token 수 확인


In [3]:
from langchain_upstage import ChatUpstage
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini")

# Report every document whose token count reaches the 4000-token threshold
# and keep a running total in `i`.
i = 0
for docs in all_documents:
    token = llm.get_num_tokens(docs.page_content)
    if token < 4000:
        continue

    print(f"{docs.metadata['doc_id']}의 토큰수는 {token}개 이다")
    i += 1

print(f"토큰 수가 4000이 넘는 문서수는 {i}개 입니다")

토큰 수가 4000이 넘는 문서수는 0개 입니다


# Chroma DB 사용


In [None]:
from dotenv import load_dotenv
from langchain_upstage import UpstageEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.storage import LocalFileStore
from langchain.embeddings import CacheBackedEmbeddings
from langchain_openai.embeddings import OpenAIEmbeddings
from datetime import datetime
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain_core.documents import Document

load_dotenv()

# Date stamp (YYYY.MM.DD) used in both the cache directory and the DB directory names.
time = datetime.now().strftime("%Y.%m.%d")
proj_name = f"{time}_single_store"
DB_PATH = f"../db/{time}_singlestore_chroma_db"

passage_embeddings = UpstageEmbeddings(model="solar-embedding-1-large-passage")
# Alternative embedding model:
# passage_embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Wrap the embedding model with a file-backed cache, namespaced by model name.
store = LocalFileStore(f"../cache/{proj_name}/data")
cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings=passage_embeddings,
    document_embedding_cache=store,
    namespace=passage_embeddings.model,
)

from tqdm import tqdm

BATCH_SIZE = 100

db = Chroma(persist_directory=DB_PATH, embedding_function=cached_embedder)

# Add the documents BATCH_SIZE at a time, passing each batch through
# `filter_complex_metadata` before it is handed to the store.
for start in tqdm(range(0, len(all_documents), BATCH_SIZE)):
    db.add_documents(
        filter_complex_metadata(documents=all_documents[start : start + BATCH_SIZE])
    )

db.persist()

# Pinecone 사용


In [None]:
import os
from langchain_teddynote.community.pinecone import create_index
from dotenv import load_dotenv

# Load environment variables (PINECONE_API_KEY is read from os.environ below).
load_dotenv()

# Obtain the Pinecone index handle used by the upsert cells further down.
# NOTE(review): presumably creates the index when it does not exist yet —
# confirm the exact behavior in langchain_teddynote.
pc_index = create_index(
    api_key=os.environ["PINECONE_API_KEY"],
    index_name="globalmacro-chatbot",
    dimension=4096,  # embedding size (OpenAIEmbeddings: 1536, UpstageEmbeddings: 4096)
    metric="dotproduct",  # one of: dotproduct, euclidean, cosine
)

In [None]:
from langchain_teddynote.community.pinecone import (
    create_sparse_encoder,
    fit_sparse_encoder,
)
from langchain_teddynote.korean import stopwords

# Build a sparse encoder from the stopword list, using the "kiwi" mode
# (presumably the Kiwi Korean tokenizer — confirm in langchain_teddynote).
sparse_encoder = create_sparse_encoder(stopwords(), mode="kiwi")

# Fit the encoder on the document texts. The later upsert cells load the
# encoder back from the same path, so the fitted encoder is presumably
# written to `save_path` by this call.
saved_path = fit_sparse_encoder(
    sparse_encoder=sparse_encoder,
    contents=contents,
    save_path="../data/sparse_encoder_01.pkl",
)

In [14]:
from typing import List, Dict, Any, Optional, Tuple
from pinecone_text.sparse import BM25Encoder
from langchain_core.embeddings import Embeddings
import secrets
import os


def generate_hash() -> str:
    """Return a random 24-character hex string as four 6-character groups joined by '-'."""
    random_hex = secrets.token_hex(12)  # 12 random bytes -> 24 hex characters
    return "-".join(random_hex[i : i + 6] for i in range(0, 24, 6))


def _metadata_rows(metadatas: Any, expected: int) -> List[Dict]:
    """Normalize *metadatas* to one dict per document and verify its length."""
    if isinstance(metadatas, dict):
        # Column layout: {key: [value for each document]}.
        for key, column in metadatas.items():
            if len(column) != expected:
                raise ValueError(
                    f"metadatas[{key!r}] has {len(column)} values "
                    f"but there are {expected} contents"
                )
        keys = list(metadatas)
        return [{key: metadatas[key][row] for key in keys} for row in range(expected)]

    # Row layout: [metadata dict for each document].
    rows = list(metadatas)
    if len(rows) != expected:
        raise ValueError(
            f"metadatas has {len(rows)} entries but there are {expected} contents"
        )
    return rows


def upsert_documents(
    index: Any,
    namespace: str,
    contents: List[str],
    metadatas: "List[Dict] | Dict[str, List[Any]]",
    sparse_encoder: "BM25Encoder",
    embedder: "Embeddings",
    batch_size: int = 32,
):
    """Embed *contents* in batches and upsert them as dense + sparse vectors.

    Every document is stored under a freshly generated random id together
    with its dense embedding (``values``), its sparse encoding
    (``sparse_values``) and its metadata. Progress is printed after each
    batch and the index statistics are printed at the end.

    This function does not load environment variables; the clients passed in
    must already be configured.

    Args:
        index: Target index; must provide ``upsert(vectors=..., namespace=...)``
            and ``describe_index_stats()``.
        namespace: Namespace the vectors are written to.
        contents: Document texts, one per vector.
        metadatas: Either one dict per document, or a single dict mapping each
            metadata key to a list of per-document values (the layout this
            notebook builds, where ``metadatas`` is indexed by key).
        sparse_encoder: Object providing ``encode_documents(texts)``.
        embedder: Object providing ``embed_documents(texts)``.
        batch_size: Number of documents embedded and upserted per request.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, or if the metadata
            does not line up one-to-one with ``contents`` (previously such a
            mismatch silently dropped documents).
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    rows = _metadata_rows(metadatas, len(contents))
    ids = [generate_hash() for _ in contents]

    for start in range(0, len(contents), batch_size):
        stop = start + batch_size
        batch_contents = contents[start:stop]

        dense_embeds = embedder.embed_documents(batch_contents)
        sparse_embeds = sparse_encoder.encode_documents(batch_contents)

        vectors = [
            {
                "id": _id,
                "sparse_values": sparse,
                "values": dense,
                "metadata": metadata,
            }
            for _id, sparse, dense, metadata in zip(
                ids[start:stop], sparse_embeds, dense_embeds, rows[start:stop]
            )
        ]

        index.upsert(vectors=vectors, namespace=namespace)

        print(f"[upsert_documents] 배치 {start // batch_size + 1} 완료")

    print(f"[upsert_documents]\n{index.describe_index_stats()}")

In [None]:
from langchain_upstage import UpstageEmbeddings
import os
from dotenv import load_dotenv
from langchain_teddynote.community.pinecone import load_sparse_encoder

# Load environment variables (UPSTAGE_API_KEY is read from os.environ below).
load_dotenv()

# Reload the sparse encoder that was fitted and saved in an earlier cell.
sparse_encoder = load_sparse_encoder("../data/sparse_encoder_01.pkl")
upstage_embeddings = UpstageEmbeddings(
    model="solar-embedding-1-large-passage", api_key=os.environ["UPSTAGE_API_KEY"]
)


# Sequential upload using the `upsert_documents` function defined in this notebook.
# NOTE(review): "financical" looks like a typo of "financial"; it is left as is
# because the namespace string decides where the vectors are stored.
upsert_documents(
    index=pc_index,
    namespace="financical-data-01",
    contents=contents,
    metadatas=metadatas,
    sparse_encoder=sparse_encoder,
    embedder=upstage_embeddings,
)

In [None]:
from langchain_teddynote.community.pinecone import upsert_documents_parallel
from langchain_upstage import UpstageEmbeddings
import os
from dotenv import load_dotenv
from langchain_teddynote.community.pinecone import load_sparse_encoder

# Load environment variables (UPSTAGE_API_KEY is read from os.environ below).
load_dotenv()

# Reload the sparse encoder that was fitted and saved in an earlier cell.
sparse_encoder = load_sparse_encoder("../data/sparse_encoder_01.pkl")
upstage_embeddings = UpstageEmbeddings(
    model="solar-embedding-1-large-passage", api_key=os.environ["UPSTAGE_API_KEY"]
)
# Same upload through the library's parallel helper, into a different
# namespace ("...-00") than the sequential cell ("...-01").
# NOTE(review): "financical" looks like a typo of "financial"; left unchanged
# because the namespace string decides where the vectors are stored.
upsert_documents_parallel(
    index=pc_index,
    namespace="financical-data-00",
    contents=contents,
    metadatas=metadatas,
    sparse_encoder=sparse_encoder,
    embedder=upstage_embeddings,
    batch_size=32,
    max_workers=30,  # presumably the number of concurrent upload workers — confirm in langchain_teddynote
)