In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
from langchain_community.document_loaders import PyPDFLoader

pdf_docs = PyPDFLoader("../data/Sustainability_report_2024_kr.pdf").load()

In [3]:
len(pdf_docs)

83

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

rec_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100
)
chunk_docs = rec_splitter.split_documents(pdf_docs)
len(chunk_docs)

207

In [5]:
chunk_docs[0]

Document(metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 15.1 (Macintosh)', 'creationdate': '2024-11-25T11:10:32+09:00', 'moddate': '2024-11-25T11:10:46+09:00', 'trapped': '/False', 'source': '../data/Sustainability_report_2024_kr.pdf', 'total_pages': 83, 'page': 0, 'page_label': '1'}, page_content='A Journey Towards  \na Sustainable Future\n삼성전자 지속가능경영보고서 2024')

In [6]:
for item in chunk_docs:
    item.metadata = {**(item.metadata), "class" : "wanted"}

In [7]:
chunk_docs[0]

Document(metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 15.1 (Macintosh)', 'creationdate': '2024-11-25T11:10:32+09:00', 'moddate': '2024-11-25T11:10:46+09:00', 'trapped': '/False', 'source': '../data/Sustainability_report_2024_kr.pdf', 'total_pages': 83, 'page': 0, 'page_label': '1', 'class': 'wanted'}, page_content='A Journey Towards  \na Sustainable Future\n삼성전자 지속가능경영보고서 2024')

# Fiass 벡터 DB 생성

In [8]:
from langchain_openai.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [10]:
dim_size = len(embeddings.embed_query("안녕하세요"))
print(dim_size)

3072


In [None]:
# uv add faiss-cpu

In [9]:
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

In [21]:
db = FAISS.from_documents(
    documents=[chunk_docs[0], chunk_docs[1]],
    embedding=embeddings,
    # ids = ['문서1', '문서2']
)
db.index_to_docstore_id

{0: 'a56e3877-4efc-4a9c-985e-cdd953d789b8',
 1: '2cc269b3-1c85-4ccd-b148-588aceaa5df5'}

In [22]:
db.docstore.__dict__['_dict']

{'a56e3877-4efc-4a9c-985e-cdd953d789b8': Document(id='a56e3877-4efc-4a9c-985e-cdd953d789b8', metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 15.1 (Macintosh)', 'creationdate': '2024-11-25T11:10:32+09:00', 'moddate': '2024-11-25T11:10:46+09:00', 'trapped': '/False', 'source': '../data/Sustainability_report_2024_kr.pdf', 'total_pages': 83, 'page': 0, 'page_label': '1', 'class': 'wanted'}, page_content='A Journey Towards  \na Sustainable Future\n삼성전자 지속가능경영보고서 2024'),
 '2cc269b3-1c85-4ccd-b148-588aceaa5df5': Document(id='2cc269b3-1c85-4ccd-b148-588aceaa5df5', metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 15.1 (Macintosh)', 'creationdate': '2024-11-25T11:10:32+09:00', 'moddate': '2024-11-25T11:10:46+09:00', 'trapped': '/False', 'source': '../data/Sustainability_report_2024_kr.pdf', 'total_pages': 83, 'page': 1, 'page_label': '2', 'class': 'wanted'}, page_content='A Journey Towards  \na Sustainable Future\n삼성전자 지속가능경영보고서 2024\nCEO 메시지\

In [23]:
db.similarity_search("삼성", k=5)

[Document(id='a56e3877-4efc-4a9c-985e-cdd953d789b8', metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 15.1 (Macintosh)', 'creationdate': '2024-11-25T11:10:32+09:00', 'moddate': '2024-11-25T11:10:46+09:00', 'trapped': '/False', 'source': '../data/Sustainability_report_2024_kr.pdf', 'total_pages': 83, 'page': 0, 'page_label': '1', 'class': 'wanted'}, page_content='A Journey Towards  \na Sustainable Future\n삼성전자 지속가능경영보고서 2024'),
 Document(id='2cc269b3-1c85-4ccd-b148-588aceaa5df5', metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 15.1 (Macintosh)', 'creationdate': '2024-11-25T11:10:32+09:00', 'moddate': '2024-11-25T11:10:46+09:00', 'trapped': '/False', 'source': '../data/Sustainability_report_2024_kr.pdf', 'total_pages': 83, 'page': 1, 'page_label': '2', 'class': 'wanted'}, page_content='A Journey Towards  \na Sustainable Future\n삼성전자 지속가능경영보고서 2024\nCEO 메시지\n회사 소개\n이해관계자 소통\nOur Company\n04\n05\n06\n준법과 윤리경영\nPrinciple\n53\n중대성 평가\nMate

In [26]:
vectorstore_db_path = "../vectorstore/samsung_faiss.db"
index_name = "samsung2025"
db.save_local(
    folder_path = vectorstore_db_path,
    index_name = index_name
)

In [28]:
# 저장된 db 불러오기
load_db = FAISS.load_local(
    folder_path = vectorstore_db_path,
    index_name = index_name,
    embeddings = embeddings,
    allow_dangerous_deserialization = True
)

In [29]:
load_db.docstore.__dict__["_dict"]

{'a56e3877-4efc-4a9c-985e-cdd953d789b8': Document(id='a56e3877-4efc-4a9c-985e-cdd953d789b8', metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 15.1 (Macintosh)', 'creationdate': '2024-11-25T11:10:32+09:00', 'moddate': '2024-11-25T11:10:46+09:00', 'trapped': '/False', 'source': '../data/Sustainability_report_2024_kr.pdf', 'total_pages': 83, 'page': 0, 'page_label': '1', 'class': 'wanted'}, page_content='A Journey Towards  \na Sustainable Future\n삼성전자 지속가능경영보고서 2024'),
 '2cc269b3-1c85-4ccd-b148-588aceaa5df5': Document(id='2cc269b3-1c85-4ccd-b148-588aceaa5df5', metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 15.1 (Macintosh)', 'creationdate': '2024-11-25T11:10:32+09:00', 'moddate': '2024-11-25T11:10:46+09:00', 'trapped': '/False', 'source': '../data/Sustainability_report_2024_kr.pdf', 'total_pages': 83, 'page': 1, 'page_label': '2', 'class': 'wanted'}, page_content='A Journey Towards  \na Sustainable Future\n삼성전자 지속가능경영보고서 2024\nCEO 메시지\

# 벡터 스토어 조작하기

## 문서 추가하기

In [None]:
chunk_docs[1:10]

In [31]:
load_db.add_documents(
    chunk_docs[1:10]
)

['7d23524a-05cf-4712-aa00-5c5702d1b949',
 'aff32032-b54d-4a85-a749-3b56f0071a01',
 'f378481b-2aeb-4b58-8f4b-de5c1c7ca8f2',
 '7a08ef8f-4d85-4c67-a85a-3cbe9028a55d',
 '9cd3e25d-ff7f-4c4d-a704-0fc6eba46057',
 '45ccdaa1-b5d1-44be-802a-a4d4a8f79132',
 '505af196-e3e6-46ec-aa10-9b92f2ce6054',
 'd361819a-5c46-4b91-ad5d-6cd6bcfcfb48',
 'eb2a6fff-5e1d-4665-8600-164cc5542e37']

In [32]:
load_db.index_to_docstore_id

{0: 'a56e3877-4efc-4a9c-985e-cdd953d789b8',
 1: '2cc269b3-1c85-4ccd-b148-588aceaa5df5',
 2: '7d23524a-05cf-4712-aa00-5c5702d1b949',
 3: 'aff32032-b54d-4a85-a749-3b56f0071a01',
 4: 'f378481b-2aeb-4b58-8f4b-de5c1c7ca8f2',
 5: '7a08ef8f-4d85-4c67-a85a-3cbe9028a55d',
 6: '9cd3e25d-ff7f-4c4d-a704-0fc6eba46057',
 7: '45ccdaa1-b5d1-44be-802a-a4d4a8f79132',
 8: '505af196-e3e6-46ec-aa10-9b92f2ce6054',
 9: 'd361819a-5c46-4b91-ad5d-6cd6bcfcfb48',
 10: 'eb2a6fff-5e1d-4665-8600-164cc5542e37'}

In [37]:
vectorstore_db_path = "../vectorstore/samsung_faiss.db"
index_name = "samsung2025"
load_db.save_local(
    folder_path = vectorstore_db_path,
    index_name = index_name
)

In [38]:
# 저장된 db 불러오기
updated_db = FAISS.load_local(
    folder_path = vectorstore_db_path,
    index_name = index_name,
    embeddings = embeddings,
    allow_dangerous_deserialization = True
)
updated_db.index_to_docstore_id

{0: 'a56e3877-4efc-4a9c-985e-cdd953d789b8',
 1: '2cc269b3-1c85-4ccd-b148-588aceaa5df5',
 2: '7d23524a-05cf-4712-aa00-5c5702d1b949',
 3: 'aff32032-b54d-4a85-a749-3b56f0071a01',
 4: 'f378481b-2aeb-4b58-8f4b-de5c1c7ca8f2',
 5: '7a08ef8f-4d85-4c67-a85a-3cbe9028a55d',
 6: '9cd3e25d-ff7f-4c4d-a704-0fc6eba46057',
 7: '45ccdaa1-b5d1-44be-802a-a4d4a8f79132',
 8: '505af196-e3e6-46ec-aa10-9b92f2ce6054',
 9: 'd361819a-5c46-4b91-ad5d-6cd6bcfcfb48',
 10: 'eb2a6fff-5e1d-4665-8600-164cc5542e37'}

## 직접 추가하기

In [39]:
from langchain_core.documents import Document

# 직접 추가하기
updated_db.add_documents(
    [
        Document(
            page_content= "새로운 문서는 이렇게 추가하기",
            metadata = {"source" : "수동"}
        ),
         Document(
            page_content= "2024년 삼성 전자 주시 사지마세요",
            metadata = {"source" : "윤택한"}
        ),
    ]
)

['3a93e0f8-1261-4f54-9743-401d090aa9ed',
 '95d6baff-d496-4f1b-b8ea-301352f6876c']

In [40]:
updated_db.index_to_docstore_id

{0: 'a56e3877-4efc-4a9c-985e-cdd953d789b8',
 1: '2cc269b3-1c85-4ccd-b148-588aceaa5df5',
 2: '7d23524a-05cf-4712-aa00-5c5702d1b949',
 3: 'aff32032-b54d-4a85-a749-3b56f0071a01',
 4: 'f378481b-2aeb-4b58-8f4b-de5c1c7ca8f2',
 5: '7a08ef8f-4d85-4c67-a85a-3cbe9028a55d',
 6: '9cd3e25d-ff7f-4c4d-a704-0fc6eba46057',
 7: '45ccdaa1-b5d1-44be-802a-a4d4a8f79132',
 8: '505af196-e3e6-46ec-aa10-9b92f2ce6054',
 9: 'd361819a-5c46-4b91-ad5d-6cd6bcfcfb48',
 10: 'eb2a6fff-5e1d-4665-8600-164cc5542e37',
 11: '3a93e0f8-1261-4f54-9743-401d090aa9ed',
 12: '95d6baff-d496-4f1b-b8ea-301352f6876c'}

## 삭제하기

In [42]:
updated_db.delete(['a56e3877-4efc-4a9c-985e-cdd953d789b8'])
updated_db.index_to_docstore_id

{0: '2cc269b3-1c85-4ccd-b148-588aceaa5df5',
 1: '7d23524a-05cf-4712-aa00-5c5702d1b949',
 2: 'aff32032-b54d-4a85-a749-3b56f0071a01',
 3: 'f378481b-2aeb-4b58-8f4b-de5c1c7ca8f2',
 4: '7a08ef8f-4d85-4c67-a85a-3cbe9028a55d',
 5: '9cd3e25d-ff7f-4c4d-a704-0fc6eba46057',
 6: '45ccdaa1-b5d1-44be-802a-a4d4a8f79132',
 7: '505af196-e3e6-46ec-aa10-9b92f2ce6054',
 8: 'd361819a-5c46-4b91-ad5d-6cd6bcfcfb48',
 9: 'eb2a6fff-5e1d-4665-8600-164cc5542e37',
 10: '3a93e0f8-1261-4f54-9743-401d090aa9ed',
 11: '95d6baff-d496-4f1b-b8ea-301352f6876c'}

In [45]:
updated_db.similarity_search("삼성 전자 주식", k=5)

[Document(id='95d6baff-d496-4f1b-b8ea-301352f6876c', metadata={'source': '윤택한'}, page_content='2024년 삼성 전자 주시 사지마세요'),
 Document(id='45ccdaa1-b5d1-44be-802a-a4d4a8f79132', metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 15.1 (Macintosh)', 'creationdate': '2024-11-25T11:10:32+09:00', 'moddate': '2024-11-25T11:10:46+09:00', 'trapped': '/False', 'source': '../data/Sustainability_report_2024_kr.pdf', 'total_pages': 83, 'page': 4, 'page_label': '5', 'class': 'wanted'}, page_content='삼성전자 지속가능경영보고서 2024\n05\nOur Company AppendixMateriality Assessment Facts & Figures PrinciplePlanet People\n회사소개\nAbout Us\n삼성전자주식회사(이하 삼성전자)는 인재와 기술을 바탕으로 최고의 제품과 서비스를 창출하여 인류사회에 공헌하는 글로벌 초일류기업을 지향합니다. \n이를 위해 삼성전자의 경영철학을 반영한 5가지 핵심가치 를 수립하였고, 핵심가치를 세부원칙과 행동지침 으로 구체화하여 삼성전자 임직원이 \n지켜야 할 글로벌 행동규범(Global Code of Conduct) 을 제정하였습니다. 삼성전자는 조직문화에 5가지 핵심가치를 내재화하고 글로벌 행동규범을 \n모든 경영활동의 기준으로 삼아 지속적으로 성장해갈 것입니다. \n사업부문 및 글로벌 네트워크 소개\n삼성전자는 제품 특성에 따라 DX(Device eXperience)와 DS(Device Solutions) 2

# 벡터 스토어 합치기
- 물리적으로 합치기
- 검색기만 하이브리드로 사용

## 물리적으로 합치기

In [52]:
db1 = FAISS.from_documents(
    chunk_docs[0:10],
    embedding=embeddings
)

db2 = FAISS.from_documents(
    chunk_docs[10:20],
    embedding=embeddings
)

In [47]:
db1.merge_from(
    target=db2
)
db1.index_to_docstore_id

{0: '3368b358-6840-4d63-b421-dcb982c5255d',
 1: 'a571fd00-d35e-4eb0-badd-5548c33b42b6',
 2: '47ff41dc-d649-4815-97d0-08118cd26201',
 3: '66308087-1f64-471d-be48-90a856ae6121',
 4: '3f34c124-e221-4645-9d12-b3bd6d198d95',
 5: '2684078d-1a10-4d2b-b8d8-3f79c1d07e2c',
 6: 'f7000a47-85d0-4303-8789-df5022a76470',
 7: '58a32de8-cc52-483a-b25d-cdc5e7a0d48c',
 8: 'bac261c8-072e-4ae5-b3d5-1ab67473cb75',
 9: '43a67bbf-431b-4cea-89b1-fd931c8f5519',
 10: '398e7629-f32e-4a5d-be2f-8ba2acfef6d9',
 11: '4571cab0-5c8f-406e-b6dc-8a83f62bbb80',
 12: 'f9147c5c-f981-4af5-9cfe-0fb632a2f1b0',
 13: 'f3e5e451-5f80-4c2b-b2c9-900ccfa144ef',
 14: '3e11ff9b-6bad-45b0-be87-4f4c3255d552',
 15: 'd0527e66-7fd3-44a5-b4d2-d92e4467c2f0',
 16: '7a28c44e-1b3e-4220-866b-72e319555b70',
 17: 'a0476a09-a82f-4f3c-8cfc-05aa83c5c78b',
 18: '9ebf48ec-b270-4dba-af50-228aa558c94a',
 19: 'de85ff58-94f9-4a0c-9d40-16569c85732c'}

In [48]:
db2.index_to_docstore_id

{0: '398e7629-f32e-4a5d-be2f-8ba2acfef6d9',
 1: '4571cab0-5c8f-406e-b6dc-8a83f62bbb80',
 2: 'f9147c5c-f981-4af5-9cfe-0fb632a2f1b0',
 3: 'f3e5e451-5f80-4c2b-b2c9-900ccfa144ef',
 4: '3e11ff9b-6bad-45b0-be87-4f4c3255d552',
 5: 'd0527e66-7fd3-44a5-b4d2-d92e4467c2f0',
 6: '7a28c44e-1b3e-4220-866b-72e319555b70',
 7: 'a0476a09-a82f-4f3c-8cfc-05aa83c5c78b',
 8: '9ebf48ec-b270-4dba-af50-228aa558c94a',
 9: 'de85ff58-94f9-4a0c-9d40-16569c85732c'}

## 빈 DB에 합치기

In [53]:
db1 = FAISS.from_documents(
    chunk_docs[0:10],
    embedding=embeddings
)

db2 = FAISS.from_documents(
    chunk_docs[10:20],
    embedding=embeddings
)

In [57]:
db1.index_to_docstore_id

{0: '9092f90c-fba6-4c71-b08e-14312a840514',
 1: '7d5706c4-9b0b-4a00-b47b-4ddf173e299e',
 2: '391d5017-81cf-46f3-b754-6c08ef1174a2',
 3: 'bf01598b-5691-4d98-a7d2-57d3e480e47b',
 4: 'b09adaf9-0b5e-4e40-a637-e8d366f83c62',
 5: 'e7f43c52-e005-43e2-96e4-f6ea9cd84dc6',
 6: 'eda393de-0de1-42fc-ac24-8259bbc2d933',
 7: '36f3b3ae-aca3-455b-9d61-b768e08843c4',
 8: '62ffb86f-1e40-4330-b01c-90f2b243cb7b',
 9: 'f14759a4-d9eb-4536-a072-8002e654d085'}

In [58]:
db2.index_to_docstore_id

{0: '2079151e-a259-4f93-bcfe-bcfd8c9aaa98',
 1: '0fdf7f2f-fee4-49af-ace8-72576c5651f8',
 2: 'cbf8585f-37db-48ad-9bad-f4073d1c6987',
 3: '7bedd85d-4bfe-4510-8082-17c347292092',
 4: '5dfad63c-25f0-415e-a93b-e11644079c9a',
 5: '4b2aacc5-32ac-4bef-8abc-6dc6bfff2b63',
 6: '8294551f-f038-48ad-846c-7a2c1faeba4b',
 7: '52679d31-fe7c-4290-8bfc-121f2a97eb02',
 8: 'e9764394-d0f1-44eb-a72d-e1f15987657e',
 9: 'bf5fe1cf-e899-4bed-ac37-a02b84702019'}

In [64]:
db3 = FAISS(
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
    embedding_function = embeddings,
    index = faiss.IndexFlatL2(dim_size) # 차원을 맞춰주기 위해
    )
db3.index_to_docstore_id

{}

In [65]:
db3.merge_from(
    target=db1
)
db3.merge_from(
    target=db2
)

db3.index_to_docstore_id

{0: '9092f90c-fba6-4c71-b08e-14312a840514',
 1: '7d5706c4-9b0b-4a00-b47b-4ddf173e299e',
 2: '391d5017-81cf-46f3-b754-6c08ef1174a2',
 3: 'bf01598b-5691-4d98-a7d2-57d3e480e47b',
 4: 'b09adaf9-0b5e-4e40-a637-e8d366f83c62',
 5: 'e7f43c52-e005-43e2-96e4-f6ea9cd84dc6',
 6: 'eda393de-0de1-42fc-ac24-8259bbc2d933',
 7: '36f3b3ae-aca3-455b-9d61-b768e08843c4',
 8: '62ffb86f-1e40-4330-b01c-90f2b243cb7b',
 9: 'f14759a4-d9eb-4536-a072-8002e654d085',
 10: '2079151e-a259-4f93-bcfe-bcfd8c9aaa98',
 11: '0fdf7f2f-fee4-49af-ace8-72576c5651f8',
 12: 'cbf8585f-37db-48ad-9bad-f4073d1c6987',
 13: '7bedd85d-4bfe-4510-8082-17c347292092',
 14: '5dfad63c-25f0-415e-a93b-e11644079c9a',
 15: '4b2aacc5-32ac-4bef-8abc-6dc6bfff2b63',
 16: '8294551f-f038-48ad-846c-7a2c1faeba4b',
 17: '52679d31-fe7c-4290-8bfc-121f2a97eb02',
 18: 'e9764394-d0f1-44eb-a72d-e1f15987657e',
 19: 'bf5fe1cf-e899-4bed-ac37-a02b84702019'}