# LLM VectorDB 저장
#### w/ FAISS

In [1]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders.csv_loader import CSVLoader

from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

import pandas as pd

# VectorDB - FAISS
from langchain_community.vectorstores import FAISS

In [10]:
### Build a small dataset for testing add / remove / update on the vector store
# Read the full source CSV and keep its column labels at hand.
csv_path = '../data/view_for_vectorstore.csv'
df1 = pd.read_csv(filepath_or_buffer=csv_path, encoding='utf8')
colnames = df1.columns

In [11]:
# Keep only the first 100 rows (positional slice) as the test subset.
view_for_test100 = df1.iloc[:100]
view_for_test100.shape

(100, 5)

In [12]:
# Write the 100-row subset to its own CSV, without the DataFrame index column.
view_for_test100.to_csv(
  path_or_buf='../data/view_for_test100.csv',
  index=False,
  encoding='utf8',
)

## 100개의 데이터만으로 벡터스토어에 저장

In [13]:
### 01. Load documents from the CSV file ###
# The same file is read twice below, so its path is bound once and reused.
csv_path = '../data/view_for_test100.csv'
loader = CSVLoader(file_path=csv_path, encoding='utf8')
docs = loader.load()
print(f"문서의 수: {len(docs)}")

### 02. Read the same CSV with pandas to get the DataFrame column names
df2 = pd.read_csv(filepath_or_buffer=csv_path, encoding='utf8')
colnames = df2.columns

문서의 수: 100


In [14]:
colnames

Index(['user_id', 'asset_id', 'use_tms/runtime', 'runtime', 'datetime'], dtype='object')

In [16]:
df2

Unnamed: 0,user_id,asset_id,use_tms/runtime,runtime,datetime
0,user000001,cjc|M5147122LFON18980301,0.246,5280.0,2023-11-18 08:17:57
1,user000002,cjc|M4550583LSG861820101,0.504,6300.0,2023-11-18 15:42:42
2,user000003,cjc|M4721638LFOL80567201,0.163,6420.0,2023-11-18 20:19:46
3,user000004,cjc|M4458473LFON21067001,0.116,5700.0,2023-11-18 15:48:32
4,user000005,cjc|M4864275LFOL85031701,0.471,4560.0,2023-11-18 17:30:09
...,...,...,...,...,...
95,user000090,cjc|M5210797LFON08638801,0.138,7560.0,2023-11-18 12:03:26
96,user000091,cjc|M4126866LSG815329501,0.912,6300.0,2023-11-18 11:53:05
97,user000092,cjc|M5092074LSVK57904801,0.081,6000.0,2023-11-18 08:10:52
98,user000093,cjc|M4653683LFOM20024801,0.177,5640.0,2023-11-18 22:16:58


In [17]:
### 03. Attach metadata ###
# Turn every DataFrame row into one Document: the stringified row dict is the
# page content, and user_id / asset_id are copied into the metadata.
docs = [
  Document(
    page_content=str(record.to_dict()),
    metadata={
      'user_id': record['user_id'],
      'asset_id': record['asset_id'],
    },
  )
  for _, record in df2.iterrows()
]

# Report how many documents were built and show one metadata sample.
print(f"문서의 수: {len(docs)}")
print('[메타데이터 예시]\n', docs[50].metadata)

문서의 수: 100
[메타데이터 예시]
 {'user_id': 'user000050', 'asset_id': 'cjc|M4948857LFOJ44377301'}


In [18]:
### 04. Split the documents into chunks ###
# Splitter configured with chunk_size=150 and no overlap between chunks.
text_splitter = RecursiveCharacterTextSplitter(
  chunk_size=150,
  chunk_overlap=0,
)
splits = text_splitter.split_documents(documents=docs)
print("split된 문서의 수:", len(splits))

split된 문서의 수: 100


In [19]:
### 05. Create the embedding model
# Model card: https://huggingface.co/ibm-granite/granite-embedding-278m-multilingual
# NOTE(review): the recorded cell output echoes this constructor line, which
# looks like the tail of a deprecation warning - confirm whether this import
# path should be migrated.
embeddings = HuggingFaceEmbeddings(
  model_name='ibm-granite/granite-embedding-278m-multilingual',
)

  embeddings = HuggingFaceEmbeddings(model_name='ibm-granite/granite-embedding-278m-multilingual')


In [20]:
### 06. Build the vector store ###
## FAISS
# Embed every chunk in `splits` with `embeddings` and index the result in FAISS.
vectorstore = FAISS.from_documents(
  documents=splits,
  embedding=embeddings,
)

# Save the vector store locally under the name "views_FAISS_test100"
vectorstore.save_local("views_FAISS_test100")
# Status message; the library name was previously misspelled as "FIASS".
print("FAISS 벡터스토어 생성 완료!")

FIASS 벡터스토어 생성 완료!


In [53]:
# Load the vector store
# NOTE(review): the commented-out call below points at "movies_FAISS_1630_0123",
# not the "views_FAISS_test100" index saved earlier in this notebook - confirm
# which index is intended before re-enabling it.
# new_vector_store = FAISS.load_local("movies_FAISS_1630_0123",
                                    # embeddings=embeddings,
                                    # allow_dangerous_deserialization=True)