# LLM VectorDB 저장
#### w/ Chroma

In [1]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders.csv_loader import CSVLoader

from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

import pandas as pd

# VectorDB
from langchain_community.vectorstores import Chroma

In [2]:
### Build data for testing add / remove / update operations on the vector store
csv_path = '../data/second.csv'
df1 = pd.read_csv(csv_path, encoding='utf8')
# Keep the column labels of the loaded frame
colnames = df1.columns

In [3]:
# view_for_test100 = df1[:100]
# view_for_test100.shape

(100, 5)

In [4]:
# view_for_test100.to_csv('../data/view_for_test100.csv', encoding='utf8', index=False)

## 100개의 데이터만으로 벡터스토어에 저장 >> second.csv

In [3]:
### 01. Load documents from the CSV file ###
# Single source path shared by the LangChain loader and pandas below.
csv_path = '../data/second.csv'

loader = CSVLoader(csv_path, encoding='utf8')
docs = loader.load()
print(f"문서의 수: {len(docs)}")

### 02. Read the same file with pandas to get the DataFrame column names ###
df2 = pd.read_csv(csv_path, encoding='utf8')
colnames = df2.columns

문서의 수: 1025275


In [4]:
# Display the column labels taken from the DataFrame
colnames

Index(['user_id', 'asset_id', 'use_tms/runtime', 'runtime', 'datetime'], dtype='object')

In [5]:
# Display the DataFrame read from the CSV file
df2

Unnamed: 0,user_id,asset_id,use_tms/runtime,runtime,datetime
0,user017439,cjc|M5217400LSGN27610201,0.395,6600.0,2023-11-27 21:52:32
1,user017439,cjc|M5212995LFON18981501,0.538,6180.0,2023-11-05 21:34:42
2,user017439,cjc|M4909466LSGG15304501,0.239,8340.0,2023-09-18 22:45:03
3,user017439,cjc|M0455294LSGL12927301,0.081,6240.0,2023-01-23 00:22:37
4,user017439,cjc|M4245102LSGJ92413901,0.392,8400.0,2023-08-23 22:17:45
...,...,...,...,...,...
1025270,user041480,cjc|M5063448LFOL10618701,0.425,5460.0,2023-01-02 22:06:22
1025271,user041480,cjc|M5085138LFOL57791401,0.060,5280.0,2023-03-13 19:57:59
1025272,user041480,cjc|M5143064LSVL11224501,0.125,5280.0,2023-01-14 09:18:11
1025273,user041480,cjc|M5210797LFON08638801,0.884,7560.0,2023-11-09 20:25:49


In [6]:
### 03. Attach metadata ###
# Columns copied from each row into the document metadata, in this order.
metadata_fields = ('user_id', 'asset_id', 'datetime')

# One Document per DataFrame row: the whole row, rendered as a dict string,
# is the page content; the selected columns become the metadata.
docs = [
    Document(
        page_content=str(record.to_dict()),
        metadata={field: record[field] for field in metadata_fields},
    )
    for _, record in df2.iterrows()
]

print(f"문서의 수: {len(docs)}")
print('[메타데이터 예시]\n', docs[50].metadata)

문서의 수: 1025275
[메타데이터 예시]
 {'user_id': 'user017439', 'asset_id': 'cjc|M4458787LSGM45146801', 'datetime': '2023-07-09 15:51:09'}


In [7]:
### 04. Split the documents into chunks ###
# chunk_size=150 with no overlap between consecutive chunks
text_splitter = RecursiveCharacterTextSplitter(
  chunk_size=150, chunk_overlap=0
)
splits = text_splitter.split_documents(docs)
# Report how many chunks the splitter produced
print("split된 문서의 수:", len(splits))

split된 문서의 수: 1025275


In [8]:
### 05. Create the embedding model
# Model page: https://huggingface.co/ibm-granite/granite-embedding-278m-multilingual
embeddings = HuggingFaceEmbeddings(model_name='ibm-granite/granite-embedding-278m-multilingual')

  embeddings = HuggingFaceEmbeddings(model_name='ibm-granite/granite-embedding-278m-multilingual')


In [9]:
### 06. Create the vector store ###
## Chroma, built from the chunked documents and the embedding model
vectorstore = Chroma.from_documents(
  documents=splits,
  embedding=embeddings,
  # Local directory handed to Chroma as its persistence location
  persist_directory="../data/views_vectorstore_chroma_second",
)

# Confirm that the build step finished
print("Chroma 벡터스토어 생성 완료!")

Chroma 벡터스토어 생성 완료!


In [13]:
# 벡터스토어 로드
# new_vector_store = Chroma(persist_directory="../data/view_for_test100_chroma",
#                           embedding_function=embeddings)

  new_vector_store = Chroma(persist_directory="../data/view_for_test100_chroma",
