In [1]:
# !pip install -q langchain openai tiktoken chromadb

In [2]:
# !wget -q https://github.com/kairess/toy-datasets/raw/master/techcrunch-articles.zip
# !unzip -q techcrunch-articles.zip -d articles

In [3]:
import os

# Only fall back to an empty placeholder when no key is configured, so a real
# OPENAI_API_KEY already exported in the environment is not overwritten.
os.environ.setdefault("OPENAI_API_KEY", "")


In [4]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import PyPDFLoader

In [5]:
# Alternative loader setups, kept for reference:
# loader = TextLoader('single_text_file.txt')
# loader = DirectoryLoader('./articles', glob="*.txt", loader_cls=TextLoader)
# loader = DirectoryLoader('./', glob="*.pdf", loader_cls=PyPDFLoader)

# Load the files matching the PDF glob under the articles directory,
# parsing each one with PyPDFLoader.
ARTICLES_DIR = './articles'
PDF_GLOB = "*.pdf"

loader = DirectoryLoader(ARTICLES_DIR, glob=PDF_GLOB, loader_cls=PyPDFLoader)
documents = loader.load()

# Show how many documents were loaded.
len(documents)

FileNotFoundError: Directory not found: './articles'

In [None]:
# Split the loaded documents into chunks of up to 1000 characters.
# Neighbouring chunks overlap by 200 characters so that content cut at a
# chunk boundary is not lost.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
texts = text_splitter.split_documents(documents)

# Show how many chunks were produced.
len(texts)

2424

In [None]:
# Peek at two of the chunks (indices 2 and 3) to sanity-check the split.
texts[2:4]

[Document(page_content='70.7%\n12.6%\n6.3%10.4%37.8%\n18.2%\n9.4%34.6%\n귀농 귀촌\n1\n2\n3\n4\n5\n6', metadata={'source': 'articles\\(농업정책관-청년농육성정책팀) 2022년 귀농귀촌실태조사 결과 보도자료(3.3. 조간)_별첨(2022년 귀농귀촌실태조사 결과  인포그래픽).pdf', 'page': 0}),
 Document(page_content='∙\n일반직장\n취업자영업 임시직 농사일 비농업부문\n일용직농업임금노동\n6.4% 83.3%귀농 귀촌 (만원)\n2,3183,099 3,055 3,1483,186 3,2063,4543,6853,877 3,929 3,8084,045\n귀농, 귀촌첫해\n(전체평균)1년차 2년차 3년차\n 5년차\n57.4%\n22.7%\n7.4% 5.0% 3.1% 2.6%\n귀농\n 귀촌\n∙', metadata={'source': 'articles\\(농업정책관-청년농육성정책팀) 2022년 귀농귀촌실태조사 결과 보도자료(3.3. 조간)_별첨(2022년 귀농귀촌실태조사 결과  인포그래픽).pdf', 'page': 1})]

## 2. Create Chroma DB

In [None]:
# !pip install sentence-transformers

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# Load the embedding model; embeddings are requested in normalized form.
embeddings = HuggingFaceEmbeddings(
    model_name="jhgan/ko-sroberta-multitask",
    encode_kwargs={'normalize_embeddings': True},
)

# Vectorize the chunks and store them in a Chroma DB under ./chroma_db.
vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
    persist_directory="./chroma_db",
)

In [None]:
# persist_directory = 'db' # store the index in the 'db' directory
# 
# embedding = OpenAIEmbeddings()
# # use OpenAI's embedding model
# 
# vectordb = Chroma.from_documents(
#     documents=texts, 
#     embedding=embedding,
#     persist_directory=persist_directory)


In [None]:
# Reset step: call persist() on the store, then drop the in-memory handle
# so the store can be re-created from the persisted directory.
vectordb.persist()
vectordb = None

# Directory name used when the store is re-created.
persist_directory = 'chroma_db'

In [None]:
# Re-create the Chroma store from persist_directory, passing the same
# embeddings object as its embedding function.
vectordb = Chroma(embedding_function=embeddings, persist_directory=persist_directory)

## 3. Make a retriever

In [None]:
# Wrap the vector store in a retriever; no arguments are passed, so defaults apply.
retriever = vectordb.as_retriever()
# Optional smoke test: fetch documents for a sample query and print their source paths.
# docs = retriever.get_relevant_documents("What is Generative AI?")

# for doc in docs:
#     print(doc.metadata["source"])

## 4. Make a chain

In [None]:
from langchain.llms import HuggingFacePipeline
import os

# model_id = 'maywell/Synatra-42dot-1.3B'
model_id = 'zomd/AISquare-Instruct-yi-ko-6b-v0.9.30'

# NOTE(review): Hugging Face libraries typically read HF_HOME when they are
# first imported. If they were already imported by an earlier cell (e.g. for
# the embeddings), this assignment may not take effect -- confirm, and move it
# to the top of the notebook if the cache location matters.
os.environ['HF_HOME'] = r'./models/'

llm = HuggingFacePipeline.from_model_id(
    model_id=model_id,
    device=0,               # -1: CPU (default); 0 and up select a CUDA device index for GPU inference
    task="text-generation", # text generation
    # Generation settings belong in pipeline_kwargs (forwarded to the generation
    # pipeline), not model_kwargs (forwarded to model/tokenizer loading).
    # max_new_tokens is used instead of max_length because max_length also
    # counts the prompt tokens: a "stuff" prompt built from retrieved chunks
    # can exceed 200 tokens on its own and leave no room for an answer.
    # temperature only has an effect when sampling is enabled, hence do_sample.
    pipeline_kwargs={"do_sample": True,
                     "temperature": 0.2,
                     "max_new_tokens": 200},
)

# "stuff" chain: the retrieved documents are placed into a single prompt.
# return_source_documents=True keeps the retrieved documents in the response
# alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
def process_llm_response(llm_response):
    """Print the answer of a QA response followed by its sources.

    Args:
        llm_response: Mapping with a 'result' entry (the answer to print) and
            a "source_documents" entry (an iterable of objects whose
            ``metadata`` mapping has a 'source' key).

    Returns:
        None. Everything is written to standard output.
    """
    answer = llm_response['result']
    print(answer)

    # Header is separated from the answer by blank lines.
    print('\n\nSources:')
    for document in llm_response["source_documents"]:
        origin = document.metadata['source']
        print(origin)

## 5. Query

In [None]:
# Korean question, roughly: "Tell me how to get compensated when a rice crop fails."
query = "벼 농사가 망했을때 보상받을 수 있는 방법에 대해 알려줘"
# Run the retrieval QA chain on the question, then print the answer and its sources.
llm_response = qa_chain(query)
process_llm_response(llm_response)

In [None]:
# Display the raw response object returned by the chain for inspection.
llm_response

In [None]:
# query = "Who led the round in Pando?"
# llm_response = qa_chain(query)
# process_llm_response(llm_response)