In [None]:
# # Install the required libraries (uncomment and run once per environment)
# !pip install langchain
# !pip install huggingface_hub transformers datasets
# !pip install python-dotenv
# !pip install pypdf
# !pip install sentence-transformers
# !pip install chromadb
# pdf2docx, docx2pdf, docx2txt

In [None]:
from langchain_community.llms import HuggingFaceEndpoint
from huggingface_hub import login
from dotenv import load_dotenv
import os

# Load API keys from a local .env file into the process environment.
load_dotenv()

# Report only whether the Hugging Face token is present. A bare
# os.getenv(...) as the cell's last expression would echo the secret
# itself into the saved notebook output.
os.getenv('HUGGINGFACEHUB_API_TOKEN') is not None

# login()  # alternative: authenticate through huggingface_hub instead

In [None]:
# Check that the token reached the environment without displaying its value.
# A membership test also avoids the KeyError that os.environ[...] raises
# when the variable is unset.
hf_token_is_set = 'HUGGINGFACEHUB_API_TOKEN' in os.environ
hf_token_is_set

# pdf 정보 추출

In [70]:
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from langchain_community.llms import HuggingFaceEndpoint
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
from langchain_community.embeddings import (
    HuggingFaceEmbeddings,
    HuggingFaceBgeEmbeddings,
)


In [None]:
# Splitter shared by every PDF ingested in this notebook
# (chunk_size=1000 with chunk_overlap=200 between neighbouring chunks).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Load the source PDF and cut it into chunks with the splitter above.
loader = PyPDFLoader("real_data_ex.pdf")
pages = loader.load_and_split(text_splitter=text_splitter)

In [None]:
# Folder on disk where the Chroma vectors are persisted.
directory = 'index_store'

# Embed every chunk with the default HuggingFaceEmbeddings model and
# store the resulting vectors in a Chroma collection under `directory`.
vector_index = Chroma.from_documents(
    documents=pages,
    embedding=HuggingFaceEmbeddings(),
    persist_directory=directory,
)

# Write the index to the file system.
vector_index.persist()

In [79]:
# Expose the vector store as a retriever that returns the 5 best matches
# for each query using plain similarity search.
# NOTE(review): the earlier comment called this cosine similarity; the metric
# actually depends on how the Chroma collection is configured - confirm.
retriever = vector_index.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=5),
)

In [None]:
# Smoke-test retrieval on its own before wiring it into a chain
# (query: "What is the company's name?"). Uses .invoke, the same call style
# as the chains below, instead of the deprecated get_relevant_documents.
retriever.invoke("회사이름이 뭐지?")

In [None]:
# Imported in this cell because none of the preceding cells bring the handler
# into scope; without it the HuggingFaceEndpoint(...) call below raises
# NameError on a fresh kernel.
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# Hugging Face Hub model used for generation; the commented-out
# alternatives can be swapped in by changing which line is active.
repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
# repo_id = "chihoonlee10/T3Q-ko-solar-dpo-v1.0"
# repo_id = "google/gemma-7b"
# repo_id = 'google/flan-t5-xxl'

llm = HuggingFaceEndpoint(
    repo_id=repo_id,
    # max_new_tokens=256,
    temperature=0.1,
    # Stream generated tokens to stdout through the callback handler.
    callbacks=[StreamingStdOutCallbackHandler()],
    streaming=True,
)

# Retrieval-augmented QA chain: the retriever supplies context chunks and the
# LLM answers from them. return_source_documents=True asks the chain to include
# the retrieved documents in its result alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True
)

In [None]:
# Ask the QA chain about the first PDF
# (query: "What does the second-round screening consist of?").
qa_question = "2차전형은 뭐가 있지"
qa_chain.invoke(qa_question)

## 다른 pdf 파일을 로딩하고 임베딩으로 변환하여 Vector DB에 추가하기


In [47]:
# Load a second PDF and chunk it with the same splitter as the first one.
loader = PyPDFLoader("example.pdf")
pages_new = loader.load_and_split(text_splitter=text_splitter)

# Add the new chunks to the existing index; the return value is bound to `_`
# so the cell does not display it.
_ = vector_index.add_documents(documents=pages_new)

# Persist the updated file-based Chroma store to disk.
vector_index.persist()

In [None]:
# Repeat the earlier question now that the second PDF has been added to the
# index (query: "What does the second-round screening consist of?").
qa_question = "2차전형은 뭐가 있지"
qa_chain.invoke(qa_question)

In [49]:
## Adding memory to conversations
# ConversationalRetrievalChain is used here in place of RetrievalQA so that
# earlier turns of the chat can be passed in as context with each question.
conv_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever)

In [None]:
# Start a fresh conversation; the history is a list of (question, answer) tuples.
chat_history = []

# First turn (query: "What kind of posting is this?").
query = "어떤 공고지?"
result = conv_chain.invoke(dict(question=query, chat_history=chat_history))
print(result["answer"])

# Record this turn so the next question can refer back to it.
chat_history.extend([(query, result["answer"])])

In [None]:
# Follow-up turn that relies on the recorded history
# (query: "Please summarize this posting in Korean.").
query = "이 공고 한국어로 요약좀 해줘"
result = conv_chain.invoke(dict(question=query, chat_history=chat_history))

# print(result["answer"])

In [None]:
# Record the latest turn, then display the accumulated conversation.
chat_history.extend([(query, result["answer"])])
chat_history