In [None]:
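# RAG (Retrieval-Augmented Generation): a model is trained on a large amount of data, but it cannot access private data (e.g. a personal DB or documents).
# RAG combines two major NLP components: 'information retrieval' and 'response generation'.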

In [10]:
# Data loaders and splitters

from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Loader pointed at the chapter file (the document is not loaded in this cell).
loader = UnstructuredFileLoader(file_path="./files/chapter_one.docx")

# Recursive character splitter created with the library's default settings.
splitter = RecursiveCharacterTextSplitter()





In [9]:
# tiktoken

from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter

# Loader pointed at the chapter file (the document is not loaded in this cell).
loader = UnstructuredFileLoader(file_path="./files/chapter_one.docx")

# Configure CharacterTextSplitter.
# With LangChain's tiktoken-based text splitter, splitting text into chunks
# according to specific conditions can be done more efficiently.
splitter = CharacterTextSplitter.from_tiktoken_encoder(separator="\n", chunk_size=600, chunk_overlap=100)


In [8]:
# Splitting the document well up front matters: if the chunks are too large, a lot of
# data is sent to the LLM and the cost goes up; if the text is cut into too many small
# pieces, the wrong context ends up being passed to the LLM.
# Embedding: converting text into a form the computer can understand (text -> vector).
# The vectors are used to search for similar documents (vector store).
# Original note: OpenAI embeddings have more than 1,000 dimensions -> finer-grained representation.

from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import Chroma # Chroma vector store
from langchain.storage import LocalFileStore


# Local directory used as the byte store behind the cached embeddings.
cache_dir = LocalFileStore(root_path="./.cache/")

# Load the chapter and split it with the tiktoken-based character splitter.
loader = UnstructuredFileLoader(file_path="./files/chapter_one.txt")
splitter = CharacterTextSplitter.from_tiktoken_encoder(separator="\n", chunk_size=600, chunk_overlap=100)
docs = loader.load_and_split(splitter)

# Wrap the OpenAI embeddings with the local cache store.
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings=embeddings,
    document_embedding_cache=cache_dir,
)

# Index the chunks in Chroma and run a similarity search against them.
vectorstore = Chroma.from_documents(documents=docs, embedding=cached_embeddings)
results = vectorstore.similarity_search(query="where does winston live")

# Bare last expression so the notebook displays the matching documents.
results


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kyungminlee/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/kyungminlee/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


[Document(page_content="The Ministry of Love was the really frightening one. There were no windows in it at all. Winston had never been inside the Ministry of Love, nor within half a kilometre of it. It was a place impossible to enter except on official business, and then only by penetrating through a maze of barbed-wire entanglements, steel doors, and hidden machine-gun nests. Even the streets leading up to its outer barriers were roamed by gorilla-faced guards in black uniforms, armed with jointed truncheons.\nWinston turned round abruptly. He had set his features into the expression of quiet optimism which it was advisable to wear when facing the telescreen. He crossed the room into the tiny kitchen. By leaving the Ministry at this time of day he had sacrificed his lunch in the canteen, and he was aware that there was no food in the kitchen except a hunk of dark-coloured bread which had got to be saved for tomorrow's breakfast. He took down from the shelf a bottle of colourless liqu

In [11]:
# LLMChain is legacy; use LCEL (LangChain Expression Language) instead.
# stuff documents -> fills all of the documents into the prompt.
# map_rerank chain -> asks the question through a prompt and assigns a score.

from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS # FAISS vector store
from langchain.storage import LocalFileStore
from langchain.chains import RetrievalQA

llm = ChatOpenAI(temperature=0.1)

# Local directory used as the byte store behind the cached embeddings.
cache_dir = LocalFileStore(root_path="./.cache/")

# Load the chapter and split it with the tiktoken-based character splitter.
loader = UnstructuredFileLoader(file_path="./files/chapter_one.txt")
splitter = CharacterTextSplitter.from_tiktoken_encoder(separator="\n", chunk_size=600, chunk_overlap=100)
docs = loader.load_and_split(splitter)

# Wrap the OpenAI embeddings with the local cache store, then index the chunks in FAISS.
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings=embeddings,
    document_embedding_cache=cache_dir,
)
vectorstore = FAISS.from_documents(documents=docs, embedding=cached_embeddings)

# Only the chain type has to change to query with a different strategy.
chain = RetrievalQA.from_chain_type(llm=llm, chain_type="map_rerank", retriever=vectorstore.as_retriever())

# Bare last expression so the notebook displays the answer.
chain.run("Describe Victory Mansions")




'Victory Mansions is a building with a hallway that smells of boiled cabbage and old rag mats. It has a colored poster of a man\'s face, about forty-five years old, with a heavy black mustache and ruggedly handsome features. The building has seven flights of stairs, with a poster of an enormous face on each landing opposite the lift-shaft. The building has a telescreen that cannot be completely shut off, and the voice from it was reading out a list of figures related to pig-iron production. The building is described as having a fruity voice and a smallish, frail figure in blue overalls. The building is part of the economy drive in preparation for Hate Week, with the caption "BIG BROTHER IS WATCHING YOU" beneath the enormous face poster.'

In [12]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough


def format_docs(documents):
    """Join the text of the retrieved documents into one context string.

    Args:
        documents: iterable of objects exposing a ``page_content`` string
            (here, the documents returned by the retriever).

    Returns:
        The ``page_content`` values separated by blank lines; an empty
        string when no documents were retrieved.
    """
    return "\n\n".join(document.page_content for document in documents)


llm = ChatOpenAI(
    temperature=0.1,
)

# Local directory used as the byte store behind the cached embeddings.
cache_dir = LocalFileStore("./.cache/")

splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)
loader = UnstructuredFileLoader("./files/chapter_one.txt")

docs = loader.load_and_split(text_splitter=splitter)
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

vectorstore = FAISS.from_documents(docs, cached_embeddings)
# NOTE(review): the variable name `retriver` is kept as-is in case other cells reference it.
retriver = vectorstore.as_retriever()
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful assistant. Answer questions using only the following context. If you don't know the answer just say you don't know, don't make it up:\n\n{context}",
        ),
        ("human", "{question}"),
    ]
)

chain = (
    {
        # Pipe the retrieved documents through format_docs so {context} receives
        # the plain passage text instead of the repr of a list of Document objects.
        "context": retriver | format_docs,
        # RunnablePassthrough forwards the value given to invoke() unchanged.
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
)

# Bare last expression so the notebook displays the model's reply.
chain.invoke("Describe Victory Mansions")

AIMessage(content='Victory Mansions is a building where Winston Smith lives. It is described as having glass doors that let in gritty dust, a hallway that smells of boiled cabbage and old rag mats, and a poster of a large face with the caption "BIG BROTHER IS WATCHING YOU." The building has a faulty lift, forcing residents to climb stairs. The flat where Winston lives is seven flights up, and the building is part of the city of London in Airstrip One, a province of Oceania.')