In [11]:
from dotenv import load_dotenv
import os

# Pull the variables defined in the local .env file into the process environment.
load_dotenv()

# Read the settings this notebook expects; each one is None when the variable is unset.
API_KEY = os.getenv('API_KEY')
ACCESS_KEY = os.getenv('ACCESS_KEY')
SERVER = os.getenv('SERVER')

In [12]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Point the loader at the plain-text copy of the paper and build a recursive
# character splitter that relies entirely on its default settings.
loader = UnstructuredFileLoader(file_path="./files/attention_is_all_you_need.txt")
splitter = RecursiveCharacterTextSplitter()

# Final expression of the cell, so the resulting list of Document chunks is displayed.
loader.load_and_split(splitter)

[Document(page_content='The dominant sequence transduction models are based on complex recurrent or\n\nconvolutional neural networks that include an encoder and a decoder. The best\n\nperforming models also connect the encoder and decoder through an attention\n\nmechanism. We propose a new simple network architecture, the Transformer,\n\nbased solely on attention mechanisms, dispensing with recurrence and convolutions\n\nentirely. Experiments on two machine translation tasks show these models to\n\nbe superior in quality while being more parallelizable and requiring significantly\n\nless time to train. Our model achieves 28.4 BLEU on the WMT 2014 Englishto-German translation task, improving over the existing best results, including\n\nensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task,\n\nour model establishes a new single-model state-of-the-art BLEU score of 41.8 after\n\ntraining for 3.5 days on eight GPUs, a small fraction of the training costs of the\n\nb

In [6]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

# Same text file as before, now split on single newlines with an explicit
# chunk size (600) and an overlap (100) shared between neighbouring chunks.
loader = UnstructuredFileLoader(file_path="./files/attention_is_all_you_need.txt")
splitter = CharacterTextSplitter(
    chunk_size=600,
    chunk_overlap=100,
    separator="\n",
)

# Final expression of the cell, so the chunk list is displayed.
loader.load_and_split(splitter)

[Document(page_content='The dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer,\nbased solely on attention mechanisms, dispensing with recurrence and convolutions\nentirely. Experiments on two machine translation tasks show these models to\nbe superior in quality while being more parallelizable and requiring significantly', metadata={'source': './files/attention_is_all_you_need.txt'}),
 Document(page_content='be superior in quality while being more parallelizable and requiring significantly\nless time to train. Our model achieves 28.4 BLEU on the WMT 2014 Englishto-German translation task, improving over the existing best results, including\nensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task,\nour model establish

In [5]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load the PDF version of the paper and cut it into small pieces
# (chunk_size=200) that overlap by 50 so neighbouring chunks share context.
loader = UnstructuredFileLoader(file_path="./files/attention_is_all_you_need.pdf")
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=200,
)

# Final expression of the cell, so the chunk list is displayed.
loader.load_and_split(splitter)

[Document(page_content='Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works.\n\nAttention Is All You Need', metadata={'source': './files/attention_is_all_you_need.pdf'}),
 Document(page_content='Attention Is All You Need\n\n3 2 0 2\n\ng u A 2\n\nAshish Vaswani∗ Google Brain avaswani@google.com\n\nLlion Jones∗ Google Research llion@google.com\n\nNoam Shazeer∗ Google Brain noam@google.com', metadata={'source': './files/attention_is_all_you_need.pdf'}),
 Document(page_content='Noam Shazeer∗ Google Brain noam@google.com\n\nNiki Parmar∗ Google Research nikip@google.com\n\nJakob Uszkoreit∗ Google Research usz@google.com', metadata={'source': './files/attention_is_all_you_need.pdf'}),
 Document(page_content='Jakob Uszkoreit∗ Google Research usz@google.com\n\nAidan N. Gomez∗ † University of Toronto aidan@cs.toronto.edu\n\nŁukasz Kaiser∗ Google Brain lukaszkaiser@google.com\n

In [5]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter

# Load the text file and split it with a splitter built through the tiktoken
# encoder constructor.
# NOTE(review): chunk_size / chunk_overlap are presumably counted in tokens
# rather than characters with this constructor — confirm against the LangChain docs.
loader = UnstructuredFileLoader(file_path="./files/attention_is_all_you_need.txt")
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=600,
    chunk_overlap=100,
    separator="\n",
)

# Final expression of the cell, so the chunk list is displayed.
loader.load_and_split(splitter)

[Document(page_content='The dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer,\nbased solely on attention mechanisms, dispensing with recurrence and convolutions\nentirely. Experiments on two machine translation tasks show these models to\nbe superior in quality while being more parallelizable and requiring significantly\nless time to train. Our model achieves 28.4 BLEU on the WMT 2014 Englishto-German translation task, improving over the existing best results, including\nensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task,\nour model establishes a new single-model state-of-the-art BLEU score of 41.8 after\ntraining for 3.5 days on eight GPUs, a small fraction of the training costs of the\nbest models from the li

In [9]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

embedder = OpenAIEmbeddings()

# Embed a single query string and report the vector's length. The length is
# printed explicitly: a bare `len(...)` in the middle of a cell is evaluated
# and thrown away, because only a cell's last expression is displayed.
query_vector = embedder.embed_query("Hi")
print(len(query_vector))

# Embed several texts in one call. The result is indexed as one vector per
# input text, so it gets its own name instead of reusing the query's.
doc_vectors = embedder.embed_documents(['how', 'are', 'you longer sentences'])
print(len(doc_vectors), len(doc_vectors[0]))

3 1536


In [13]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import Chroma
from langchain.storage import LocalFileStore

# File-backed byte store rooted at ./.cache/; handed to the embedding cache below.
cache_dir = LocalFileStore("./.cache/")

# Load the paper's text file and split it with the tiktoken-based splitter.
loader = UnstructuredFileLoader(file_path="./files/attention_is_all_you_need.txt")
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=600,
    chunk_overlap=100,
    separator="\n",
)
docs = loader.load_and_split(splitter)

# Wrap the OpenAI embedder so it is backed by the byte store created above.
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

# Build the Chroma vector store from the chunks using the cache-backed embedder.
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=cached_embeddings,
)

In [15]:
# Ask the vector store for the stored chunks most similar to the question and
# print the returned Document list.
# NOTE: the recorded output shows the default request of 4 results being
# reduced to 1 because the index held only a single element.
results = vectorstore.similarity_search("what is the topic?")
print(results)

Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1


[Document(page_content='The dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer,\nbased solely on attention mechanisms, dispensing with recurrence and convolutions\nentirely. Experiments on two machine translation tasks show these models to\nbe superior in quality while being more parallelizable and requiring significantly\nless time to train. Our model achieves 28.4 BLEU on the WMT 2014 Englishto-German translation task, improving over the existing best results, including\nensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task,\nour model establishes a new single-model state-of-the-art BLEU score of 41.8 after\ntraining for 3.5 days on eight GPUs, a small fraction of the training costs of the\nbest models from the li

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.chains import RetrievalQA

# Chat model handed to the question-answering chain below.
llm = ChatOpenAI()

# File-backed byte store rooted at ./.cache/; handed to the embedding cache below.
cache_dir = LocalFileStore("./.cache/")

# Load the paper's text file and split it with the tiktoken-based splitter.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)
loader = UnstructuredFileLoader("./files/attention_is_all_you_need.txt")

docs = loader.load_and_split(text_splitter=splitter)

# Wrap the OpenAI embedder so it is backed by the byte store created above.
embeddings = OpenAIEmbeddings()

cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

# Index the chunks in a FAISS vector store using the cache-backed embedder.
vectorstore = FAISS.from_documents(docs, cached_embeddings)

chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="map_rerank",
    # retriever: selects and fetches the stored chunks used to answer the question
    retriever=vectorstore.as_retriever(),
)

# The question has to be about the document that was actually indexed (the
# Transformer paper); a question about unrelated material gives the retriever
# nothing relevant to return.
chain.run("Describe the Transformer")