In [15]:
%load_ext dotenv
%dotenv

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [16]:
from langchain_community.document_loaders import PyPDFLoader
import copy
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_unstructured import UnstructuredLoader
import os
from langchain_text_splitters.character import CharacterTextSplitter
from langchain_chroma import Chroma

In [17]:
# Relative path of the PDF manual that this notebook loads.
file_path = "civ_4_manual.pdf"

In [18]:
# Reuse file_path so the PDF location is defined in exactly one place.
loader_pdf = PyPDFLoader(file_path)

In [19]:
# Read the PDF; the resulting collection is iterated item by item in later
# cells, each item exposing its text as .page_content.
pages_pdf = loader_pdf.load()

In [None]:
# Preview only the first few entries; echoing the whole list floods the
# notebook output and bloats the saved file.
pages_pdf[:3]

In [21]:
# Work on a deep copy: the whitespace clean-up that follows overwrites
# page_content in place, and the copy keeps pages_pdf untouched.
pages_pdf_cut = copy.deepcopy(pages_pdf)

In [22]:
# Collapse every run of whitespace (newlines included) into a single space
# by splitting each page's text into words and re-joining them.
for page in pages_pdf_cut:
    words = page.page_content.split()
    page.page_content = " ".join(words)

In [23]:
# Splitter configured with an empty separator, a chunk size of 500 and an
# overlap of 50 between neighbouring chunks.
char_splitter = CharacterTextSplitter(separator="", chunk_size=500, chunk_overlap=50)

# Apply the splitter to the whitespace-normalised pages.
document_char_split = char_splitter.split_documents(pages_pdf_cut)

In [24]:
# Number of chunks produced by the splitter.
len(document_char_split)

736

In [25]:
# Embedding model shared by the cell that writes the vector store and the
# cell that re-opens it from disk.
embedding = OpenAIEmbeddings(model="text-embedding-ada-002")

In [26]:
# Open (or create) the persisted collection first instead of calling
# Chroma.from_documents unconditionally: from_documents assigns fresh ids on
# every call, so each re-run of this cell would append another full copy of
# the chunks to ./civ-docs.
vectorstore = Chroma(persist_directory = "./civ-docs",
                     embedding_function = embedding)

# Embed and insert the chunks only when the collection is still empty.
# Delete the ./civ-docs directory to force a rebuild from the current chunks.
if not vectorstore.get(limit = 1)["ids"]:
    vectorstore.add_documents(documents = document_char_split)

In [28]:
# Re-open the collection persisted under ./civ-docs, passing the same
# embedding object that was used when the store was written.
vectorstore_from_directory = Chroma(persist_directory = "./civ-docs", 
                                    embedding_function = embedding)

In [29]:
# Fetch the stored records without passing any id or filter.
# NOTE(review): this echoes the whole collection into the notebook output;
# consider a limited call such as get(limit=...) for a preview.
vectorstore_from_directory.get()

{'ids': ['bba64f22-f1cf-4164-9e27-b252d129be65',
  '379f6ada-7dae-4484-bfc7-6c7bb45db65b',
  '0bb10106-321c-495b-9fb0-9a403a3627fd',
  '11c32cd2-fa0b-4ddd-9f29-7e2f82e7f81e',
  '4d40ce70-2513-4036-bde8-69ac60886f5e',
  '4c4be844-5a11-44f8-b807-bb19d73012b3',
  'f054f301-23e6-40ed-b8ae-d20a2fa53ee1',
  'd8647656-864a-4903-a392-14eeeed9d26a',
  '5960eb5b-e382-41d9-8e24-78b5235520b4',
  'fa07c61a-95cc-4d29-8a6a-bec93fc65d7a',
  '05b885a3-7775-4d40-bc32-3fac60281a27',
  '3bbb7b75-e6ce-4cbd-bcc4-98ef7b51b33a',
  '45153e1b-4a87-436b-b950-4e6ad96f7abb',
  'f9ac6722-ff45-435b-a4c6-8838aa44a9e4',
  '6db54af6-79a2-4511-8d08-eda5860d01c2',
  'f3367d54-3143-461f-bfc2-43f4026eedd7',
  'ab641cd7-19db-48bb-9bd1-515e14acecab',
  'd5df775b-46e0-4a2b-913c-cd5aa23a0638',
  '7aa309b2-8537-4ee9-beeb-8da1b65e9513',
  'a6da9299-6c3b-4f26-b2b5-defbb194d922',
  '7c4345cd-ff43-44b3-883e-c0e0fc1b1b1e',
  'e19ada8c-aa8a-4ef0-84e4-7436b575c7d1',
  'ddab3b84-8817-4ef4-bf5f-e60780318acb',
  '8e48bd03-1a3e-4fdd-a0e2-

In [30]:
# Look up a single record by its id.
# NOTE(review): this id is copied from the listing printed above and looks
# auto-generated; presumably it will not exist in a rebuilt store — confirm.
vectorstore_from_directory.get("a6da9299-6c3b-4f26-b2b5-defbb194d922")

{'ids': ['a6da9299-6c3b-4f26-b2b5-defbb194d922'],
 'embeddings': None,
 'documents': ['2kgames.com/civ4/support.htm\n8THE TUTORIAL\nCivilization IV is a big game. T o ease the learning curve, we\nhave provided a tutorial to teach you the basics of controlling\nyour empire.We seriously suggest that you check it out, espe-\ncially if you’re new to the world of Civilization.\nWHAT’S IN THE TUTORIAL\nThe tutorial is designed to teach the novice to play Civilization\nIV.The tutorial describes the interface, explains the basic con-\ncepts in the game, and shows you what you need to do to win'],
 'uris': None,
 'data': None,
 'metadatas': [{'creationdate': '2006-08-17T16:34:17-04:00',
   'creator': 'PyPDF',
   'moddate': '2006-08-17T16:35:50-04:00',
   'page': 5,
   'page_label': '6',
   'producer': 'Acrobat Distiller 7.0.5 for Macintosh',
   'source': 'civ_4_manual.pdf',
   'title': 'untitled',
   'total_pages': 116}],
 'included': [<IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metada