### Installation

In [None]:
!pip install langchain
!pip install unstructured
!pip install openai
!pip install chromadb
!pip install Cython
!pip install tiktoken

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting langchain
  Downloading langchain-0.0.216-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.6.0,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.5.8-py3-none-any.whl (26 kB)
Collecting langchainplus-sdk>=0.0.17 (from langchain)
  Downloading langchainplus_sdk-0.0.17-py3-none-any.whl (25 kB)
Collecting openapi-schema-pydantic<2.0,>=1.2 (from langchain)
  Downloading openapi_schema_pydantic-1.2.4-py3-none-any.whl (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.0/90.0 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
Collecting marshmallow<4.0.0,>=3.3.0 (from dataclasses-json<0.6.0,>=0.5.7->langchain)
  Downloading marshmallow-3.19.0-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.1

### Load Required Packages

In [None]:
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.indexes import VectorstoreIndexCreator

### OpenAI API Key

In [None]:
# Get your API key from OpenAI; you will need to create an account.
# Here is the link to get the keys: https://platform.openai.com/account/billing/overview
import os
from getpass import getpass

# Never store a real key in the notebook: reuse a key that is already present in
# the environment, otherwise prompt for it without echoing it into the output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

### Connect Google Drive

In [None]:
from google.colab import drive

# Folder inside the mount under which the notebook looks for its data.
root_dir = "/content/gdrive/My Drive/"

# Attach Google Drive to the Colab filesystem; force_remount=True is passed so
# the mount is redone even when a previous mount is still around.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
# List all files in the Google Drive data folder.
# root_dir already ends with '/', so f'{root_dir}/data/' produced a doubled
# slash in every path; os.path.join normalises the separator, and the empty
# last component keeps the trailing '/' the original value had.
pdf_folder_path = os.path.join(root_dir, 'data', '')
os.listdir(pdf_folder_path)

['test1.pdf', 'test2.pdf', 'test3.pdf', 'test4.pdf']

### Load Multiple PDF files

In [None]:
# Build one UnstructuredPDFLoader per PDF in the data folder.
# Only *.pdf entries are kept so that stray files or sub-folders are not handed
# to the PDF loader; sorting makes the loader order deterministic.
pdf_file_names = sorted(
    fn for fn in os.listdir(pdf_folder_path) if fn.lower().endswith('.pdf')
)
loaders = [UnstructuredPDFLoader(os.path.join(pdf_folder_path, fn)) for fn in pdf_file_names]

In [None]:
# Display the loader list: one UnstructuredPDFLoader per file picked up from pdf_folder_path.
loaders

[<langchain.document_loaders.pdf.UnstructuredPDFLoader at 0x7f60c0935300>,
 <langchain.document_loaders.pdf.UnstructuredPDFLoader at 0x7f60c0935030>,
 <langchain.document_loaders.pdf.UnstructuredPDFLoader at 0x7f60c0935060>,
 <langchain.document_loaders.pdf.UnstructuredPDFLoader at 0x7f60c0935000>]

### Vector Store
Chroma is used as the vector store to index and search the embeddings.


There are three main steps going on after the documents are loaded:

- Splitting documents into chunks

- Creating embeddings for each document

- Storing documents and embeddings in a vectorstore


In [None]:
# Build the vector index from every PDF loader (the documents are split into
# chunks, embedded, and stored in the vector store, as described above).
index_creator = VectorstoreIndexCreator()
index = index_creator.from_loaders(loaders)
index

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
!pip install PyPDF2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
import PyPDF2

In [None]:
# Ask for the question to run against the indexed documents.
# The prompt tells the user what to type, and surrounding whitespace is
# stripped so it is not sent as part of the query.
query_vector = input("Enter your question about the documents: ").strip()

In [None]:
# Run the user's question (query_vector) against the index and keep the answer
# string in `result` so later cells can reuse it; the bare `result` on the last
# line displays the answer as the cell output.
result = index.query(query_vector)
result

' Document errors are mistakes made when creating or using documents, such as using ditto marks, using signature stamps, failing to use ink as specified by procedure, or incorrect ink used for entries causing illegible writing.'

In [None]:
# Point at the Drive data folder again and list its contents.
# The path is derived from root_dir instead of repeating the absolute path, so
# it stays in sync with the mount cell; the empty last component keeps the
# trailing '/' (the value is still '/content/gdrive/My Drive/data/').
pdf_folder_path = os.path.join(root_dir, 'data', '')
os.listdir(pdf_folder_path)

['test1.pdf', 'test2.pdf', 'test3.pdf']

In [None]:
# Rebuild the loaders and the vector index from the current folder contents.
# Only *.pdf entries are kept so that stray files or sub-folders are not handed
# to the PDF loader; sorting makes the loader order deterministic.
pdf_file_names = sorted(
    fn for fn in os.listdir(pdf_folder_path) if fn.lower().endswith('.pdf')
)
loaders = [UnstructuredPDFLoader(os.path.join(pdf_folder_path, fn)) for fn in pdf_file_names]
index = VectorstoreIndexCreator().from_loaders(loaders)
index



VectorStoreIndexWrapper(vectorstore=<langchain.vectorstores.chroma.Chroma object at 0x7f59043d1040>)

In [None]:
# Ask the rebuilt index the user's original question.
# The question text (query_vector) is passed rather than `result`: `result`
# holds the previous answer, and sending an answer back in as the query only
# makes the model confirm its own statement.
index.query(query_vector)

' Yes, document errors can include using ditto marks, using signature stamps, failing to use ink as specified by procedure, or incorrect ink used for entries causing illegible writing.'

In [None]:
# Same question, but also return which document(s) the answer was drawn from.
# query_vector (the question) is used instead of `result` (the earlier answer),
# so the returned 'question' field is what the user actually asked.
index.query_with_sources(query_vector)

{'question': 'What are document errors',
 'answer': ' Document errors are mistakes made in the creation and use of documentation, such as incorrect ink used for entries, use of ditto marks, use of signature stamps, and failure to use ink as specified by procedure.\n',
 'sources': '/content/gdrive/My Drive//data/test3.pdf'}

In [None]:
# The index wrapper has no `sources` method (see LangChain's
# VectorStoreIndexWrapper); the source documents are returned under the
# 'sources' key of query_with_sources().
index.query_with_sources(query_vector)['sources']

NameError: ignored