In [37]:
!pip install pinecone langchain langchain_pinecone langchain-openai langchain-community pypdf python-dotenv

Collecting langchain-openai
  Downloading langchain_openai-0.2.0-py3-none-any.whl.metadata (2.6 kB)
Downloading langchain_openai-0.2.0-py3-none-any.whl (51 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.5/51.5 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: langchain-openai
Successfully installed langchain-openai-0.2.0


In [38]:
import os
from dotenv import load_dotenv
import pinecone
from pinecone import ServerlessSpec
from pinecone import Pinecone, ServerlessSpec
from langchain.text_splitter import RecursiveCharacterTextSplitter # To split the text into smaller chunks
from langchain_openai import OpenAIEmbeddings # To create embeddings
from langchain_pinecone import PineconeVectorStore # To connect with the Vectorstore
from langchain_community.document_loaders import DirectoryLoader # To load files in a directory
from langchain_community.document_loaders import PyPDFLoader # To parse the PDFs

In [39]:
# os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY")
os.environ["OPENAI_API_KEY"] = ""
os.environ["PINECONE_API_KEY"] = "cbf01feb-d619-452b-a8a5-b4f5a4a7e9ae"

In [40]:
index_name = "test" #give the name to your index, or you can use an index which you created previously and load that.
#here we are using the new fresh index name
pc = Pinecone(api_key="cbf01feb-d619-452b-a8a5-b4f5a4a7e9ae")
#Get your Pinecone API key to connect after successful login and put it here.
pc

<pinecone.control.pinecone.Pinecone at 0x7eedddfbd780>

In [43]:
import time

if index_name in pc.list_indexes().names():
  print("index already exists" , index_name)
  index= pc.Index(index_name) #your index which is already existing and is ready to use
  print(index.describe_index_stats())

else: #crate a new index with specs
  pc.create_index(
  name=index_name,
  dimension=1536, # Replace with your model dimensions
  metric="cosine", # Replace with your model metric
  spec=ServerlessSpec(
  cloud="aws",
  region="us-east-1")
)

while not pc.describe_index(index_name).status["ready"]:
  time.sleep(1)
index= pc.Index(index_name)
print("index created")
print(index.describe_index_stats())


index already exists test
{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}
index created
{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


In [45]:
DATA_DIR_PATH = "/content/pdfs" # Directory containing our PDF files
CHUNK_SIZE = 1024 # Size of each text chunk for processing
CHUNK_OVERLAP = 0 # Amount of overlap between chunks
INDEX_NAME = index_name # Name of our Pinecone index

In [46]:
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
loader = DirectoryLoader(
    path=DATA_DIR_PATH, # Directory containing our PDFs
    glob="**/*.pdf", # Pattern to match PDF files (including subdirectories)
    loader_cls=PyPDFLoader # Specifies we're loading PDF files
)
docs = loader.load()# This loads all matching PDF files
print(f"Total Documents loaded: {len(docs)}")

Total Documents loaded: 18


In [48]:
docs[0]

Document(metadata={'source': '/content/pdfs/rachelgreecv.pdf', 'page': 0}, page_content='3 grad.illinois.edu/CareerDevelopment Rachel Green  \n2 1 0  W .  G R E E N  S T . ,  C H A M P A I G N ,  I L  \n( 2 1 7 )  5 5 5 - 1 2 3 4  •  R S T U D E N T @ I L L I N O I S . E D U  \nEDUCATION  \nPhD in English May 20xx \nUniversity of Illinois at Urbana-Champaign \nDissertation title:  “Down on the Farm: World War One and the Emergence of Literary  \nModernism in the American South”  \nCommittee : Margaret Black, Naomi Blue, John Jay, Robert Roberts (Chair) \nMA in English  20xx \nUniversity of Illinois at Urbana-Champaign \nBA in English and Communications, summa cum laude  20xx \nButler University, Indianapolis, IN  \nTEACHING  & A DVISING   \nComposition Instructor  20xx-present \nResearch Writing Program, University of Illinois \n\uf0b7Facilitator for seven sections of English composition.\n\uf0b7Planned and taught a writing-intensive course based upon current events.\n\uf0b7Used instru

In [50]:
type(docs[14])

In [51]:
# we can convert the Document object to a python dict using the .dict() method.
print(f"keys associated with a Document: {docs[0].dict().keys()}")

keys associated with a Document: dict_keys(['id', 'metadata', 'page_content', 'type'])


In [52]:
print(f"{'-'*15}\nFirst 100 charachters of the page content: {docs[0].page_content[:100]}\n{'-'*15}")
print(f"Metadata associated with the document: {docs[0].metadata}\n{'-'*15}")
print(f"Datatype of the document: {docs[0].type}\n{'-'*15}")

---------------
First 100 charachters of the page content: 3 grad.illinois.edu/CareerDevelopment Rachel Green  
2 1 0  W .  G R E E N  S T . ,  C H A M P A I G
---------------
Metadata associated with the document: {'source': '/content/pdfs/rachelgreecv.pdf', 'page': 0}
---------------
Datatype of the document: Document
---------------


In [53]:
#  We loop through each document and add additional metadata - filename, quarter, and year
for doc in docs:
  filename = doc.dict()['metadata']['source'].split("\\")[-1]
  #quarter = doc.dict()['metadata']['source'].split("\\")[-2]
  #year = doc.dict()['metadata']['source'].split("\\")[-3]
  doc.metadata = {"filename": filename, "source": doc.dict()['metadata']['source'], "page": doc.dict()['metadata']['page']}

# To veryfy that the metadata is indeed added to the document
print(f"Metadata associated with the document: {docs[0].metadata}\n{'-'*15}")
print(f"Metadata associated with the document: {docs[1].metadata}\n{'-'*15}")
print(f"Metadata associated with the document: {docs[2].metadata}\n{'-'*15}")
print(f"Metadata associated with the document: {docs[3].metadata}\n{'-'*15}")

Metadata associated with the document: {'filename': '/content/pdfs/rachelgreecv.pdf', 'source': '/content/pdfs/rachelgreecv.pdf', 'page': 0}
---------------
Metadata associated with the document: {'filename': '/content/pdfs/rachelgreecv.pdf', 'source': '/content/pdfs/rachelgreecv.pdf', 'page': 1}
---------------
Metadata associated with the document: {'filename': '/content/pdfs/rachelgreecv.pdf', 'source': '/content/pdfs/rachelgreecv.pdf', 'page': 2}
---------------
Metadata associated with the document: {'filename': '/content/pdfs/yolov7paper.pdf', 'source': '/content/pdfs/yolov7paper.pdf', 'page': 0}
---------------


In [54]:
for i in range(len(docs)) :
  print(f"Metadata associated with the document: {docs[i].metadata}\n{'-'*15}")

Metadata associated with the document: {'filename': '/content/pdfs/rachelgreecv.pdf', 'source': '/content/pdfs/rachelgreecv.pdf', 'page': 0}
---------------
Metadata associated with the document: {'filename': '/content/pdfs/rachelgreecv.pdf', 'source': '/content/pdfs/rachelgreecv.pdf', 'page': 1}
---------------
Metadata associated with the document: {'filename': '/content/pdfs/rachelgreecv.pdf', 'source': '/content/pdfs/rachelgreecv.pdf', 'page': 2}
---------------
Metadata associated with the document: {'filename': '/content/pdfs/yolov7paper.pdf', 'source': '/content/pdfs/yolov7paper.pdf', 'page': 0}
---------------
Metadata associated with the document: {'filename': '/content/pdfs/yolov7paper.pdf', 'source': '/content/pdfs/yolov7paper.pdf', 'page': 1}
---------------
Metadata associated with the document: {'filename': '/content/pdfs/yolov7paper.pdf', 'source': '/content/pdfs/yolov7paper.pdf', 'page': 2}
---------------
Metadata associated with the document: {'filename': '/content/pd

In [55]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
  chunk_size=1024,
  chunk_overlap=0
)
documents = text_splitter.split_documents(docs)

In [56]:
# Split text into chunks
text_splitter = RecursiveCharacterTextSplitter(
  chunk_size=CHUNK_SIZE,
  chunk_overlap=CHUNK_OVERLAP
)
documents = text_splitter.split_documents(docs)
len(docs), len(documents)
#output ;
(25, 118)

(25, 118)

In [57]:
embeddings = OpenAIEmbeddings(model = "text-embedding-ada-002") # Initialize the embedding model
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x7eedd06f02b0>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x7eedd06f1690>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [59]:
docs_already_in_pinecone = input("Are the vectors already added in DB: (Type Y/N)")
# check if the documents were already added to the vector database
if docs_already_in_pinecone == "Y" or docs_already_in_pinecone == "y":
  docsearch = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings)
  print("Existing Vectorstore is loaded")
# if not then add the documents to the vectore db
elif docs_already_in_pinecone == "N" or docs_already_in_pinecone == "n":
  docsearch = PineconeVectorStore.from_documents(documents, embeddings, index_name=index_name)
  print("New vectorstore is created and loaded")
else:
  print("Please type Y - for yes and N - for no")

Are the vectors already added in DB: (Type Y/N)N
New vectorstore is created and loaded


In [60]:
#Using the Vector Store for Retrieval
# Here we are defing how to use the loaded vectorstore as retriver
retriver = docsearch.as_retriever()
retriver.invoke("what is yolo?")#

[Document(id='47a095b0-cfc4-4478-931d-5dc8f242f932', metadata={'filename': '/content/pdfs/yolov7paper.pdf', 'page': 11.0, 'source': '/content/pdfs/yolov7paper.pdf'}, page_content='[23] Jocher Glenn. YOLOv5 release v6.1. https://github.com/\nultralytics/yolov5/releases/tag/v6.1, 2022. 2, 7, 10\n[24] Shuxuan Guo, Jose M Alvarez, and Mathieu Salzmann. Ex-\npandNets: Linear over-parameterization to train compact\nconvolutional networks. Advances in Neural Information\nProcessing Systems (NeurIPS) , 33:1298–1310, 2020. 2\n[25] Kai Han, Yunhe Wang, Qi Tian, Jianyuan Guo, Chunjing\nXu, and Chang Xu. GhostNet: More features from cheap\noperations. In Proceedings of the IEEE/CVF Conference on\nComputer Vision and Pattern Recognition (CVPR) , pages\n1580–1589, 2020. 1\n[26] Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun.\nDeep residual learning for image recognition. In Proceed-\n12'),
 Document(id='05fc01f2-1393-4ea2-8c2b-65ecec955741', metadata={'filename': '/content/pdfs/yolov7paper.pd

In [62]:
query = "YOLOv7 outperforms which models"

docs = docsearch.similarity_search(query, k=3)

docs

[Document(id='df94cf4f-a7be-43b3-a64b-b0e9144ed0d3', metadata={'filename': '/content/pdfs/yolov7paper.pdf', 'page': 9.0, 'source': '/content/pdfs/yolov7paper.pdf'}, page_content='YOLOv7 surpasses all known object detectors in both\nspeed and accuracy in the range from 5 FPS to 160 FPS and\nhas the highest accuracy 56.8% AP test-dev / 56.8% AP\nmin-val among all known real-time object detectors with 30\nFPS or higher on GPU V100. YOLOv7-E6 object detector\n(56 FPS V100, 55.9% AP) outperforms both transformer-\nbased detector SWIN-L Cascade-Mask R-CNN (9.2 FPS\nA100, 53.9% AP) by 509% in speed and 2% in accuracy,and convolutional-based detector ConvNeXt-XL Cascade-\nMask R-CNN (8.6 FPS A100, 55.2% AP) by 551% in speed\nand 0.7% AP in accuracy, as well as YOLOv7 outperforms:\nYOLOR, YOLOX, Scaled-YOLOv4, YOLOv5, DETR, De-\nformable DETR, DINO-5scale-R50, ViT-Adapter-B and\nmany other object detectors in speed and accuracy. More\nover, we train YOLOv7 only on MS COCO dataset from\nscratch 

In [63]:
llm = OpenAI()

  llm = OpenAI()


In [64]:
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever())


In [65]:
query = "YOLOv7 outperforms which models"

In [66]:
qa.run(query)

  qa.run(query)


' YOLOR, YOLOX, Scaled-YOLOv4, YOLOv5, DETR, Deformable DETR, DINO-5scale-R50, ViT-Adapter-B and many other object detectors.'

In [67]:
query = "Rachel Green Experience"
qa.run(query)

' Rachel Green has experience as a composition instructor and has received various awards and honors for her academic achievements. She has also presented at several conferences and has publications in academic journals and books.'

In [68]:
import sys

while True:
  user_input = input(f"Input Prompt: ")
  if user_input == 'exit':
    print('Exiting')
    sys.exit()
  if user_input == '':
    continue
  result = qa({'query': user_input})
  print(f"Answer: {result['result']}")

Input Prompt: what is yolo v7


  result = qa({'query': user_input})


Answer:  YOLOv7 is a real-time object detection model that has surpassed all known object detectors in both speed and accuracy in the range from 5 FPS to 160 FPS. It has the highest accuracy of 56.8% AP among all known real-time object detectors with 30 FPS or higher on GPU V100. It is a trainable bag-of-freebies that sets a new state-of-the-art for real-time object detectors.
Input Prompt: exit
Exiting


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
# https://www.analyticsvidhya.com/blog/2024/06/pinecone-vector-databases/