In [70]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
import os

In [2]:
from dotenv import load_dotenv
load_dotenv()
OPENAI_KEY = os.getenv('openai_key')
PINECONE_KEY = os.getenv('pinecone_key')
PINECONE_ENV = os.getenv('pinecone_env')

In [3]:
# set enviorn
os.environ['OPENAI_API_KEY'] = OPENAI_KEY
os.environ['PINECONE_INDEX_NAME'] = PINECONE_ENV
os.environ['PINECONE_API_KEY'] = PINECONE_KEY

In [5]:
loader=PyPDFDirectoryLoader('pdf')

In [6]:
data=loader.load()

In [7]:
data

[Document(page_content='YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object\ndetectors\nChien-Yao Wang1, Alexey Bochkovskiy, and Hong-Yuan Mark Liao1\n1Institute of Information Science, Academia Sinica, Taiwan\nkinyiu@iis.sinica.edu.tw, alexeyab84@gmail.com, and liao@iis.sinica.edu.tw\nAbstract\nYOLOv7 surpasses all known object detectors in both\nspeed and accuracy in the range from 5 FPS to 160 FPS\nand has the highest accuracy 56.8% AP among all known\nreal-time object detectors with 30 FPS or higher on GPU\nV100. YOLOv7-E6 object detector (56 FPS V100, 55.9%\nAP) outperforms both transformer-based detector SWIN-\nL Cascade-Mask R-CNN (9.2 FPS A100, 53.9% AP) by\n509% in speed and 2% in accuracy, and convolutional-\nbased detector ConvNeXt-XL Cascade-Mask R-CNN (8.6\nFPS A100, 55.2% AP) by 551% in speed and 0.7% AP\nin accuracy, as well as YOLOv7 outperforms: YOLOR,\nYOLOX, Scaled-YOLOv4, YOLOv5, DETR, Deformable\nDETR, DINO-5scale-R50, ViT-Adapter-B and

In [34]:
# divide the paragraph into token, the method
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,  chunk_overlap=20)

In [36]:
# chunks are split data
text_chunks = text_splitter.split_documents(data)

In [51]:
# need to use another pdf module to split the document into pure text
text_chunks2 = text_splitter.split_text(data)

TypeError: expected string or bytes-like object

In [10]:
len(text_chunks)

152

In [11]:
print(text_chunks[0].page_content)

YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object
detectors
Chien-Yao Wang1, Alexey Bochkovskiy, and Hong-Yuan Mark Liao1
1Institute of Information Science, Academia Sinica, Taiwan
kinyiu@iis.sinica.edu.tw, alexeyab84@gmail.com, and liao@iis.sinica.edu.tw
Abstract
YOLOv7 surpasses all known object detectors in both
speed and accuracy in the range from 5 FPS to 160 FPS
and has the highest accuracy 56.8% AP among all known


In [12]:
print(text_chunks[1].page_content)

real-time object detectors with 30 FPS or higher on GPU
V100. YOLOv7-E6 object detector (56 FPS V100, 55.9%
AP) outperforms both transformer-based detector SWIN-
L Cascade-Mask R-CNN (9.2 FPS A100, 53.9% AP) by
509% in speed and 2% in accuracy, and convolutional-
based detector ConvNeXt-XL Cascade-Mask R-CNN (8.6
FPS A100, 55.2% AP) by 551% in speed and 0.7% AP
in accuracy, as well as YOLOv7 outperforms: YOLOR,
YOLOX, Scaled-YOLOv4, YOLOv5, DETR, Deformable


## Check openai's embedding parameter's length

In [13]:
# openai embedding need the openai key
op_embed=OpenAIEmbeddings()

  warn_deprecated(


In [None]:
# transfer the query to vector
l1 = op_embed.embed_query('How are you')
l2 = op_embed.embed_query('What your name?')

In [None]:
# openai will have 1536 embedding parameter
len(l1)
len(l2)

## Create with existed pinecone index

In [43]:
vs = PineconeVectorStore(embedding=op_embed)

In [44]:
# use this function can add the text to pinecone
pinevec = vs.from_documents(
    documents=text_chunks, 
    index_name=PINECONE_ENV,
    embedding=op_embed
)

In [46]:
pinevec

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x14fdf6700>

In [42]:
# add the vector to existed pinecone
# pinevec.add_documents(text_chunks)

['4dbae29f-afdd-42c6-aa9e-b83865553135',
 'aa11dbea-2e18-4557-9fd7-41c0e55b7ad9',
 'f51d1ffa-2d88-45f4-8298-4f376004ada7',
 '11d74673-71bf-4c78-8dee-cf2de03fca2f',
 '92b08071-2dca-4567-8a13-5c7bd1aeda67',
 '3a5fb25c-52e6-4ed0-9f87-60e9b603368a',
 '2dfe54d6-a34d-4335-bacc-0c0551dbeed4',
 '0239eb80-ba7b-4c74-b8cf-344328d925b0',
 '9548eefc-e629-4287-83e7-ae81701f868a',
 '0f2db116-8b2f-46a9-adaa-bad58f96c604',
 '79b2832d-07a5-4339-9ee9-4ef92464d399',
 'dbe22dd4-ffa6-4d8b-8a94-ec3eb7a0794d',
 '6290a615-99d7-4954-a123-8458707db838',
 '3d7cdbd3-e2f3-4edb-b47a-f315abef40ca',
 '26143c74-52d3-484a-94f8-ea499223df99',
 '38ad7d6e-af4d-4f0b-9e4a-b9d83aecdea4',
 'ec61dcaf-49a7-4c27-b25d-cd7831520cc5',
 '1114be06-a423-44d5-a11f-7cd942e457e3',
 '087ba610-15e7-4450-879a-c2f31abe77e8',
 'aae6d5f3-5591-400a-b49a-eddda03657e9',
 '0634a3cc-fa04-4f95-895a-490a13862f49',
 'dff73645-5d81-4f48-94e1-52587876c3c4',
 '97941e5f-4b8b-4df9-92ed-2cbe5cba6ed6',
 '1cf0a892-a9eb-4250-9c4a-271300659500',
 '85209262-1220-

## Init the pinecone, and check related pinecone index info

In [59]:
from pinecone import Pinecone

pc = Pinecone(api_key=PINECONE_KEY)
# list all indexes
pc.list_indexes()

# list assign index
pc.describe_index(PINECONE_ENV)


{'dimension': 1536,
 'host': 'openaiapi-qjp3x74.svc.aped-4627-b74a.pinecone.io',
 'metric': 'cosine',
 'name': 'openaiapi',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'}}

## Create Index and set vector data

In [47]:
# from pinecone import Pinecone, ServerlessSpec

# pc = Pinecone(
#     api_key = PINECONE_KEY,
# )

# index_name = 'openaitest'

# # use create_index method for create, need to get the dimension and index name
# # can only create one time...
# pc.create_index(
#     name=index_name,
#     dimension=1536,
#     metric="cosine",
#     spec=ServerlessSpec(
#         cloud='aws', 
#         region='us-east-1'
#     ) 
# ) 

In [49]:
# vs2 = PineconeVectorStore(embedding=op_embed, index_name=index_name)

In [50]:
# need to have the pure texts that dealt by split_text 
# pinevec = vs2.from_texts(
#     texts=text_chunks, 
#     index_name=index_name,
#     embedding=op_embed
# )

TypeError: expected string or buffer

## compare the similarity

In [52]:
query = "Who is Ketanji Brown Jackson?"
vs.similarity_search(query)

[Document(page_content='[61] Joseph Redmon, Santosh Divvala, Ross Girshick, and Ali\nFarhadi. You only look once: Uniﬁed, real-time object de-\ntection. In Proceedings of the IEEE/CVF Conference on\nComputer Vision and Pattern Recognition (CVPR) , pages\n779–788, 2016. 2, 5\n[62] Joseph Redmon and Ali Farhadi. YOLO9000: better, faster,\nstronger. In Proceedings of the IEEE/CVF Conference on\nComputer Vision and Pattern Recognition (CVPR) , pages\n7263–7271, 2017. 2\n[63] Joseph Redmon and Ali Farhadi. YOLOv3: An incremental', metadata={'page': 13.0, 'source': 'pdf/yolov7.pdf'}),
 Document(page_content='[73] Mingxing Tan and Quoc Le. EfﬁcientNetv2: Smaller mod-\nels and faster training. In International Conference on Ma-\nchine Learning (ICML) , pages 10096–10106, 2021. 2\n[74] Mingxing Tan, Ruoming Pang, and Quoc V Le. Efﬁcient-\nDet: Scalable and efﬁcient object detection. In Proceedings\nof the IEEE/CVF Conference on Computer Vision and Pat-\ntern Recognition (CVPR) , pages 10781–107

In [62]:
query_text = 'YPLOv7 outperforms which models'
vs.similarity_search(query_text) 

[Document(page_content='is 127 fps faster and 10.7% more accurate on AP. In ad-\ndition, YOLOv7 has 51.4% AP at frame rate of 161 fps,\nwhile PPYOLOE-L with the same AP has only 78 fps frame\nrate. In terms of parameter usage, YOLOv7 is 41% less than\nPPYOLOE-L. If we compare YOLOv7-X with 114 fps in-\nference speed to YOLOv5-L (r6.1) with 99 fps inference\nspeed, YOLOv7-X can improve AP by 3.9%. If YOLOv7-\nX is compared with YOLOv5-X (r6.1) of similar scale, the\ninference speed of YOLOv7-X is 31 fps faster. In addi-', metadata={'page': 6.0, 'source': 'pdf/yolov7.pdf'}),
 Document(page_content='(56 FPS V100, 55.9% AP) outperforms both transformer-\nbased detector SWIN-L Cascade-Mask R-CNN (9.2 FPS\nA100, 53.9% AP) by 509% in speed and 2% in accuracy,and convolutional-based detector ConvNeXt-XL Cascade-\nMask R-CNN (8.6 FPS A100, 55.2% AP) by 551% in speed\nand 0.7% AP in accuracy, as well as YOLOv7 outperforms:\nYOLOR, YOLOX, Scaled-YOLOv4, YOLOv5, DETR, De-\nformable DETR, DINO-5sca

In [63]:
# find top 3 query
query = "YPLOv7 outperforms which models?"  
vs.similarity_search(  
    query,  # our search query  
    k=3  # return 3 most relevant docs  
) 

[Document(page_content='is 127 fps faster and 10.7% more accurate on AP. In ad-\ndition, YOLOv7 has 51.4% AP at frame rate of 161 fps,\nwhile PPYOLOE-L with the same AP has only 78 fps frame\nrate. In terms of parameter usage, YOLOv7 is 41% less than\nPPYOLOE-L. If we compare YOLOv7-X with 114 fps in-\nference speed to YOLOv5-L (r6.1) with 99 fps inference\nspeed, YOLOv7-X can improve AP by 3.9%. If YOLOv7-\nX is compared with YOLOv5-X (r6.1) of similar scale, the\ninference speed of YOLOv7-X is 31 fps faster. In addi-', metadata={'page': 6.0, 'source': 'pdf/yolov7.pdf'}),
 Document(page_content='(56 FPS V100, 55.9% AP) outperforms both transformer-\nbased detector SWIN-L Cascade-Mask R-CNN (9.2 FPS\nA100, 53.9% AP) by 509% in speed and 2% in accuracy,and convolutional-based detector ConvNeXt-XL Cascade-\nMask R-CNN (8.6 FPS A100, 55.2% AP) by 551% in speed\nand 0.7% AP in accuracy, as well as YOLOv7 outperforms:\nYOLOR, YOLOX, Scaled-YOLOv4, YOLOv5, DETR, De-\nformable DETR, DINO-5sca

## for QA with llm in embedding

In [71]:
llm = ChatOpenAI(
    openai_api_key=OPENAI_KEY,
    model_name='gpt-3.5-turbo',  
    temperature=0.0 
)

In [72]:
query='YOLOv7 outperforms which models'

In [74]:
qa = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff",
    retriever=vs.as_retriever() 
)

In [76]:
qa.invoke(query)

{'query': 'YOLOv7 outperforms which models',
 'result': 'YOLOv7 outperforms YOLOR-E6, YOLOv7-E6, YOLOv7-D6, YOLOv7-E6E, YOLOv5-L6 (r6.1), YOLOX-X, PPYOLOE-X, YOLOv5-X6 (r6.1), YOLOv7-E6E, YOLOv5-X (r6.1), YOLOR-CSP, YOLOR-CSP-X, YOLOv7-tiny-SiLU, YOLOv7, and YOLOv7-X models.'}

In [None]:
# before

import sys
while True:
  user_input = input(f"Input Prompt: ")
  if user_input == 'exit':
    print('Exiting')
    sys.exit()
  if user_input == '':
    continue
  result = qa.invoke({'query': user_input})
  print(f"Answer: {result['result']}")

In [1]:
# Simple input modules

# import ipywidgets as widgets
# from IPython.display import display

# def process_input(input_value):
#     # deal with user inpput
#     print("User input:", input_value)

# # create the text blank
# text_input = widgets.Text(description="Input:")

# # if blank change, it will detect
# text_input.observe(lambda change: process_input(change.new), names='value')

# # display the text below
# display(text_input)


Text(value='', description='Input:')

User input: h
User input: he
User input: hel
User input: hell
User input: hello


## Delete the index

In [None]:
# vs.delete_index(index_name)