In [12]:
!pip install -qU \
    langchain==0.0.354 \
    openai==1.6.1 \
    datasets==2.10.1 \
    pinecone-client==3.1.0 \
    tiktoken==0.5.2

In [13]:
!pip install python-dotenv



In [18]:
import os
from langchain.chat_models import ChatOpenAI

# Read the OpenAI key from the environment and fail fast with a readable
# message when it is missing. Assigning the result of os.getenv() straight
# back into os.environ raises "TypeError: str expected, not NoneType" for an
# unset variable, which hides the real problem.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment that starts "
        "the notebook kernel before running this cell."
    )

# Chat model client used by the rest of the notebook.
chat = ChatOpenAI(
    openai_api_key=openai_api_key,
    model='gpt-3.5-turbo'
)

In [15]:
def parse_file(file_path):
    """Return the full contents of a UTF-8 encoded text file as one string.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded text of the whole file.
    """
    with open(file_path, encoding='utf-8') as source:
        return source.read()

# Locate the source article relative to the directory the notebook runs from,
# then load it into memory as a single string.
project_path = os.getcwd()
file_path = os.path.join(
    project_path,
    'data',
    'An overview of the last 10 years of genetically engineered crop safety research.txt',
)
text = parse_file(file_path)



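Constant-size chunking: split the text into chunks of roughly 1,000 characters each, with a 100-character overlap between consecutive chunks.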

In [16]:
from langchain.text_splitter import NLTKTextSplitter
# Configure the splitter with a 1000 target chunk size and a 100 overlap,
# then cut the loaded article into a list of chunks.
text_splitter = NLTKTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
chunks = text_splitter.split_text(text)





In [27]:
from pinecone import Pinecone

# Pinecone credentials come from the environment; `api_key` is None when the
# variable is unset.
api_key = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=api_key)

In [28]:
from pinecone import ServerlessSpec

# Deployment target used when the index has to be created: serverless on AWS
# in the us-east-1 region.
spec = ServerlessSpec(cloud="aws", region="us-east-1")

In [29]:
import time

index_name = 'citation-checker'

# Names of all indexes currently visible to this Pinecone client.
existing_indexes = [info["name"] for info in pc.list_indexes()]

# Create the index only when it is missing (expected on the very first run).
if index_name not in existing_indexes:
    pc.create_index(
        index_name,
        dimension=1536,  # dimensionality of ada 002
        metric='dotproduct',
        spec=spec,
    )
    # Poll once per second until the index reports that it is ready.
    while True:
        if pc.describe_index(index_name).status['ready']:
            break
        time.sleep(1)

# Open a handle to the index, pause briefly, and display its current stats.
index = pc.Index(index_name)
time.sleep(1)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 79}},
 'total_vector_count': 79}

In [30]:
from langchain.embeddings.openai import OpenAIEmbeddings

# Embedding client; the model's 1536-dimensional vectors match the dimension
# the Pinecone index is created with.
embed_model = OpenAIEmbeddings(
    model="text-embedding-ada-002",
)

  warn_deprecated(


In [31]:
# Display how many chunks the splitter produced.
len(chunks)

79

In [32]:
# Embed every chunk in a single call; `res` holds one vector per chunk.
res = embed_model.embed_documents(chunks)
# Display (number of vectors, length of the first vector) as a sanity check.
(len(res), len(res[0]))

(79, 1536)

In [33]:
# One string id per embedded chunk: "0", "1", ...
ids = [str(i) for i in range(len(res))]
# Keep the chunk text (and its id) alongside each vector so a query match can
# be traced back to the passage it came from.
metadata = [{'chunk': s, 'index': i} for s, i in zip(chunks, ids)]

# Build the (id, vector, metadata) records up front and send them in batches:
# a single request carrying every vector grows with the document and will
# eventually exceed Pinecone's per-request limits.
vectors = list(zip(ids, res, metadata))
batch_size = 100
for start in range(0, len(vectors), batch_size):
    index.upsert(vectors=vectors[start:start + batch_size])

# Display the index stats after the upload.
index.describe_index_stats()




{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 79}},
 'total_vector_count': 79}