In [12]:
!pip install -qU \
    langchain==0.0.354 \
    openai==1.6.1 \
    datasets==2.10.1 \
    pinecone-client==3.1.0 \
    tiktoken==0.5.2

In [13]:
!pip install python-dotenv



In [18]:
import os
from langchain.chat_models import ChatOpenAI

# Read the OpenAI credential from the environment and fail fast with a clear message.
# Writing the result of os.getenv() straight back into os.environ does nothing when
# the key is set and raises "TypeError: str expected, not NoneType" when it is not,
# so check for a missing key explicitly instead.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise EnvironmentError(
        "OPENAI_API_KEY is not set; export it in the environment "
        "before running this notebook."
    )

# Chat model used further down to judge whether a quote is supported by retrieved text
chat = ChatOpenAI(
    openai_api_key=openai_api_key,
    model='gpt-3.5-turbo'
)

In [15]:
def parse_file(file_path):
    """Return the entire contents of the UTF-8 text file at ``file_path`` as one string."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

# Locate the source paper under ./data, relative to the current working directory,
# and load its full text.
project_path = os.getcwd()
file_path = os.path.join(
    project_path,
    'data',
    'An overview of the last 10 years of genetically engineered crop safety research.txt',
)
text = parse_file(file_path)



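Constant-size chunking (sentence-aware: NLTK sentences are packed into chunks of up to 1000 characters, with a 100-character overlap between consecutive chunks):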

In [16]:
from langchain.text_splitter import NLTKTextSplitter
# Split the paper into overlapping chunks (chunk_size=1000, chunk_overlap=100).
# NOTE(review): NLTKTextSplitter presumably needs the `nltk` package and its sentence
# tokenizer data, neither of which is listed in the install cell — confirm they are
# available in the target environment.
text_splitter = NLTKTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
chunks = text_splitter.split_text(text)





In [27]:
from pinecone import Pinecone

# Build the Pinecone client from the key stored in the PINECONE_API_KEY environment variable
api_key = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=api_key)

In [28]:
from pinecone import ServerlessSpec

# Serverless deployment target used when the index is created below
spec = ServerlessSpec(cloud="aws", region="us-east-1")

In [29]:
import time

index_name = 'citation-checker'

# Names of all indexes the client can currently see
existing_indexes = [info["name"] for info in pc.list_indexes()]

if index_name not in existing_indexes:
    # First run: create the index, sized for ada-002 embeddings (1536 dimensions)
    pc.create_index(
        index_name,
        dimension=1536,
        metric='dotproduct',
        spec=spec,
    )
    # Poll once per second until Pinecone reports the new index as ready
    while True:
        if pc.describe_index(index_name).status['ready']:
            break
        time.sleep(1)

# Open a handle to the index, pause briefly, then display its current stats
index = pc.Index(index_name)
time.sleep(1)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 79}},
 'total_vector_count': 79}

In [30]:
from langchain.embeddings.openai import OpenAIEmbeddings

# Embedding model used for both the document chunks and the search queries
embed_model = OpenAIEmbeddings(
    model="text-embedding-ada-002",
)

  warn_deprecated(


In [31]:
# Number of chunks the text splitter produced from the paper
len(chunks)

79

In [32]:
# Embed every chunk in one call, then show (number of vectors, vector dimensionality)
res = embed_model.embed_documents(chunks)
(len(res), len(res[0]))

(79, 1536)

In [33]:
# One string id per embedded chunk: "0", "1", ...
ids = [str(position) for position in range(len(res))]

# Store the raw chunk text (under 'chunk') and its id (under 'index') next to each vector
metadata = [
    {'chunk': chunk_text, 'index': chunk_id}
    for chunk_text, chunk_id in zip(chunks, ids)
]

# Write (id, vector, metadata) triples to the index, then display the updated stats
index.upsert(vectors=zip(ids, res, metadata))
index.describe_index_stats()




{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 79}},
 'total_vector_count': 79}

In [34]:
# NOTE: this import rebinds the name `Pinecone`, which an earlier cell imported from
# the `pinecone` package as the client class; from here on it refers to LangChain's
# vector store wrapper.
from langchain.vectorstores import Pinecone

# Wrap the Pinecone index as a LangChain vector store; "chunk" is the metadata key
# under which each chunk's text was upserted.
vectorstore = Pinecone(
    index,
    embed_model.embed_query,
    "chunk",
)

  warn_deprecated(


In [43]:

# `query` is not defined by any cell in this notebook, so this cell would raise
# NameError on a fresh kernel. Define a representative question here so the cell
# is self-contained.
# NOTE(review): the query text originally used is not recorded — replace if needed.
query = "Are foods derived from genetically engineered crops safe for human consumption?"

# Show the three chunks most similar to the query
vectorstore.similarity_search(query, k=3)



[Document(page_content='GE food/feed consumption\nThe scientific records grouped under this topic are numerous\nand constitute 40.5% of the GE food&feed literature, clearly\nindicating the importance of the human health issues.\n\nThe\ndistribution over the year is uniform, but a peak was observed\nin 2008, probably due to the scientific fervors that followed\nthe publication of experimental studies conducted by the\nprivate companies after 2006 (Table 1; Figure 2).\n\nAccording\nto the literature, the concerns about GE food/feed consumption that emerge from the scientific and social debates can be\nsummarized as follows: safety of the inserted transgenic DNA\nand the transcribed RNA, safety of the protein(s) encoded by\nthe transgene(s) and safety of the intended and unintended\nchange of crop composition (Dona & Arvanitoyannis, 2009;\nParrot et al., 2010).\n\nSafety of the inserted transgenic DNA and the transcribed\nRNA\nDNA.', metadata={'index': '27'}),
 Document(page_content='We h

In [48]:
def context_prompt(quote, context):
    """Build the verification prompt sent to the chat model.

    Args:
        quote: The statement whose support should be checked.
        context: The retrieved source text to check the quote against.

    Returns:
        A single prompt string: the instruction sentence, then the context under
        a "Contexts:" heading, then the quote on a "Quote:" line.
    """
    # The trailing space after the second sentence is part of the prompt text.
    instruction = (
        "Determine if the below quote is supported by the below contents. "
        "If it is supported, directly quote the supporting content from the context. "
    )
    return f"{instruction}\n\n    Contexts:\n    {context}\n\n    Quote: {quote}"

In [49]:
# The claim to verify against the paper
quote = "There is a scientific consensus[338][339][340][341] that currently available food derived from GM crops poses no greater risk to human health than conventional food,[342][343][344][345][346] but that each GM food needs to be tested on a case-by-case basis before introduction."

# Retrieve the five chunks most similar to the quote and merge their text,
# one chunk after another, into a single context string
results = vectorstore.similarity_search(quote, k=5)
source_knowledge = "\n".join(doc.page_content for doc in results)


'Determine if the below quote is supported by the below contents. If it is supported, directly quote the supporting content from the context. \n\n    Contexts:\n    GE food/feed consumption\nThe scientific records grouped under this topic are numerous\nand constitute 40.5% of the GE food&feed literature, clearly\nindicating the importance of the human health issues.\n\nThe\ndistribution over the year is uniform, but a peak was observed\nin 2008, probably due to the scientific fervors that followed\nthe publication of experimental studies conducted by the\nprivate companies after 2006 (Table 1; Figure 2).\n\nAccording\nto the literature, the concerns about GE food/feed consumption that emerge from the scientific and social debates can be\nsummarized as follows: safety of the inserted transgenic DNA\nand the transcribed RNA, safety of the protein(s) encoded by\nthe transgene(s) and safety of the intended and unintended\nchange of crop composition (Dona & Arvanitoyannis, 2009;\nParrot et 

In [52]:
from langchain.schema import (
    SystemMessage,
    HumanMessage,
    AIMessage
)
# Wrap the filled-in verification prompt as a single user message
prompt = HumanMessage(content=context_prompt(quote, source_knowledge))
messages = [prompt]

# Ask the chat model and print its verdict.
# NOTE: `res` rebinds the name that earlier held the chunk embeddings, so the
# upsert cell can no longer be re-run after this one without re-embedding first.
res = chat(messages)
print(res.content)

The quote is supported by the content, specifically by the statement: "the scientific research conducted so far has not detected any significant hazards directly connected with the use of GE crops" and "there is no scientific evidence of toxic or allergenic effects" in relation to GM crops. This indicates that there is a consensus that currently available food derived from GM crops does not pose a greater risk to human health than conventional food, but that each GM food should be tested on a case-by-case basis.
