# Pinecone

In [13]:
import os
from dotenv import load_dotenv

load_dotenv()

True

In [14]:
from pinecone import Pinecone

pc = Pinecone() #automaticall loads key from .env
# pc = Pinecone(api_key="PINECONE_API_KEY") # if key not entereed in .env
pc.list_indexes()

{'indexes': [{'dimension': 1536,
              'host': 'churchill-speech-7p5v55i.svc.gcp-starter.pinecone.io',
              'metric': 'cosine',
              'name': 'churchill-speech',
              'spec': {'pod': {'environment': 'gcp-starter',
                               'pod_type': 'starter',
                               'pods': 1,
                               'replicas': 1,
                               'shards': 1}},
              'status': {'ready': True, 'state': 'Ready'}}]}

## Pinecone Indexes

In [15]:
pc.list_indexes()

{'indexes': [{'dimension': 1536,
              'host': 'churchill-speech-7p5v55i.svc.gcp-starter.pinecone.io',
              'metric': 'cosine',
              'name': 'churchill-speech',
              'spec': {'pod': {'environment': 'gcp-starter',
                               'pod_type': 'starter',
                               'pods': 1,
                               'replicas': 1,
                               'shards': 1}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [16]:
from pinecone import PodSpec
index_name = "langchain"

#Create Pinecone index
if index_name not in pc.list_indexes().names():
    print(f'Creating index {index_name}...')
    pc.create_index(
        name=index_name, 
        dimension=1536,
        metric="cosine",
        spec=PodSpec(
            environment='gcp-starter'
        )
    )
    print('Index created')
else:
    print(f'Index {index_name} already exists')


Creating index langchain...


ForbiddenException: (403)
Reason: Forbidden
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'X-Cloud-Trace-Context': '73203dc7f03980fe56064426cd15f365', 'Date': 'Mon, 19 Feb 2024 16:01:16 GMT', 'Server': 'Google Frontend', 'Content-Length': '101', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"FORBIDDEN","message":"Projects only support a single Starter index."},"status":403}


### Delete Index

In [None]:
index_name="langchain"
if index_name in pc.list_indexes().names():
    print(f'Deleting index {index_name}...')
    pc.delete_index(index_name)
    print('Index deleted')
else:
    print(f'Index {index_name} does not exist')

In [None]:
index = pc.Index(index_name)
index.describe_index_stats()

## Working with Vectors

In [None]:
import random
vectors = [[random.random() for _ in range(1536)] for v in range(5)]
#print(vectors)
ids=list('abcde')

index_name="langchain"
index = pc.Index(index_name)

index.upsert(vectors=zip(ids, vectors))


In [None]:
# updating vectors

index.upsert(vectors=[('c', [0.5] * 1536)])

In [None]:
# fetching vectors

index.fetch(ids=['c', 'd'])

In [None]:
from dotenv import load_dotenv
load_dotenv()

from langchain.text_splitter import RecursiveCharacterTextSplitter
with open('resources/churchill_speech.txt') as f:
    winston_churchill = f.read()

textsplitter = RecursiveCharacterTextSplitter(
     chunk_size=100,
     chunk_overlap=20,
     length_function=len
)


In [None]:
chunks = textsplitter.create_documents([winston_churchill])
print(chunks[10].page_content)

# number of chunks
print(f'You have {len(chunks)}')

penetration were realized and when a new French Generalissimo, General Weygand, assumed
You have 300


In [None]:
def embeddings_costs(texts):
    import tiktoken
    #enc = tiktoken.encoding_for_model('text-embedding-ada-002')
    enc = tiktoken.encoding_for_model('text-embedding-3-small')
    #enc = tiktoken.encoding_for_model('text-embedding-3-large')

    total_tokens = sum([len(enc.encode(page.page_content)) for page in texts])
    print(f'Total tokens: {total_tokens}')
    print(f'Total Embeddings cost in USD: {total_tokens / 1000 * 0.0004:.6f}')
print(embeddings_costs(chunks))


Total tokens: 4820
Total Embeddings cost in USD: 0.001928
None


In [None]:
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()

In [None]:
vector = embeddings.embed_query(chunks[0].page_content)
vector

[-0.044567573656022125,
 -0.0378875395789288,
 -0.0029496059181736843,
 -0.007993097388439452,
 0.015743980127830393,
 0.022589743056483196,
 -0.028581378879143218,
 -0.009650358895679658,
 0.0010493331318405562,
 0.007336567129750246,
 0.007789127035119314,
 0.0327882728296835,
 0.00741305618686828,
 -0.011696439194526596,
 0.006374081108250191,
 -0.005386098268716172,
 0.013168851797818924,
 -0.0024986397385547204,
 0.013589540447814895,
 -0.01096341941305807,
 -0.008171572320709483,
 -0.026847628780446265,
 0.029626728860761727,
 -0.0038658801793480554,
 -0.014456415497163373,
 -0.018523080208145856,
 0.010835938116855967,
 -0.018612315811635728,
 0.00305477831350332,
 -0.014341682144316964,
 0.007081604071684753,
 -0.008560391112316214,
 -0.01650886976768816,
 0.005150257591345508,
 -0.01833185733252013,
 -0.023851811800438823,
 -0.022373024294146077,
 -0.008745239084941521,
 0.02267898052261821,
 -0.012671672693721062,
 0.013615037265848859,
 0.004605273699663786,
 0.0087516130566

## Inserting the Embeddings into a Pineconde Index

In [None]:
import pinecone
from langchain_community.vectorstores import Pinecone 
pc = pinecone.Pinecone()

In [None]:
for i in pc.list_indexes().names():
    print('Deleting all indexes...', end='')
    pc.delete_index(i)
    print('Done')


Deleting all indexes...Done


In [None]:
index_name = "churchill-speech"
if index_name not in pc.list_indexes().names():
    print(f'Creating index {index_name}...')
    pc.create_index(
        name=index_name, 
        dimension=1536,
        metric="cosine",
        spec=pinecone.PodSpec(
            environment='gcp-starter'
        )
    )
    print('Index created')

Creating index churchill-speech...
Index created


In [None]:
vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)

#### Asking Question (Similarity Search)

In [17]:
query = 'Where should we fight?'
result = vector_store.similarity_search(query)
print(result)

[Document(page_content='shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and'), Document(page_content='front, now on that, fighting'), Document(page_content='end, we shall fight in France, we shall fight on the seas and oceans, we shall fight with growing'), Document(page_content='Winston Churchill Speech - We Shall Fight on the Beaches\nWe Shall Fight on the Beaches\nJune 4, 1940')]


In [18]:
for r in result:
    print(r.page_content)
    print('-' * 50)

shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and
--------------------------------------------------
front, now on that, fighting
--------------------------------------------------
end, we shall fight in France, we shall fight on the seas and oceans, we shall fight with growing
--------------------------------------------------
Winston Churchill Speech - We Shall Fight on the Beaches
We Shall Fight on the Beaches
June 4, 1940
--------------------------------------------------


In [20]:
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)

retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})

chain = RetrievalQA.from_chain_type(llm=llm, chain_type='stuff', retriever=retriever)


In [38]:
#query = 'Where should we fight?'
#query = 'Who was king of Belgium'\
query = "What about the French armies?"
answer = chain.run(query)
print(answer)


The French armies were involved in the conflict and were holding territory in the context provided. Additionally, the French Army was mentioned as being created to advance across the Somme in strength.
