In [2]:
import os
from dotenv import load_dotenv, find_dotenv
# Find the project's .env file and load its variables into the process
# environment; override=True lets values from the file replace ones already set.
load_dotenv(dotenv_path=find_dotenv(), override=True)

True

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Read the whole speech into memory as one string.
# The encoding is pinned explicitly: without it, open() falls back to the
# platform's locale encoding, so the same file could decode differently
# (or fail) on another machine.
# NOTE(review): assumes the text file is stored as UTF-8 — confirm.
with open('./files/winston_churcill_speech.txt', encoding='utf-8') as f:
    churchill_speech = f.read()


# Splitter configuration: chunks of at most 100 units with 20 units of overlap
# between neighbouring chunks; length is measured with len(), i.e. in characters.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20, length_function=len)

In [5]:
# Split the speech into document chunks; the splitter takes a list of texts,
# so the single speech string is wrapped in a one-element list.
chunks = text_splitter.create_documents(texts=[churchill_speech])

In [7]:
# Display the text of the first chunk as a quick sanity check of the split.
chunks[0].page_content

'We Shall Fight on the Beaches\nJune 4, 1940\nHouse of Commons'

### Embedding Cost

In [8]:
def print_embedding_cost(texts, model='text-embedding-ada-002', price_per_1k_tokens=0.0004):
    """Print the token count of `texts` and the estimated cost of embedding them.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string
            (e.g. the document chunks produced by the text splitter).
        model: Model name passed to ``tiktoken.encoding_for_model`` to select
            the tokenizer. The default is the model this notebook uses.
        price_per_1k_tokens: Price in USD per 1,000 tokens. The default is the
            rate this notebook was written with.
            NOTE(review): confirm against current provider pricing before
            relying on the estimate.

    Returns:
        None. Two summary lines are written to stdout.
    """
    # Imported lazily so that defining the function does not require tiktoken.
    import tiktoken

    enc = tiktoken.encoding_for_model(model)
    # Generator expression: no need to build a temporary list of per-chunk counts.
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * price_per_1k_tokens:.6f}')

# Report the total token count and estimated embedding cost for all chunks.
print_embedding_cost(chunks)

Total Tokens: 4751
Embedding Cost in USD: 0.001900


#### Vectorizing the text chunks

In [15]:
from langchain.embeddings import OpenAIEmbeddings
# Embedding client created with its default settings.
# NOTE(review): presumably reads the OpenAI API key from the environment
# populated by the .env file — confirm.
embeddings = OpenAIEmbeddings()

In [16]:
# Embed the text of the first chunk and print the resulting vector.
first_chunk_text = chunks[0].page_content
vector = embeddings.embed_query(first_chunk_text)
print(vector)

[-0.04373212028041374, -0.024439297105619335, -0.019667816504914604, -0.014172206485361315, 0.012122667133783013, 0.01903420419474474, -0.01299549990502291, -0.007208946094050967, 0.00033882855914835517, -0.010958891036785776, 0.002561923723811566, 0.02315914386458369, 0.0015921103484523118, -0.022499670587731484, -0.007092568484351243, 0.004160499430349791, 0.01311187704906132, -0.015400636861709656, 0.014973918804256898, 0.0034880954373257595, -0.0005952229586901847, -0.023327245735954658, 0.026740988332931467, 0.006578567801700334, 0.0033846487766285194, -0.02262897914643369, 0.014198068383366283, -0.020262635502415707, 0.005097985549618165, -0.009678737795750129, 0.013797212223918491, -0.019913501276332596, -0.02147813446410025, -0.00146199375095661, -0.028008211090625942, -0.0250729094157892, -0.02468498373968258, -0.0013528896545516229, 0.020844522153930387, -0.008618408359450134, 0.007622733202501241, 0.004680965820332598, 0.013926520782620699, 0.0027962952536286596, -0.03020645

## Inserting the Embeddings into a Pinecone Index

In [11]:
import os
import pinecone
from langchain.vectorstores import Pinecone

# Configure the Pinecone client from environment variables so that no
# credentials appear in the notebook. A missing variable yields None here.
pinecone.init(api_key=os.getenv('PINECONE_API_KEY'), environment=os.getenv('PINECONE_ENV'))

  from tqdm.autonotebook import tqdm


In [12]:
# Start from a clean slate: remove every index that currently exists.
# WARNING: this is destructive — each listed index is deleted.
indexes = pinecone.list_indexes()
for existing_index in indexes:
    print('Deleting all indexes...', end='')  # end='' keeps 'Done.' on the same line
    pinecone.delete_index(existing_index)
    print('Done.')

Deleting all indexes...Done.


In [14]:
# Create the index only if it is not already listed, so re-running this cell
# does not attempt to create it twice.
index_name = 'churchill-speech'
existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
    print(f'Creating index {index_name} ...')
    # NOTE(review): dimension=1536 is presumably the length of the vectors
    # produced by the embedding model used above — confirm.
    pinecone.create_index(index_name, dimension=1536, metric='cosine')
    print('Done')

Creating index churchill-speech ...
Done


In [18]:
# Embed the chunks with `embeddings` and store them in the Pinecone index named
# by index_name; the returned vector store is used for the queries below.
vector_store = Pinecone.from_documents(documents=chunks, embedding=embeddings, index_name=index_name)

## Querying the document

In [19]:
# Ask a question and fetch the stored chunks most similar to it.
query = 'Where should we fight?'
result = vector_store.similarity_search(query=query)
print(result)

[Document(page_content='We Shall Fight on the Beaches\nJune 4, 1940\nHouse of Commons'), Document(page_content='landing grounds, we shall fight in the fields and in the streets, we shall fight in the hills; we'), Document(page_content='which the British and French Armies fought.'), Document(page_content='seas and oceans, we shall fight with growing confidence and growing strength in the air, we shall')]


In [20]:
# Print each retrieved chunk's text followed by a 50-dash separator line.
for doc in result:
    print(doc.page_content, '-' * 50, sep='\n')

We Shall Fight on the Beaches
June 4, 1940
House of Commons
--------------------------------------------------
landing grounds, we shall fight in the fields and in the streets, we shall fight in the hills; we
--------------------------------------------------
which the British and French Armies fought.
--------------------------------------------------
seas and oceans, we shall fight with growing confidence and growing strength in the air, we shall
--------------------------------------------------


#### Querying and getting a human-readable answer

In [21]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# Chat model that will phrase the final answer.
llm = ChatOpenAI(
    model='gpt-3.5-turbo',
    temperature=1,
)

# Retriever over the vector store: similarity search, k=3 chunks per query.
retriever = vector_store.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 3},
)

# Question-answering chain that combines the retriever with the chat model.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)

In [31]:
# Alternative question to try:
# query = 'Where should we fight?'
query = 'Who was the king of Belgium at that time?'
# Run the retrieval QA chain on the question and keep the returned answer.
answer = chain.run(query)

In [32]:
# Show the answer returned by the chain.
print(answer)

The king of Belgium at that time would have been King Leopold III.
