## LangChain and Vector Stores (Pinecone) 🧘‍♂️

In [29]:
import os
from dotenv import load_dotenv, find_dotenv
from pinecone import Pinecone

In [30]:
# Load environment variables from the .env file located by find_dotenv().
# override=True asks python-dotenv to let values from the file replace
# variables that are already set in the process environment.
# NOTE(review): the Pinecone and OpenAI clients below are built without explicit
# keys, so they presumably read their API keys from these variables — confirm.
load_dotenv(find_dotenv(), override=True)

True

In [7]:
# Create the Pinecone client with no explicit arguments.
# NOTE(review): presumably picks up the API key from the environment — confirm.
pc = Pinecone()
# Show the configuration of the 'langchain' index (last expression -> cell output).
pc.describe_index('langchain')

{'dimension': 3072,
 'host': 'langchain-q9rfoa1.svc.aped-4627-b74a.pinecone.io',
 'metric': 'cosine',
 'name': 'langchain',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'}}

In [None]:
from pinecone import PodSpec

# Create the 'langchain' index only when it does not exist yet, so this cell can
# be re-run (or run against an account where the index is already present)
# without attempting to create a duplicate.
# dimension=3072 matches the length of the vectors built in the cells below.
if 'langchain' not in pc.list_indexes().names():
    pc.create_index(
        'langchain',
        dimension=3072,
        metric='cosine',
        spec=PodSpec(
            environment='gcp-starter'
        )
    )

## Working with vectors 👨‍🔬

In [11]:
import random

# Five vectors of 3072 random floats each (random.random() yields values in [0, 1)).
vectors = [
    [random.random() for _component in range(3072)]
    for _row in range(5)
]

In [12]:
# Handle to the target index, plus one id per vector (paired in order).
index_name = 'langchain'
index = pc.Index(index_name)
ids = ['a', 'b', 'c', 'd', 'e']

# Insert vectors into Pinecone as (id, values) pairs
index.upsert(vectors=zip(ids, vectors))

{'upserted_count': 5}

In [30]:
# Edit the vectors in Pinecone
# Upserting under an id that was inserted earlier ('c') edits that entry;
# here all 3072 components are set to 0.5.

index.upsert(vectors=[('c', [0.5] * 3072)])

{'upserted_count': 1}

In [31]:
# Delete vectors from Pinecone
# Only the id 'e' is passed, so that is the single vector targeted.

index.delete(ids=['e'])

{}

In [32]:
# Random probe vector with the same length (3072) as the stored vectors.
query_vector = [random.random() for _component in range(3072)]

# Request the 3 best matches; include_values=False leaves the vector values
# out of each match (only ids and scores are of interest here).
index.query(vector=query_vector, top_k=3, include_values=False)

{'matches': [{'id': 'c', 'score': 0.866708338, 'values': []},
             {'id': 'a', 'score': 0.754892945, 'values': []},
             {'id': 'd', 'score': 0.747194767, 'values': []}],
 'namespace': '',
 'usage': {'read_units': 5}}

## Namespaces 🐲

In [34]:
# Get a handle to the 'langchain' index for the namespace examples below.
index = pc.Index('langchain')
import random

In [35]:
# One random 3072-component vector per id 'a'..'e'.
# No namespace argument is passed to this upsert.
ids = ['a', 'b', 'c', 'd', 'e']
vectors = [
    [random.random() for _component in range(3072)]
    for _row in range(len(ids))
]
index.upsert(vectors=zip(ids, vectors))

{'upserted_count': 5}

In [36]:
# One random 3072-component vector per id 'x', 'y', 'z',
# written to the namespace 'my-first-namespace'.
ids = ['x', 'y', 'z']
vectors = [
    [random.random() for _component in range(3072)]
    for _row in range(len(ids))
]
index.upsert(vectors=zip(ids, vectors), namespace='my-first-namespace')

{'upserted_count': 3}

In [37]:
# One random 3072-component vector per id 'q', 'p',
# written to the namespace 'my-second-namespace'.
ids = ['q', 'p']
vectors = [
    [random.random() for _component in range(3072)]
    for _row in range(len(ids))
]
index.upsert(vectors=zip(ids, vectors), namespace='my-second-namespace')

{'upserted_count': 2}

## Splitting and embedding texts using LangChain (Similarity Search) 🤖

In [15]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [53]:
# Read the whole of text.txt into memory, decoding it as UTF-8.
with open('text.txt', mode='r', encoding='utf-8') as text_file:
    text = text_file.read()

In [54]:
# Configure the splitter used to cut the document into small chunks:
#   chunk_size=100      -> target maximum chunk length, measured by length_function
#   chunk_overlap=20    -> amount of content shared between neighbouring chunks
#   length_function=len -> lengths are counted in characters
#   is_separator_regex=False -> separators are treated as literal strings
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False
)

In [55]:
# Split the text into document chunks; each chunk exposes its text as .page_content.
chunks = text_splitter.create_documents([text])
# Print the second chunk to inspect what a chunk looks like.
print(chunks[1])

page_content='fue testigo de uno de los eventos más emocionantes y sorprendentes de su historia. Colombia, un'


## Creating embeddings 🦈

In [56]:
from langchain_openai import OpenAIEmbeddings

In [57]:
# OpenAI embedding client. dimensions=1536 is the same value used as the
# dimension of the 'colombia-win' index created further down, so the
# embeddings fit that index.
embeddings = OpenAIEmbeddings(
    model='text-embedding-3-small',
    dimensions=1536
)

In [58]:
# Embed the text of the first chunk. `vector` is not referenced again in the
# visible cells; this only demonstrates a single embedding call.
vector = embeddings.embed_query(chunks[0].page_content)

## Inserting the embeddings into a Pinecone index 🦆

In [60]:
import os
import pinecone
from langchain_community.vectorstores import Pinecone

In [61]:
# Build the Pinecone client through the module-qualified name: after the import
# cell above, the bare name `Pinecone` refers to the LangChain vector store
# class (langchain_community.vectorstores), not to the pinecone client class.
pc = pinecone.Pinecone()

In [62]:
# WARNING: this deletes EVERY index returned for this Pinecone client, including
# the 'langchain' index used in the earlier sections. Review before running
# against an account that holds other indexes.
# NOTE(review): presumably done to free capacity for the new index — confirm.
indexs = pc.list_indexes().names()
for i in indexs:
    pc.delete_index(i)

In [66]:
from pinecone import PodSpec
# Name of the index that will hold the document embeddings.
index_name = 'colombia-win'

In [67]:
# Create the index only when it is missing; otherwise just report that it exists.
# dimension=1536 is the embedding size configured for OpenAIEmbeddings above.
existing_index_names = pc.list_indexes().names()

if index_name in existing_index_names:
    print(f'Index {index_name} already exists')
else:
    print(f'Creating index {index_name}')
    pod_spec = PodSpec(environment='gcp-starter')
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric='cosine',
        spec=pod_spec,
    )
    print(f'Index {index_name} created')

Creating index colombia-win
Index colombia-win created


In [68]:
# `Pinecone` here is the LangChain vector store class imported from
# langchain_community.vectorstores (it shadows the pinecone client class of the
# same name imported at the top of the notebook).
# Builds the vector store from the document chunks and the embedding client,
# targeting the index named by index_name.
vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)

In [69]:
# Get a handle to the index and display its statistics (dimension, vector
# counts per namespace) to check that the chunks were stored.
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.00123,
 'namespaces': {'': {'vector_count': 123}},
 'total_vector_count': 123}

## Questions (Similarity Search) 🧙‍♂️

In [70]:
# Look up the stored chunks most similar to the question.
question = "Que paso despues de ganar el mundial"
result = vector_store.similarity_search(question)

# Print the text of every returned chunk, each followed by a 200-dash rule.
separator = '-' * 200
for doc in result:
    print(doc.page_content)
    print(separator)

La victoria en el Mundial tuvo repercusiones que fueron más allá del deporte. Los jugadores fueron
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
entera estalló en celebraciones mientras el equipo avanzaba a la final del Mundial por primera vez
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
La celebración no terminó con el pitido final del partido. La victoria de Colombia en el Mundial de
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
se clasificó para el Mundial, hubo una sensación de esperanza y optimismo en el país. Bajo la
-----

In [71]:
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

In [72]:
# Retriever over the vector store: plain similarity search, returning the
# 3 closest chunks for each query (search_kwargs={'k': 3}).
retriever = vector_store.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 3}
)

In [74]:
# Chat model used to write the final answer; a low temperature (0.2) is
# requested to keep the responses close to deterministic.
llm = ChatOpenAI(temperature=0.2, model_name="gpt-3.5-turbo")

In [75]:
# Question-answering chain that combines the retriever with the chat model.
# chain_type='stuff' is the LangChain mode that places the retrieved documents
# together into a single prompt for the model.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=retriever
)

In [76]:
# The prompt (in Spanish) tells the model to answer only from the provided
# information and asks what the final score of the match was.
query = "Solo responde a partir de la informacion proporcionada, Cuanto quedo el partido"
answer = chain.invoke(query)
# The chain returns a dict; the printed output shows its 'query' and 'result' keys.
print(answer)

{'query': 'Solo responde a partir de la informacion proporcionada ,Cuanto quedo el partido', 'result': 'El partido terminó 1-0 a favor de Colombia.'}
