In [None]:
import warnings
warnings.filterwarnings('ignore') # Ignore warnings for cleaner demo code

import os
import time
from uuid import uuid4
from uuid import NAMESPACE_URL, uuid5

import matplotlib.pyplot as plt
import numpy as np
import umap.umap_ as umap
from IPython.display import display, Markdown
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
from tqdm import tqdm

# Pinecone client. The API key is read from the PINECONE_API_KEY environment
# variable; a missing variable raises KeyError here rather than later.
pc = Pinecone(
    api_key=os.environ["PINECONE_API_KEY"],
)

# Embedding model shared by the vector store, the similarity demo and the
# question embedding further down.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
)

## Creating an index in Pinecone

In [None]:
index_name = "itnightlive"
# Backup index name: itnightdraft3

# Names of the indexes that already exist for this Pinecone client.
existing_indexes = list(info["name"] for info in pc.list_indexes())

if index_name not in existing_indexes:
    # Create the index once; later runs reuse the existing one.
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=3072,
        metric="cosine",
        spec=serverless_spec,
    )
    # Poll once per second until the new index reports itself as ready.
    while True:
        if pc.describe_index(index_name).status["ready"]:
            break
        time.sleep(1)

# Handle used for all direct index operations below.
index = pc.Index(index_name)

In [None]:
# Wrap the Pinecone index in a LangChain vector store that uses `embeddings`
# as its embedding function.
vector_store = PineconeVectorStore(
    index=index,
    embedding=embeddings,
)

## Loading a PDF document of a lecture transcript into the vector db

In [None]:
# Load the PDF into LangChain documents and show how many were produced.
pdf_path = "ebook.pdf"
loader = PyPDFLoader(pdf_path)
pages = loader.load()
len(pages)


## Split the document for better results

In [None]:
# Chunking parameters, kept as named constants so they are easy to tune.
CHUNK_SIZE = 1500
CHUNK_OVERLAP = 150

# RecursiveCharacterTextSplitter is imported in the notebook's top import cell.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Split the loaded pages into overlapping chunks.
splits = text_splitter.split_documents(pages)

len(splits)

## Demo of Embeddings and their dot product

In [None]:
# Example inputs for the dot-product / cosine-similarity demo: two sentences
# with similar meaning and one unrelated sentence.
sentence1 = "I like dogs"
sentence2 = "I like canines"
sentence3 = "The weather is cold today"

embedding1 = embeddings.embed_query(sentence1)
embedding2 = embeddings.embed_query(sentence2)
embedding3 = embeddings.embed_query(sentence3)

# Show the vector length and a short preview instead of dumping every
# component into the cell output.
len(embedding1), embedding1[:5]

In [None]:
# Dot product between the embeddings of sentence1 and sentence3, rendered as
# a large markdown heading.
similarity_1_vs_3 = np.dot(embedding1, embedding3)
display(Markdown(f"# {similarity_1_vs_3}"))

## Loading and encoding the documents into the Pinecone database

In [None]:
# Derive a stable ID for every chunk from its source and position. Random
# IDs would insert a fresh copy of every chunk on each run of this cell;
# with deterministic IDs a re-run writes to the same IDs instead.
uuids = [
    str(uuid5(NAMESPACE_URL, f"{doc.metadata.get('source', '')}#chunk-{position}"))
    for position, doc in enumerate(splits)
]
vector_store.add_documents(documents=splits, ids=uuids)

## Search the vector DB

In [None]:
question = "steps to take when finding projects to build your experience"

# Retrieve the top 3 chunks for the question using max-marginal-relevance search.
docs_ss = vector_store.max_marginal_relevance_search(question, k=3)
print(docs_ss)

# Guard against an empty result so the cell does not fail with IndexError.
if docs_ss:
    display(Markdown(f"# {docs_ss[0].page_content}"))
else:
    display(Markdown("**No documents were returned - the index may be empty or not yet up to date.**"))

## Visualize the stored embeddings and the question in 2D with UMAP

In [None]:
# Number of IDs requested per fetch call; batching avoids one network round
# trip per vector while keeping each request small.
FETCH_BATCH_SIZE = 50

# Collect every vector ID in the index; index.list() yields IDs page by page.
all_ids = []
for id_page in index.list():
    all_ids.extend(id_page)

# Fetch the stored vectors in batches, preserving the order of all_ids.
all_embeddings = []
for start in range(0, len(all_ids), FETCH_BATCH_SIZE):
    id_batch = all_ids[start:start + FETCH_BATCH_SIZE]
    fetched = index.fetch(id_batch)
    all_embeddings.extend(fetched.vectors[vector_id].values for vector_id in id_batch)

# Report the count and a short preview rather than printing a whole vector.
print(f"Fetched {len(all_embeddings)} embeddings")
if all_embeddings:
    print(all_embeddings[0][:5])

In [None]:
# Fit the UMAP projection on the vectors fetched from the index; both seeds
# are fixed to 0.
umap_transform = umap.UMAP(
    random_state=0,
    transform_seed=0,
).fit(all_embeddings)

# Embed the search question so it can be projected alongside the dataset.
question_embedding = embeddings.embed_query(question)

def project_embeddings(embeddings, umap_transform):
    """Project each embedding with a fitted transformer, one vector at a time.

    Args:
        embeddings: Sequence of embedding vectors. Note that this parameter
            shadows the notebook-level `embeddings` model inside the function.
        umap_transform: Fitted object exposing `transform(list_of_vectors)`
            that returns one projected row per input vector. If it has an
            `n_components` attribute, that value sets the output width;
            otherwise a width of 2 is assumed.

    Returns:
        A NumPy array of shape (len(embeddings), n_components) whose row i is
        the projection of embeddings[i].
    """
    n_components = getattr(umap_transform, "n_components", 2)
    projected = np.empty((len(embeddings), n_components))
    for i, embedding in enumerate(tqdm(embeddings)):
        # transform() receives a single-vector batch; take its only row.
        projected[i] = umap_transform.transform([embedding])[0]
    return projected

# Project the stored vectors and the question with the fitted transformer.
projected_dataset_embeddings = project_embeddings(all_embeddings, umap_transform)
projected_question_embeddings = project_embeddings([question_embedding], umap_transform)

# Explicit Figure/Axes interface: small dots for the dataset, a large red X
# for the question.
fig, ax = plt.subplots()
ax.scatter(
    projected_dataset_embeddings[:, 0],
    projected_dataset_embeddings[:, 1],
    s=10,
    label='Document chunks',
)
ax.scatter(
    projected_question_embeddings[:, 0],
    projected_question_embeddings[:, 1],
    s=150,
    marker='X',
    color='r',
    label='Question',
)
ax.set_aspect('equal', 'datalim')
ax.set_title('Projected Embeddings')
ax.legend()
# Trailing semicolon keeps the return value of axis() out of the cell output.
ax.axis('off');

## Use an LLM to answer questions based on a database, a.k.a. "AMA with your database"

In [None]:
# Chat model settings, gathered in one place before constructing the client.
llm_options = dict(
    model="gpt-4o-mini",
    temperature=0,
    max_tokens=None,
    timeout=None,
)
llm = ChatOpenAI(**llm_options)

# RAG prompt pulled from the LangChain hub.
# Full prompt text: https://smith.langchain.com/hub/rlm/rag-prompt
rag_prompt_id = "rlm/rag-prompt"
prompt = hub.pull(rag_prompt_id)


def format_docs(docs):
    """Join the `page_content` of each document, separated by a blank line."""
    page_texts = [doc.page_content for doc in docs]
    separator = "\n\n"
    return separator.join(page_texts)


# Inputs for the prompt: "context" is the retriever output piped through
# format_docs, and "question" is the chain input passed through unchanged.
retrieved_context = vector_store.as_retriever() | format_docs
rag_inputs = {
    "context": retrieved_context,
    "question": RunnablePassthrough(),
}

# Pipeline: build prompt inputs -> prompt -> chat model -> string parser.
qa_chain = rag_inputs | prompt | llm | StrOutputParser()

# Ask the chain a demo question and render the answer as a markdown heading.
demo_question = "Is Debrecen the capital of Hungary?"
response = qa_chain.invoke(demo_question)
answer_markdown = Markdown(f"# {response}")
display(answer_markdown)