In [None]:
%pip install langchain pinecone-client pypdf openai tiktoken

In [None]:
# Display the metadata pip has for the installed langchain package.
%pip show langchain

# Imports and Environment Setup

In [None]:
import os
from dotenv import load_dotenv # type: ignore

In [None]:
# Load environment variables (API keys, endpoints) before any cell reads os.environ.
# NOTE(review): presumably reads a local .env file — confirm the file exists next to the notebook.
load_dotenv()

In [None]:
from langchain.document_loaders import PyPDFDirectoryLoader # type: ignore
from langchain.text_splitter import RecursiveCharacterTextSplitter # type: ignore
from langchain.embeddings import OpenAIEmbeddings # type: ignore
from langchain.llms import OpenAI # type: ignore
from langchain.vectorstores import Pinecone# type: ignore
from langchain.chains import RetrievalQA# type: ignore
from langchain.prompts import PromptTemplate# type: ignore

In [None]:
# Pinecone API key from the environment; None when the variable is not set.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Load and Chunk Documents

In [None]:
# Directory (relative to the notebook) that holds the source PDFs.
pdf_dir = "pdfs"
loader = PyPDFDirectoryLoader(pdf_dir)

In [None]:
# Run the loader; `data` is the sequence of loaded documents used by the cells below.
data = loader.load()

In [None]:
# Preview the documents at positions 1 through 4 (position 0 is not shown).
data[1:5]

In [None]:
# Report how many documents the loader produced.
document_count = len(data)
print(f"You have {document_count} documents")

In [None]:
# Separators handed to the splitter: paragraph break, line break, space,
# full stop, comma, and finally the empty string.
chunk_separators = ["\n\n", "\n", " ", ".", ",", ""]

text_splitter = RecursiveCharacterTextSplitter(
    separators=chunk_separators,
    chunk_size=480,
    chunk_overlap=20,
)

# The default separator list, ["\n\n", "\n", " ", ""],
# can cause words to be split between chunks.
# A fuller alternative that also covers zero-width and full-width punctuation:

    #separators=[
    #     "\n\n",
    #     "\n",
    #     " ",
    #     ".",
    #     ",",
    #     "\u200B",  # Zero-width space
    #     "\uff0c",  # Fullwidth comma
    #     "\u3001",  # Ideographic comma
    #     "\uff0e",  # Fullwidth full stop
    #     "\u3002",  # Ideographic full stop
    #     "",
    # ]

In [None]:
# Split the loaded documents into chunks using the splitter configured above.
text_chunks = text_splitter.split_documents(data)

In [None]:
# Preview the chunks at positions 1 through 4 (position 0 is not shown).
text_chunks[1:5]

In [None]:
# Report how many chunks the splitter produced.
chunk_count = len(text_chunks)
print(f"You have {chunk_count} chunks")

# Getting started with Embeddings

In [None]:
from langchain_community.embeddings import AzureOpenAIEmbeddings  # type: ignore

In [None]:
#OPENAI_API_KEY = "sk-...."
#os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
#embedding = OpenAIEmbeddings()

In [None]:
# Azure OpenAI embedding client. The endpoint is read from the OPENAI_API_BASE
# environment variable; a missing variable raises KeyError here.
# `embedding` is the name the rest of the notebook uses; both names refer to the same object.
embedding = azure_embeddings = AzureOpenAIEmbeddings(
    model="text-embedding-ada-002",
    deployment="embeddings",
    openai_api_type="azure",
    openai_api_base=os.environ["OPENAI_API_BASE"],
)

In [None]:
# Smoke test: embed a short string to confirm the embedding client is configured correctly.
testEmbed = embedding.embed_query("Hello")

In [None]:
# Show only the first few components; printing the whole vector floods the notebook output.
testEmbed[:5]

In [None]:
# The vector length is the embedding dimensionality.
dimension_count = len(testEmbed)
print(f"You have {dimension_count} dimensions")

# Setting up Pinecone

In [None]:
import pinecone # type: ignore

In [None]:
# Pinecone settings from the environment; each is None when its variable is not set.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENV = os.getenv("PINECONE_ENV")

In [None]:
# Create the Pinecone client; later cells open the index through `pc`.
pc = pinecone.Pinecone(api_key=PINECONE_API_KEY) 

In [None]:
# The Pinecone client already exists as `pc`; building a second client here and
# discarding it had no effect, so this cell only records which index to use.
index_name = "salesvector"
host = "https://salesvector-zenjj4f.svc.gcp-starter.pinecone.io"

In [None]:
# Open the index through the authenticated client `pc`. The module-level
# `pinecone.Index` class takes (api_key, host), so calling it with the index
# name first would send the index name as the API key.
index = pc.Index(index_name, host=host)

# Create Embeddings for Each Text Chunk

In [None]:
#docsearch = pinecone.Pinecone.from_texts([t.page_content for t in text_chunks], embedding, index_name= index_name)

In [None]:
# Get a handle to the target index from the client; all upserts and queries below go through it.
index = pc.Index(index_name)

In [None]:
# Embed every chunk and store it in the "real" namespace.
# Vectors are sent in batches instead of one request per chunk, which cuts the
# number of round trips to Pinecone while producing the same records.
UPSERT_BATCH_SIZE = 100

pending_vectors = []
for i, t in zip(range(len(text_chunks)), text_chunks):
    pending_vectors.append(
        {
            "id": str(i),  # Pinecone ids must be strings
            "values": embedding.embed_query(t.page_content),
            "metadata": {"text": str(t.page_content)},  # keep the chunk text with its vector
        }
    )
    if len(pending_vectors) == UPSERT_BATCH_SIZE:
        index.upsert(vectors=pending_vectors, namespace="real")
        pending_vectors = []

# Flush whatever is left after the last full batch (nothing is sent when there are no chunks).
if pending_vectors:
    index.upsert(vectors=pending_vectors, namespace="real")

- range(len(text_chunks)): Generates a sequence of numbers from 0 to the length of the text_chunks list minus 1. This sequence represents the indices of elements in the text_chunks list.

- zip(range(len(text_chunks)), text_chunks): Combines the generated sequence of indices with the elements of the text_chunks list. This creates pairs where the first element of each pair is the index and the second element is the corresponding element from the text_chunks list.

- for i, t in ...: Iterates over each pair generated by zip. In each iteration:

- i represents the index of the current element in the text_chunks list.
- t represents the corresponding element from the text_chunks list.


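**This pattern lets you iterate over each element in the text_chunks list along with its index, so you can perform operations or access elements based on both the index and the element itself. `enumerate(text_chunks)` is the built-in shortcut that produces the same (index, element) pairs.**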

In [None]:
# Display the index statistics as a check that the upserts above were stored.
index.describe_index_stats()

# Query Vector Database

In [None]:
# Example questions to try against the index; pick another entry to experiment.
SAMPLE_QUERIES = [
    "National sales meetings",
    "Signs that justify territory revision",
    "SMBO process",
]
query = SAMPLE_QUERIES[2]  # "SMBO process"

In [None]:
# Embed the question with the same `embedding` object that embedded the chunks.
queryEmbed = embedding.embed_query(query)

In [None]:
# Show only the first few components; printing the whole vector floods the notebook output.
queryEmbed[:5]

In [None]:
docs = index.query(vector=queryEmbed, top_k=6, namespace='real')

In [None]:
docs

In [None]:
from pprint import pprint
for element in docs["matches"]:
    pprint(text_chunks[int(element["id"])].page_content , width = 120)

## Considerations:
- Type of data
- Search frequency : *Frequency of updates vs Frequency of Searches*
- Chunk size : *Degree of detail vs summary*
- Embedding Model - Dimensions: *GTE-Base (Graft Default), GTE-Large, GTE-Small, E5-Small, E5-Base*
- Tokens and Costs

## Exercises:
Experiment with:
- chunk sizes
- Separators
- Open-source embedding models
- Semantic Search