In [1]:
import pinecone
import os
from dotenv import load_dotenv, find_dotenv

from openai import OpenAI
import tiktoken
import chromadb
import langchain
from langchain import hub
from langchain_chroma import Chroma
from chromadb.utils import embedding_functions
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

In [2]:


pdf_folder_path = r"/Users/kathisnehith/Desktop/Gethire-ai/email_rag_guide"
chunk_size_len = 2000
chunk_overlap_len = 50
email_text_chunks = []
email_metadata = []



for filename in os.listdir(pdf_folder_path):
    if filename.endswith(".pdf"):
        rag_file_path =os.path.join(pdf_folder_path, filename)
        print(f"Processing file: {rag_file_path}")
        loader = PyPDFLoader(file_path=rag_file_path)
        document = loader.load()
        text_splitter_char = CharacterTextSplitter(chunk_size=chunk_size_len, chunk_overlap=chunk_overlap_len, separator="\n")
        split_documents = text_splitter_char.split_documents(document)
        print(f"Number of chunks:---- {len(split_documents)}") 
        doc_tag = filename.split('.')[0]
        metadatas = [{"section": doc_tag} for _ in split_documents]     
        print(f" metadata:---- {len(metadatas)} {metadatas}")
        email_text_chunks.extend(split_documents)
        email_metadata.extend(metadatas) 

print(f"Total chunks: {len(email_text_chunks)}")
print(f"First chunk metadata: {email_metadata[0]}")

### tokenization function counting of texts
def num_tokens_from_string(input_text: str, encoding_name: str) -> str:
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(input_text))
    return num_tokens

Processing file: /Users/kathisnehith/Desktop/Gethire-ai/email_rag_guide/hiring_manager_template.pdf
Number of chunks:---- 1
 metadata:---- 1 [{'section': 'hiring_manager_template'}]
Processing file: /Users/kathisnehith/Desktop/Gethire-ai/email_rag_guide/email_guide.pdf
Number of chunks:---- 2
 metadata:---- 2 [{'section': 'email_guide'}, {'section': 'email_guide'}]
Processing file: /Users/kathisnehith/Desktop/Gethire-ai/email_rag_guide/recruiter_template.pdf
Number of chunks:---- 1
 metadata:---- 1 [{'section': 'recruiter_template'}]
Processing file: /Users/kathisnehith/Desktop/Gethire-ai/email_rag_guide/senioremployee_template.pdf
Number of chunks:---- 1
 metadata:---- 1 [{'section': 'senioremployee_template'}]
Processing file: /Users/kathisnehith/Desktop/Gethire-ai/email_rag_guide/peeremployee_template.pdf
Number of chunks:---- 1
 metadata:---- 1 [{'section': 'peeremployee_template'}]
Processing file: /Users/kathisnehith/Desktop/Gethire-ai/email_rag_guide/base_email_template.pdf
Numb

In [3]:
## embedding function using OPENAI-EMBEDDING-3-LARGE

load_dotenv()
token =os.getenv("GITHUB_API_TOKEN")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

endpoint = "https://models.inference.ai.azure.com"
model_name="gpt-4o-mini"
#embedding_model = "text-embedding-3-large"


In [4]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(
        api_key=os.environ.get("PINECONE_API_KEY"))

    # Now do stuff
if 'my_index' not in pc.list_indexes().names():
    pc.create_index(
        name='email', 
        dimension=3072,  # Dimension for text-embedding-3-large
        metric='euclidean',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )


In [5]:
# Initialize index client
index = pc.Index(name='email')

# View index stats
index.describe_index_stats()

{'dimension': 3072,
 'index_fullness': 0.0,
 'metric': 'euclidean',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}

In [6]:
embedding_model=OpenAIEmbeddings(
            model="text-embedding-3-large",
            openai_api_base=endpoint,
            openai_api_key=token
        )

In [7]:
vectors = []
for i, doc in enumerate(split_documents):
    text = doc.page_content
    embedding = embedding_model.embed_query(text)
    vectors.append({
        "id": f"doc-{i}",
        "values": embedding,
        "metadata": {"text": text}
    })

In [8]:
vectors = []
for i, (doc, meta) in enumerate(zip(email_text_chunks, email_metadata)):
    text = doc.page_content
    embedding = embedding_model.embed_query(text)
    combined_meta = dict(meta)  # Copy to avoid modifying original
    combined_meta["text"] = text
    vectors.append({
        "id": f"doc-{i}",
        "values": embedding,
        "metadata": combined_meta
    })

In [9]:
index.upsert(vectors=vectors, namespace="email_guide")

{'upserted_count': 7}

In [25]:
# Example similarity search
query = "How to write a referral request email to manager for applied job?"
query_embedding = embedding_model.embed_query(query)

In [27]:
## Search with a dense vector
searchresults = index.query(
    namespace="email_guide",
    vector=query_embedding,
    top_k=2,
    include_metadata=True
)

In [28]:
print("Search Results:, ", searchresults)

Search Results:,  {'matches': [{'id': 'doc-0',
              'metadata': {'section': 'hiring_manager_template',
                           'text': 'Hiring  Manager  –  Decision  Maker  \n'
                                   '●  Goal :  Show  value  to  land  an  '
                                   'interview.  ●  Context :  Hiring  '
                                   'managers  own  the  role  and  care  '
                                   'about  team  fit  and  problem-solving.  \n'
                                   'The\n'
                                   ' \n'
                                   'goal\n'
                                   ' \n'
                                   'is\n'
                                   ' \n'
                                   'to\n'
                                   ' \n'
                                   'subtly\n'
                                   ' \n'
                                   'pitch\n'
                                   ' \n'

In [32]:
total_tokens_retrived=0
for match in searchresults['matches']:
    print("Score:", match['score'])
    print("Metadata:", match['metadata'])
    #print("Text:", match['metadata'].get('text', 'No text found'))
    print("-" * 40)
    text = match['metadata'].get('text', 'No text found')
    total_tokens_retrived+=num_tokens_from_string(text, encoding_name="cl100k_base")
total_tokens_retrived

Score: 0.99161911
Metadata: {'section': 'hiring_manager_template', 'text': 'Hiring  Manager  –  Decision  Maker  \n●  Goal :  Show  value  to  land  an  interview.  ●  Context :  Hiring  managers  own  the  role  and  care  about  team  fit  and  problem-solving.  \nThe\n \ngoal\n \nis\n \nto\n \nsubtly\n \npitch\n \nyour\n \nvalue\n \nand\n \nget\n \non\n \ntheir\n \nradar.\n ●  Tone :  Professional,  solution-focused,  and  collaborative—respecting  their  authority  while  \nshowing\n \ninitiative.\n ●  Customizations :  •  Subject:  “Thoughts  on  Your  [Job  Title]  Opening”  •  Opener:  \nReferences\n \ntheir\n \nteam\n \nor\n \nrole\n \ndirectly.\n \n•\n \nPitch:\n \nTies\n \nyour\n \nwork\n \nto\n \ntheir\n \nlikely\n \nneeds.\n \n•\n \nRequest:\n \nPositions\n \nyou\n \nas\n \na\n \npotential\n \nteam\n \nasset.\n ●  Why  It  Works :  Ties  skills  to  their  needs,  respects  their  authority.  \nExample :  \nSubject:  Thoughts  on  Your  Data  Scientist  Opening  \nHi  John,

1012