In [1]:
import os
from dotenv import load_dotenv, find_dotenv

from openai import OpenAI
import tiktoken
import chromadb
import langchain
from langchain import hub
from langchain_chroma import Chroma
from chromadb.utils import embedding_functions
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import CharacterTextSplitter


pdf_folder_path = r"/Users/kathisnehith/Desktop/Gethire-ai/email_rag_guide"
chunk_size_len = 2000
chunk_overlap_len = 50
email_text_chunks = []
email_metadata = []



for filename in os.listdir(pdf_folder_path):
    if filename.endswith(".pdf"):
        rag_file_path =os.path.join(pdf_folder_path, filename)
        print(f"Processing file: {rag_file_path}")
        loader = PyPDFLoader(file_path=rag_file_path)
        document = loader.load()
        text_splitter_char = CharacterTextSplitter(chunk_size=chunk_size_len, chunk_overlap=chunk_overlap_len, separator="\n")
        split_documents = text_splitter_char.split_documents(document)
        print(f"Number of chunks:---- {len(split_documents)}") 
        doc_tag = filename.split('.')[0]
        metadatas = [{"section": doc_tag} for _ in split_documents]     
        print(f" metadata:---- {len(metadatas)} {metadatas}")
        email_text_chunks.extend(split_documents)
        email_metadata.extend(metadatas) 

print(f"Total chunks: {len(email_text_chunks)}")
print(f"First chunk metadata: {email_metadata[0]}")

### tokenization function counting of texts
def num_tokens_from_string(input_text: str, encoding_name: str) -> str:
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(input_text))
    return num_tokens




Processing file: /Users/kathisnehith/Desktop/Gethire-ai/email_rag_guide/hiring_manager_template.pdf
Number of chunks:---- 1
 metadata:---- 1 [{'section': 'hiring_manager_template'}]
Processing file: /Users/kathisnehith/Desktop/Gethire-ai/email_rag_guide/email_guide.pdf
Number of chunks:---- 2
 metadata:---- 2 [{'section': 'email_guide'}, {'section': 'email_guide'}]
Processing file: /Users/kathisnehith/Desktop/Gethire-ai/email_rag_guide/recruiter_template.pdf
Number of chunks:---- 1
 metadata:---- 1 [{'section': 'recruiter_template'}]
Processing file: /Users/kathisnehith/Desktop/Gethire-ai/email_rag_guide/senioremployee_template.pdf
Number of chunks:---- 1
 metadata:---- 1 [{'section': 'senioremployee_template'}]
Processing file: /Users/kathisnehith/Desktop/Gethire-ai/email_rag_guide/peeremployee_template.pdf
Number of chunks:---- 1
 metadata:---- 1 [{'section': 'peeremployee_template'}]
Processing file: /Users/kathisnehith/Desktop/Gethire-ai/email_rag_guide/base_email_template.pdf
Numb

In [2]:
## embedding function using OPENAI-EMBEDDING-3-LARGE

load_dotenv()
token =os.getenv("GITHUB_API_TOKEN")

endpoint = "https://models.inference.ai.azure.com"
model_name="gpt-4o"
embedding_model = "text-embedding-3-large"
client = OpenAI(
    base_url=endpoint,
    api_key=token
)

response = client.embeddings.create(
    input=[chunk.page_content for chunk in email_text_chunks],
    model=embedding_model,
)
#print(response)
embeddings = [item.embedding for item in response.data]
#print(f"Number of embeddings: {len(embeddings)}")

# Display results
for i, item in enumerate(response.data):
    print(f"Chunk {i} metadata: {email_metadata[i]}")
    print(f"Embedding Vector (first 5 dims): {item.embedding[:5]}")



Chunk 0 metadata: {'section': 'hiring_manager_template'}
Embedding Vector (first 5 dims): [0.000710678577888757, -0.006397498305886984, -0.004142611753195524, -0.035692475736141205, -0.015235316939651966]
Chunk 1 metadata: {'section': 'email_guide'}
Embedding Vector (first 5 dims): [-0.017011938616633415, -0.031198788434267044, -0.007742120418697596, -0.04854850843548775, -0.012551678344607353]
Chunk 2 metadata: {'section': 'email_guide'}
Embedding Vector (first 5 dims): [-0.007910573855042458, -0.040177181363105774, -0.0038782358169555664, -0.01838161237537861, -0.01209108717739582]
Chunk 3 metadata: {'section': 'recruiter_template'}
Embedding Vector (first 5 dims): [-0.019944479689002037, -0.03546753153204918, -0.012463296763598919, -0.02058526501059532, -0.03879962116479874]
Chunk 4 metadata: {'section': 'senioremployee_template'}
Embedding Vector (first 5 dims): [-0.018149359151721, 0.008191158063709736, -0.01531713455915451, -0.015003359876573086, -0.015919910743832588]
Chunk 5 me

In [3]:
persist_path = "./chromadb_email_db"
chroma_client = chromadb.PersistentClient(path=persist_path)

In [4]:
collection = chroma_client.get_or_create_collection(name="git_email_rag")
collection.add(
    documents=[chunk.page_content for chunk in email_text_chunks],
    embeddings=embeddings,
    metadatas=email_metadata,
    ids=[f"doc_{i}" for i in range(len(email_text_chunks))]
)

In [5]:
import gc

doc_count = collection.count()
print(f"✅ Chroma collection populated with {doc_count} documents")

# 🔒 Prevent accidental zipping of empty DB
if doc_count == 0:
    raise Exception("❌ ERROR: No documents added to ChromaDB! Skipping zip to avoid saving empty DB.")
# --- CLOSE THE CLIENT BEFORE ZIPPING ---
del chroma_client
gc.collect()

✅ Chroma collection populated with 7 documents


426

In [6]:
!zip -r chromadb_email_db.zip chromadb_email_db/  #jupyter run

# Save the collection to disk(.py file run)
#import subprocess
#subprocess.run(["zip", "-r", "chroma_email_db.zip", "chroma_email_db/"])

  adding: chromadb_email_db/ (stored 0%)
  adding: chromadb_email_db/bc595fdf-315d-4360-9abe-3a795c428bb6/ (stored 0%)
  adding: chromadb_email_db/bc595fdf-315d-4360-9abe-3a795c428bb6/data_level0.bin (deflated 100%)
  adding: chromadb_email_db/bc595fdf-315d-4360-9abe-3a795c428bb6/length.bin (deflated 77%)
  adding: chromadb_email_db/bc595fdf-315d-4360-9abe-3a795c428bb6/link_lists.bin (stored 0%)
  adding: chromadb_email_db/bc595fdf-315d-4360-9abe-3a795c428bb6/header.bin (deflated 61%)
  adding: chromadb_email_db/chroma.sqlite3 (deflated 66%)


In [8]:
import requests
import zipfile
import io
# 🔗 GitHub raw link to your zipped ChromaDB
chroma_db_zip_url = "https://raw.githubusercontent.com/kathisnehith/Linkedin-Jobs-posts-Scraper/main/data/chroma_email_db.zip"
zip_path = "/tmp/chroma_email_db.zip"
extract_path = "/tmp/chroma_email_db"

# 🔽 Download and unzip
response = requests.get(chroma_db_zip_url)
if response.status_code == 200:
    with open(zip_path, "wb") as f:
        f.write(response.content)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    print("✅ ChromaDB loaded from GitHub.")
else:
    raise Exception(f"❌ Failed to download zip: {response.status_code}")

✅ ChromaDB loaded from GitHub.


In [11]:
CHROMA_ZIP_LOCAL_PATH = r"/Users/kathisnehith/Desktop/Gethire-ai/chromadb_email_db.zip"      # path to your zipped chroma db
EXTRACT_PATH = r"/tmp/chromadb_email_db"                # extract here
COLLECTION_NAME = "git_email_rag"  

In [12]:
import zipfile

with zipfile.ZipFile(CHROMA_ZIP_LOCAL_PATH, "r") as zip_ref:
    zip_ref.extractall(EXTRACT_PATH)
print(f"✅ Extracted ChromaDB to {EXTRACT_PATH}")
chroma_client = chromadb.PersistentClient(path=EXTRACT_PATH)
available_collections = chroma_client.list_collections()
print("📦 Available collections:", [col.name for col in available_collections])

✅ Extracted ChromaDB to /tmp/chromadb_email_db
📦 Available collections: []


In [13]:
import os
print(os.listdir(EXTRACT_PATH))

['chromadb_email_db', 'chroma.sqlite3']


In [None]:
collection = chroma_client.get_collection(name=COLLECTION_NAME)
print("📊 Total documents in collection:", collection.count())

📊 Total documents in collection: 0


#### 

In [9]:

#######
## Retrieving a document from the collection
#######

user_purpose_prompt = "Can you write a professional email connecting to network?"
persona = "senior data engineer"

retrival_query = f"""{user_purpose_prompt}  {persona} """
query_response = client.embeddings.create(
    input=[retrival_query],
    model=embedding_model
)
query_vector = query_response.data[0].embedding
#print(f"Query vector (first 5 dims): {query_vector[:5]}")
results = collection.query(
    query_embeddings=[query_vector],
    n_results=2   # no of results to return
)

print("\n🔍 Top Matches:")
total_tokens = 0
for i, doc in enumerate(results["documents"][0]):
    token_count = num_tokens_from_string(doc, "o200k_base")
    print(f"\n🏎️Result {i+1}:")
    print(f"Token count: {token_count}")
    print(doc)
    print("-->> Metadata:", results["metadatas"][0][i])
    total_tokens += token_count

print(f"-->>🤖 Total token count: {total_tokens}")

# Combine all retrieved docs into one context_text string (outside the loop)
context_text = "\n\n---\n\n".join(results["documents"][0])
#print("\nCombined context_text:\n", context_text)
context_text_token=num_tokens_from_string(context_text, "o200k_base")
print(f"🤖Total tokens in combined context_text: {context_text_token}")


###
# Final prompt construction
# Openai LLM call
###



final_prompt = f""" 
{retrival_query}
==== CONTEXT START ====
{context_text}
==== CONTEXT END ====
"""

chat_response = client.chat.completions.create(
    model=model_name,  
    messages=[
        {"role": "system", "content": "You are an expert in writing emails using context data, along with the user provided query."},
        {"role": "user", "content": final_prompt}
    ],
    temperature=0.7
)
final_email_response=chat_response.choices[0].message.content
print("vvvvvvvvvv  Generated Email Response  vvvvvvvvvvv")
print("-------------------------------")
print(final_email_response)



🔍 Top Matches:

🏎️Result 1:
Token count: 575
Guided  Email  template  for  corporate.  
 
 
Introduction  
Networking  in  the  corporate  world  can  feel  daunting,  especially  when  reaching  out  about  job  
opportunities.
 
This
 
email
 
template
 
is
 
designed
 
to
 
help
 
you
 
craft
 
professional,
 
warm,
 
and
 
concise
 
messages
 
that
 
subtly
 
position
 
you
 
for
 
an
 
interview.
 
Whether
 
you’re
 
contacting
 
a
 
recruiter,
 
hiring
 
manager,
 
senior
 
employee,
 
or
 
peer,
 
this
 
flexible
 
template
 
adapts
 
to
 
their
 
role
 
and
 
your
 
goal.
 
Below,
 
you’ll
 
find
 
the
 
base
 
template,
 
step-by-step
 
guidance
 
on
 
how
 
to
 
use
 
it,
 
and
 
real-world
 
examples
 
to
 
get
 
you
 
started.
 
The  focus?  Highlight  your  fit  for  a  role  in  a  natural  way,  respect  the  recipient’s  authority,  and  
nudge
 
toward
 
a
 
conversation—all
 
while
 
keeping
 
it
 
human
 
and
 
approachable.
 
How  to  Use  This  Template   
This  t

## try-github-zip-embeed-extract-retrive-RAG


In [1]:
import requests
import zipfile
import io
import os
import chromadb
import os
from dotenv import load_dotenv, find_dotenv

from openai import OpenAI
import tiktoken

In [2]:


# 🔗 GitHub raw link to your zipped ChromaDB
chroma_db_zip_url = "https://raw.githubusercontent.com/kathisnehith/Linkedin-Jobs-posts-Scraper/main/data/chroma_email_db.zip"
zip_path = "/tmp/chroma_email_db.zip"
extract_path = "/tmp/chroma_email_db"

# 🔽 Download and unzip
response = requests.get(chroma_db_zip_url)
if response.status_code == 200:
    with open(zip_path, "wb") as f:
        f.write(response.content)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    print("✅ ChromaDB loaded from GitHub.")
else:
    raise Exception(f"❌ Failed to download zip: {response.status_code}")

# ✅ Load collection from extracted path
chroma_client = chromadb.PersistentClient(path=extract_path)


✅ ChromaDB loaded from GitHub.


In [3]:
collection = chroma_client.get_collection(name="test_git_email_rag")

In [5]:
print("Available collections:")
print(chroma_client.list_collections())


Available collections:
[Collection(name=test_git_email_rag)]


In [6]:
print("Total documents in collection:", collection.count())

Total documents in collection: 0


In [12]:
load_dotenv()
token =os.getenv("GITHUB_API_TOKEN")

endpoint = "https://models.inference.ai.azure.com"
model_name="gpt-4o"
embedding_model = "text-embedding-3-large"
client = OpenAI(
    base_url=endpoint,
    api_key=token
)

In [13]:
# --- Embed user query
user_purpose_prompt = "Can you write a professional email connecting to network?"
persona = "senior data engineer"

retrival_query = f"""{user_purpose_prompt}  {persona} """
query_response = client.embeddings.create(
    input=[retrival_query],
    model=embedding_model
)
query_vector = query_response.data[0].embedding

# --- Search Chroma collection
results = collection.query(
    query_embeddings=[query_vector],
    n_results=2
)

In [15]:
print(results)

{'ids': [[]], 'embeddings': None, 'documents': [[]], 'uris': None, 'included': ['metadatas', 'documents', 'distances'], 'data': None, 'metadatas': [[]], 'distances': [[]]}


In [16]:
print("Total documents in collection:", collection.count())

Total documents in collection: 0


## lanchain-chroma


In [1]:
import os
from dotenv import load_dotenv, find_dotenv

from openai import OpenAI
import tiktoken
import chromadb
import langchain
from langchain import hub
from langchain_chroma import Chroma
from chromadb.utils import embedding_functions
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

pdf_folder_path = r"/Users/kathisnehith/Desktop/Gethire-ai/email_rag_guide"
chunk_size_len = 2000
chunk_overlap_len = 50
email_text_chunks = []
email_metadata = []



for filename in os.listdir(pdf_folder_path):
    if filename.endswith(".pdf"):
        rag_file_path =os.path.join(pdf_folder_path, filename)
        print(f"Processing file: {rag_file_path}")
        loader = PyPDFLoader(file_path=rag_file_path)
        document = loader.load()
        text_splitter_char = CharacterTextSplitter(chunk_size=chunk_size_len, chunk_overlap=chunk_overlap_len, separator="\n")
        split_documents = text_splitter_char.split_documents(document)
        print(f"Number of chunks:---- {len(split_documents)}") 
        doc_tag = filename.split('.')[0]
        metadatas = [{"section": doc_tag} for _ in split_documents]     
        print(f" metadata:---- {len(metadatas)} {metadatas}")
        email_text_chunks.extend(split_documents)
        email_metadata.extend(metadatas) 

print(f"Total chunks: {len(email_text_chunks)}")
print(f"First chunk metadata: {email_metadata[0]}")

### tokenization function counting of texts
def num_tokens_from_string(input_text: str, encoding_name: str) -> str:
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(input_text))
    return num_tokens

Processing file: /Users/kathisnehith/Desktop/Gethire-ai/email_rag_guide/hiring_manager_template.pdf
Number of chunks:---- 1
 metadata:---- 1 [{'section': 'hiring_manager_template'}]
Processing file: /Users/kathisnehith/Desktop/Gethire-ai/email_rag_guide/email_guide.pdf
Number of chunks:---- 2
 metadata:---- 2 [{'section': 'email_guide'}, {'section': 'email_guide'}]
Processing file: /Users/kathisnehith/Desktop/Gethire-ai/email_rag_guide/recruiter_template.pdf
Number of chunks:---- 1
 metadata:---- 1 [{'section': 'recruiter_template'}]
Processing file: /Users/kathisnehith/Desktop/Gethire-ai/email_rag_guide/senioremployee_template.pdf
Number of chunks:---- 1
 metadata:---- 1 [{'section': 'senioremployee_template'}]
Processing file: /Users/kathisnehith/Desktop/Gethire-ai/email_rag_guide/peeremployee_template.pdf
Number of chunks:---- 1
 metadata:---- 1 [{'section': 'peeremployee_template'}]
Processing file: /Users/kathisnehith/Desktop/Gethire-ai/email_rag_guide/base_email_template.pdf
Numb

In [None]:
## embedding function using OPENAI-EMBEDDING-3-LARGE

load_dotenv()
token =os.getenv("GITHUB_API_TOKEN")

endpoint = "https://models.inference.ai.azure.com"
model_name="gpt-4o-mini"
embedding_model = "text-embedding-3-large"


In [3]:


# Define persist directory path
persist_dir = "./chroma_persist_email_rag"

# Convert text chunks to LangChain Document objects
langchain_docs = [
    Document(page_content=chunk.page_content, metadata=email_metadata[i])
    for i, chunk in enumerate(email_text_chunks)
]

In [4]:
# Check if vectorstore already exists, else create and persist
if not os.path.exists(persist_dir):
    print("🔄 Embedding and creating persistent vectorstore...")
    vectorstore = Chroma.from_documents(
        documents=langchain_docs,
        embedding=OpenAIEmbeddings(
            model="text-embedding-3-large",
            openai_api_base=endpoint,
            openai_api_key=token
        ),
        persist_directory=persist_dir,
        collection_name="email_rag"
    )
    
else:
    print("✅ Loading vectorstore from disk...")
    vectorstore = Chroma(
        persist_directory=persist_dir,
        embedding=OpenAIEmbeddings(
            model="text-embedding-3-large",
            openai_api_base=endpoint,
            openai_api_key=token
        ),
        collection_name="email_rag"
    )

🔄 Embedding and creating persistent vectorstore...


In [5]:
user_purpose_prompt = "Can you write a professional email for enquire on job openings?"
persona = "  staffing firm representative"

retrival_query = f"""{user_purpose_prompt}  {persona} """

In [6]:
# Run similarity search with score
results_with_scores = vectorstore.similarity_search_with_score(retrival_query, k=2)

print("\n🔍 Top Matches with Scores:")
total_tokens = 0
retrieved_docs = []
for i, (doc, score) in enumerate(results_with_scores):
    token_count = num_tokens_from_string(doc.page_content, "o200k_base")
    print(f"\n🏎️ Result {i+1} (Score: {score:.4f})")
    print(f"Token count: {token_count}")
    #print(doc.page_content)
    print("-->>‼️ Metadata:", doc.metadata)
    total_tokens += token_count
    retrieved_docs.append(doc.page_content)

print(f"-->> 🤖 Total token count: {total_tokens}")

# Combine context
context_text = "\n\n---\n\n".join(retrieved_docs)
context_text_token = num_tokens_from_string(context_text, "o200k_base")
print(f"🤖 Total tokens in combined context_text: {context_text_token}")


🔍 Top Matches with Scores:

🏎️ Result 1 (Score: 0.9333)
Token count: 575
-->>‼️ Metadata: {'section': 'email_guide'}

🏎️ Result 2 (Score: 0.9941)
Token count: 514
-->>‼️ Metadata: {'section': 'hiring_manager_template'}
-->> 🤖 Total token count: 1089
🤖 Total tokens in combined context_text: 1090


In [8]:
###
# Final prompt construction
# Openai LLM call
###
client = OpenAI(
    base_url=endpoint,
    api_key=token
)

final_prompt = f""" 
{retrival_query}
==== CONTEXT START ====
{context_text}
==== CONTEXT END ====
"""
total_tokens = num_tokens_from_string(final_prompt, "o200k_base")
print(f"🤖 Total tokens in combined context_text: {total_tokens}")
chat_response = client.chat.completions.create(
    model=model_name,  
    messages=[
        {"role": "system", "content": "You are an expert in writing emails using context data, along with the user provided query."},
        {"role": "user", "content": final_prompt}
    ],
    temperature=0.7
)
final_email_response=chat_response.choices[0].message.content
print("vvvvvvvvvv  Generated Email Response  vvvvvvvvvvv")
print("-------------------------------")
print(final_email_response)
email_output_tokens=num_tokens_from_string(final_email_response, "o200k_base")
print(f"🤖 Total tokens in combined context_text: {email_output_tokens}")

🤖 Total tokens in combined context_text: 1120
vvvvvvvvvv  Generated Email Response  vvvvvvvvvvv
-------------------------------
Subject: Inquiry About Job Opportunities at [Staffing Firm Name]  

Dear [Recipient's Name],  

I hope this email finds you well! My name is [Your Name], and I’m reaching out to inquire about any current or upcoming job openings your team at [Staffing Firm Name] may have available.  

With a background in [Your Industry/Field, e.g., talent acquisition, project management, etc.] and a passion for connecting the right individuals with opportunities that align with their skills, I’ve been following the impactful work your firm does in the staffing industry.  

In my [X years] of experience, I’ve had the privilege of [Key Achievement/Experience, e.g., managing end-to-end recruitment for high-growth teams or building strategic partnerships that enhanced talent pipelines]. I believe my expertise in [Specific Skill or Area, e.g., sourcing top-tier talent, streamlinin