In [2]:
import os 
import sys 

# Make the backend package importable from the notebook's working directory.
# Guarded so that re-running this cell does not keep appending the same
# entry to sys.path.
BACKEND_DIR = os.path.abspath("../backend/")
if BACKEND_DIR not in sys.path:
    sys.path.append(BACKEND_DIR)

from app.services.parsing_service import ParsingService

In [3]:
# Build the parser used by the next cell; the chunking parameters are passed
# straight through to ParsingService.
ps = ParsingService(
    data_dir="../data/resumes",
    chunk_size=500,
    chunk_overlap=50,
)

In [4]:
# Run the parser over the configured directory and display the returned mapping.
# NOTE(review): the saved output shows an integer per file, whereas a later cell
# treats parse_directory() as returning a list of chunks per file — this output
# probably predates a change to ParsingService; re-run to refresh it.
ps.parse_directory()

{'resume_489.txt': 2,
 'resume_338.txt': 4,
 'resume_304.txt': 9,
 'resume_462.txt': 17,
 'resume_476.txt': 6,
 'resume_310.txt': 9,
 'resume_847.txt': 7,
 'resume_853.txt': 7,
 'resume_884.txt': 4,
 'resume_890.txt': 10,
 'resume_648.txt': 14,
 'resume_660.txt': 12,
 'resume_106.txt': 21,
 'resume_112.txt': 21,
 'resume_674.txt': 9,
 'resume_933.txt': 6,
 'resume_927.txt': 2,
 'resume_21.txt': 4,
 'resume_35.txt': 2,
 'resume_728.txt': 8,
 'resume_714.txt': 10,
 'resume_700.txt': 15,
 'resume_258.txt': 5,
 'resume_270.txt': 9,
 'resume_516.txt': 20,
 'resume_502.txt': 1,
 'resume_264.txt': 3,
 'resume_503.txt': 12,
 'resume_265.txt': 2,
 'resume_271.txt': 2,
 'resume_517.txt': 40,
 'resume_259.txt': 3,
 'resume_701.txt': 21,
 'resume_715.txt': 13,
 'resume_729.txt': 11,
 'resume_34.txt': 1,
 'resume_20.txt': 15,
 'resume_926.txt': 6,
 'resume_932.txt': 3,
 'resume_113.txt': 1,
 'resume_675.txt': 12,
 'resume_661.txt': 7,
 'resume_107.txt': 1,
 'resume_649.txt': 5,
 'resume_891.txt': 7

In [6]:
from app.services.parsing_service import ParsingService
from app.services.embedding_service import EmbeddingService

# Fresh parser/embedder pair for the embedding dry run.
parser = ParsingService(
    data_dir="../data/resumes",
    chunk_size=400,
    chunk_overlap=50,
)
embedder = EmbeddingService()

# Step 1: split every resume into chunks, keyed by filename.
chunk_map = parser.parse_directory()  # expected: Dict[str, List[Document]]

# Step 2: embed each file's chunks and record how many embeddings came back.
stats: dict[str, int] = {}
for filename, chunks in chunk_map.items():
    if chunks:
        # Expected return: List[(np.ndarray, Document)]. These pairs are what a
        # later pgvector upsert would consume; here only their count is kept.
        pairs = embedder.embed_documents(chunks)
        stats[filename] = len(pairs)
    else:
        # Nothing to embed for this file.
        stats[filename] = 0

print(stats)
# Example shape of the printed dict: {'john_doe.pdf': 18, 'jane_doe.txt': 7, ...}


{'resume_489.txt': 2, 'resume_338.txt': 4, 'resume_304.txt': 12, 'resume_462.txt': 22, 'resume_476.txt': 7, 'resume_310.txt': 12, 'resume_847.txt': 10, 'resume_853.txt': 9, 'resume_884.txt': 4, 'resume_890.txt': 12, 'resume_648.txt': 16, 'resume_660.txt': 14, 'resume_106.txt': 27, 'resume_112.txt': 27, 'resume_674.txt': 11, 'resume_933.txt': 6, 'resume_927.txt': 3, 'resume_21.txt': 5, 'resume_35.txt': 3, 'resume_728.txt': 12, 'resume_714.txt': 13, 'resume_700.txt': 19, 'resume_258.txt': 6, 'resume_270.txt': 9, 'resume_516.txt': 23, 'resume_502.txt': 3, 'resume_264.txt': 5, 'resume_503.txt': 17, 'resume_265.txt': 2, 'resume_271.txt': 2, 'resume_517.txt': 52, 'resume_259.txt': 5, 'resume_701.txt': 25, 'resume_715.txt': 14, 'resume_729.txt': 13, 'resume_34.txt': 2, 'resume_20.txt': 19, 'resume_926.txt': 6, 'resume_932.txt': 3, 'resume_113.txt': 2, 'resume_675.txt': 14, 'resume_661.txt': 8, 'resume_107.txt': 2, 'resume_649.txt': 6, 'resume_891.txt': 7, 'resume_885.txt': 12, 'resume_852.txt

In [4]:
# !pip install -qU langchain-postgres langchain-huggingface sentence-transformers psycopg[binary]
from langchain_postgres import PGVector
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.schema import Document
import os

# 1) Embeddings (local, CPU-friendly); all-MiniLM-L6-v2 yields 384-dim vectors.
emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# 2) Connect to your Aiven pgvector DB.
#    NOTE(review): langchain-postgres documents psycopg3-style URLs, e.g.
#    postgresql+psycopg://user:pass@host:port/db?sslmode=require — confirm the
#    value stored in VECTOR_DB_URL uses that scheme.
CONN = os.environ["VECTOR_DB_URL"]

# 3) Create (or connect to) a vector store collection
#    By default PGVector will create needed tables if missing.
vs = PGVector(
    connection=CONN,
    collection_name="resumes",     # logical namespace
    embeddings=emb,                 # any LangChain Embeddings impl
)

# 4) Index: take your parsed chunks (LangChain Document list) and add
#    Example: chunks = ParsingService().parse_single("data/resumes/john.txt")
import uuid  # stdlib; only used to derive stable chunk ids below

from app.services.parsing_service import ParsingService

RESUME_PATH = "../data/resumes/resume_1.txt"

parser = ParsingService()
chunks = parser.parse_single(RESUME_PATH)

# Ensure each chunk carries a useful source in metadata.
# setdefault keeps any source the parser already recorded.
for d in chunks:
    d.metadata.setdefault("source", os.path.basename(RESUME_PATH))

# Derive a deterministic id per chunk (source + position) so that re-running
# this cell upserts the same rows instead of inserting duplicates that then
# crowd the similarity-search results.
# Caveat: if the chunking parameters change, chunks at higher positions from
# an earlier run are not removed automatically.
chunk_ids = [
    str(uuid.uuid5(uuid.NAMESPACE_URL, f"{d.metadata['source']}#chunk-{i}"))
    for i, d in enumerate(chunks)
]

# Add to the store (embeds under the hood)
ids = vs.add_documents(documents=chunks, ids=chunk_ids)
print(f"Inserted {len(ids)} chunks")

# 5) Search
docs = vs.similarity_search("keras", k=3)
for i, d in enumerate(docs, 1):
    preview = d.page_content[:200]
    # Only mark the preview as truncated when text was actually cut off.
    suffix = "…" if len(d.page_content) > 200 else ""
    print(f"\n[{i}] {d.metadata.get('source')} → {preview}{suffix}")


Inserted 5 chunks

[1] ../data/resumes/resume_1.txt → Data Scientist - Matelabs
Skill Details 
Python- Exprience - Less than 1 year months
Statsmodels- Exprience - 12 months
AWS- Exprience - Less than 1 year months
Machine learning- Exprience - Less than…

[2] ../data/resumes/resume_1.txt → company - Matelabs
description - ML Platform for business professionals, dummies and enthusiasts.
60/A Koramangala 5th block,
Achievements/Tasks behind sukh sagar, Bengaluru,
India                    …

[3] ../data/resumes/resume_1.txt → company - Matelabs
description -…


In [7]:
# Display the complete text of the first search hit; the search loop above
# only prints a 200-character preview of each result.
docs[0].page_content

'Data Scientist - Matelabs\nSkill Details \nPython- Exprience - Less than 1 year months\nStatsmodels- Exprience - 12 months\nAWS- Exprience - Less than 1 year months\nMachine learning- Exprience - Less than 1 year months\nSklearn- Exprience - Less than 1 year months\nScipy- Exprience - Less than 1 year months\nKeras- Exprience - Less than 1 year monthsCompany Details \ncompany - Matelabs'

In [9]:
from google import genai
from dotenv import load_dotenv

# Pull variables from the project-level .env into the process environment;
# genai.Client() reads its API key from the `GEMINI_API_KEY` variable.
load_dotenv("../.env")
client = genai.Client()

# One-off smoke test of the Gemini API: send a fixed prompt and print the reply.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain how AI works in a few words",
)
print(response.text)

AI learns patterns from data to make decisions or predictions.


In [1]:
import os 
import sys 

# Make the backend package importable from the notebook's working directory.
# Guarded so that re-running this cell does not keep appending the same
# entry to sys.path.
BACKEND_DIR = os.path.abspath("../backend/")
if BACKEND_DIR not in sys.path:
    sys.path.append(BACKEND_DIR)

from app.services.resume_agent_service import ResumeAgent 
from dotenv import load_dotenv

# Load settings from ../.env into the environment before building the agent.
load_dotenv("../.env")

# Build the agent and send a first question; the answer is kept in `response`
# rather than displayed by this cell.
agent = ResumeAgent()
response = agent.ask(
    "What is the candidate's experience with machine learning?"
)

E0000 00:00:1759953411.098750 3154096 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


In [7]:
# Follow-up question to the same agent; the returned answer is shown as the
# cell output.
agent.ask("What is the candidate the most experienced with?")

'The candidate is most experienced with Statsmodels, with 12 months of experience [source: ../data/resumes/resume_1.txt]. Other skills like Python, AWS, Machine Learning, Sklearn, Scipy, and Keras are listed with less than 1 year of experience [source: ../data/resumes/resume_1.txt].'

In [8]:
# Ask the agent about a specific detail (a GitHub page); the returned answer
# is shown as the cell output.
agent.ask("Does the candidate have a github page?")

'Yes, the candidate has a GitHub page: github.com/rathorology [source: ../data/resumes/resume_1.txt].'