In [1]:
import os
from typing import List, Dict

import chromadb
from chromadb.utils import embedding_functions


In [2]:
# On-disk location of the persistent Chroma database (a relative path, so it
# is resolved against the notebook's working directory) and the name of the
# collection that stores the career documents.
CHROMA_DB_DIR = "chroma_career_db"
COLLECTION_NAME = "career_knowledge_base"

# Sentence-Transformers model used to embed both the career documents and the
# resume query. Kept as a named constant so it can be changed in one place.
# NOTE(review): vectors produced by different models are not comparable, so
# switching the model presumably requires rebuilding the collection — confirm.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

# Use a local embedding model (no API needed).
# You can swap this later to Together / OpenRouter embeddings if you want.
embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBEDDING_MODEL_NAME
)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# ==============================
# LOAD CAREER DOCS
# ==============================

def load_career_docs_from_folder(folder_path: str) -> List[Dict]:
    """
    Read career / role descriptions from the text files in a folder.

    Every non-empty ``.txt`` / ``.md`` file (extension matched
    case-insensitively) located directly inside ``folder_path`` becomes one
    document. Sub-folders are not descended into, and directory entries are
    skipped even if their name ends in ``.txt`` / ``.md``.

    Expected structure:
        data/careers/
            data_analyst.txt
            software_engineer.txt
            product_manager.txt
            ...

    Args:
        folder_path: Path of the folder that contains the career files.

    Returns:
        List of dicts ``{ "id": str, "text": str, "metadata": dict }``, ordered
        by filename. ``id`` is the filename without its extension, ``text`` is
        the file content with surrounding whitespace stripped, and ``metadata``
        holds ``filename`` and a title-cased ``career_name`` derived from ``id``.

    Raises:
        ValueError: If ``folder_path`` is not a directory, if no usable
            (non-empty) ``.txt`` / ``.md`` file is found, or if two files map
            to the same ``id`` (e.g. ``role.txt`` and ``role.md``).
    """
    if not os.path.isdir(folder_path):
        raise ValueError(f"Folder not found: {folder_path}")

    docs: List[Dict] = []
    # Maps each document id to the file it came from, to report collisions.
    seen_ids: Dict[str, str] = {}

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # result reproducible across runs and platforms.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith((".txt", ".md")):
            continue

        file_path = os.path.join(folder_path, filename)

        # A directory can also be named "something.txt"; open() would fail on it.
        if not os.path.isfile(file_path):
            continue

        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read().strip()

        # Empty / whitespace-only files carry nothing to embed.
        if not text:
            continue

        doc_id = os.path.splitext(filename)[0]

        # Ids are used as keys in the vector DB, so they must be unique.
        # Fail here with the offending filenames rather than later at index time.
        if doc_id in seen_ids:
            raise ValueError(
                f"Duplicate document id '{doc_id}' from files "
                f"'{seen_ids[doc_id]}' and '{filename}' in folder: {folder_path}"
            )
        seen_ids[doc_id] = filename

        docs.append(
            {
                "id": doc_id,
                "text": text,
                "metadata": {
                    "filename": filename,
                    "career_name": doc_id.replace("_", " ").title(),
                },
            }
        )

    if not docs:
        raise ValueError(f"No .txt/.md files found in folder: {folder_path}")

    return docs


In [4]:
# ==============================
# BUILD / LOAD VECTOR DB
# ==============================

def get_chroma_collection():
    """
    Return the persistent Chroma collection that stores the career knowledge.

    A persistent client is opened on ``CHROMA_DB_DIR`` and the collection named
    ``COLLECTION_NAME`` is fetched from it, being created first if it does not
    exist yet. The collection is bound to the module-level ``embedding_fn``.
    """
    return chromadb.PersistentClient(path=CHROMA_DB_DIR).get_or_create_collection(
        name=COLLECTION_NAME,
        embedding_function=embedding_fn,
    )


def build_or_update_career_index(career_docs: List[Dict]) -> None:
    """
    Ingest / update the vector DB with career documents.

    This should be called once during setup, or whenever you update the corpus.

    Args:
        career_docs: Documents to index. Each dict must provide the keys
            ``"id"``, ``"text"`` and ``"metadata"``.

    Raises:
        ValueError: If two documents in ``career_docs`` share the same ``"id"``.

    Notes:
        Documents are upserted: an id that already exists in the collection is
        overwritten, a new id is added. Entries already stored under ids that
        are *not* in ``career_docs`` are left untouched, so removing a file
        from the corpus does not remove it from the index.
    """
    # Nothing to do for an empty corpus; avoid sending an empty batch to the DB.
    if not career_docs:
        print("[INFO] No career documents to index; skipping.")
        return

    ids = [doc["id"] for doc in career_docs]
    texts = [doc["text"] for doc in career_docs]
    metadatas = [doc["metadata"] for doc in career_docs]

    # Ids key the stored documents, so a repeated id within one batch is
    # ambiguous. Report it up front with the offending ids.
    seen = set()
    duplicates = set()
    for doc_id in ids:
        if doc_id in seen:
            duplicates.add(doc_id)
        seen.add(doc_id)
    if duplicates:
        raise ValueError(
            f"Duplicate document ids in career_docs: {sorted(duplicates)}"
        )

    collection = get_chroma_collection()
    collection.upsert(ids=ids, documents=texts, metadatas=metadatas)

    print(f"[INFO] Indexed {len(ids)} career documents into Chroma.")



In [5]:
# ==============================
# RETRIEVAL: GIVEN RESUME TEXT
# ==============================

def retrieve_relevant_careers(
    resume_text: str,
    top_k: int = 5,
) -> List[Dict]:
    """
    Core retrieval function (simple RAG): find the careers closest to a resume.

    Args:
        resume_text: Plain text extracted from the resume (PDF parsing is upstream).
        top_k: Maximum number of careers to return. Fewer are returned when
            the collection holds fewer documents.

    Returns:
        List of dicts, in the order returned by the vector DB query, with:
            - id: document id
            - career_name: taken from the metadata, falling back to the id
            - score: the distance reported by the query (``None`` if the
              result carries no distances)
            - text: full document text
            - metadata: the stored metadata dict (``{}`` if none was stored)
        An empty list is returned when the collection is empty.

    Raises:
        ValueError: If ``resume_text`` is empty / whitespace-only, or if
            ``top_k`` is smaller than 1.
    """
    if not resume_text or not resume_text.strip():
        raise ValueError("resume_text is empty. Make sure you parsed the PDF first.")
    if top_k < 1:
        raise ValueError(f"top_k must be a positive integer, got {top_k}")

    collection = get_chroma_collection()

    # Never request more neighbours than the collection holds, and skip the
    # query entirely for an empty collection.
    # NOTE(review): depending on the Chroma version, over-asking either raises
    # or only logs a warning — clamping avoids both.
    available = collection.count()
    if available == 0:
        return []

    results = collection.query(
        query_texts=[resume_text],
        n_results=min(top_k, available),
    )

    # Chroma returns a dict of lists-of-lists (one inner list per query text);
    # we issued a single query, so only the first inner list is relevant.
    ids = (results.get("ids") or [[]])[0]

    def _first_row(key: str) -> list:
        """Return the first result row for `key`, or a None-filled placeholder."""
        rows = results.get(key)
        if rows and rows[0] is not None:
            return rows[0]
        return [None] * len(ids)

    docs = _first_row("documents")
    metadatas = _first_row("metadatas")
    distances = _first_row("distances")

    retrieved = []
    for idx, doc_id in enumerate(ids):
        # A document stored without metadata comes back as None.
        metadata = metadatas[idx] or {}
        retrieved.append(
            {
                "id": doc_id,
                "career_name": metadata.get("career_name", doc_id),
                "score": distances[idx],
                "text": docs[idx],
                "metadata": metadata,
            }
        )

    return retrieved

In [7]:
# ==============================
# EXAMPLE: HOW TO USE (for testing)
# ==============================

# End-to-end smoke test: index the career folder, then query it with a sample
# resume and print the closest careers. The names assigned below live at module
# level, so they remain available to later cells after this one has run.
if __name__ == "__main__":
    # 1) Build index once (e.g., from a folder of career descriptions)
    # The path is relative, so it is resolved against the current working directory.
    career_folder = "data/careers"
    docs = load_career_docs_from_folder(career_folder)
    build_or_update_career_index(docs)

    # 2) Pretend we already parsed a resume PDF
    example_resume_text = """
    I am a Computer Science student with strong experience in Python, data analysis,
    machine learning, and SQL. I have worked on projects with pandas, scikit-learn,
    and Streamlit. I enjoy building dashboards and working with real-world datasets.
    """

    # 3) Retrieve top-k relevant careers from vector DB
    retrieved_careers = retrieve_relevant_careers(example_resume_text, top_k=3)

    # The printed "score" is the distance value taken from the query result,
    # so a smaller number means a closer match to the resume.
    print("\nTop matching career paths for this resume:\n")
    for r in retrieved_careers:
        print(f"- {r['career_name']} (score: {r['score']})")

[INFO] Indexed 3 career documents into Chroma.

Top matching career paths for this resume:

- Data Analyst (score: 0.8273972272872925)
- Software Engineer (score: 1.2232584953308105)
- Product Manager (score: 1.503218173980713)
