In [2]:
from abc import ABC, abstractmethod

from multiprocessing import Pool

from enum import Enum
from pathlib import Path
import shutil
import requests
import json

import pypdf
from kaggle.api.kaggle_api_extended import KaggleApi
import kagglehub

import ollama

In [3]:
# Create the Kaggle API client used for dataset access and authenticate it.
# NOTE(review): authenticate() presumably picks up local Kaggle credentials
# (kaggle.json or environment variables) — confirm before running on a new machine.
kg_api = KaggleApi()
kg_api.authenticate()

In [4]:
# Directory holding the raw resume dataset; the path is relative, so it is
# resolved against the notebook's current working directory.
DatasetSavePath = Path("../Datasets/Source1/raw/data")

In [5]:
# DatasetTempPath = kagglehub.dataset_download("snehaanbhawal/resume-dataset")
# shutil.move(DatasetTempPath, DatasetSavePath)

# Core Functionality (Required)
## 1. Resume Ingestion and Processing
- Index multiple resumes: Ingest and process multiple resume files (PDF and plain text).
- Parse content: Extract text from resumes.
- Chunk text: Split parsed text into meaningful chunks (100–500 tokens with overlap).
- Generate embeddings: Produce vector embeddings for each chunk using a free-tier
model.

In [6]:
# Alias of DatasetSavePath; this is the name passed to Reader further down.
datasetPath = DatasetSavePath

In [7]:
class FileType(Enum):
    """File formats the ingestion code distinguishes between."""

    # No concrete format; it is the default type of a `Files` collection.
    # NOTE(review): the name presumably stands for "not a file" — confirm.
    NAF = 0
    # Plain-text files.
    TXT = 1
    # PDF documents.
    PDF = 2

In [8]:
# Filename suffix associated with each FileType; NAF has no suffix.
type_suffix_conversion = dict(
    [
        (FileType.NAF, ""),
        (FileType.TXT, ".txt"),
        (FileType.PDF, ".pdf"),
    ]
)
# Reverse lookup (suffix -> FileType). Entries with an empty suffix are left out.
suffix_type_conversion = dict(
    (suffix, file_type)
    for file_type, suffix in type_suffix_conversion.items()
    if suffix
)

In [9]:
class Chunk:
    """A piece of text together with its embedding vector.

    A new chunk starts with an empty embedding list; the vector is attached
    later through `set_embeddings`.
    """

    def __init__(self, content: str = ""):
        # Each instance gets its own fresh list for the embedding.
        self.embeddings: list[float] = []
        self.content: str = content

    # --- content accessors -------------------------------------------------
    def get_content(self) -> str:
        """Return the text of this chunk."""
        return self.content

    def set_content(self, content: str) -> None:
        """Replace the text of this chunk."""
        self.content = content

    # --- embedding accessors -----------------------------------------------
    def get_embeddings(self) -> list[float]:
        """Return the embedding vector (empty until one has been set)."""
        return self.embeddings

    def set_embeddings(self, embeddings: list[float]) -> None:
        """Store the given embedding vector (kept by reference, not copied)."""
        self.embeddings = embeddings

In [10]:
class Chunks:
    """Ordered collection of `Chunk` objects, e.g. all chunks of one document."""

    def __init__(self, chunks: list[Chunk] = None):
        # A missing argument means "start empty"; a supplied list is stored
        # as-is (by reference), not copied.
        if chunks is None:
            chunks = []
        self.chunks: list[Chunk] = chunks

    def get_chunks(self) -> list[Chunk]:
        """Return the underlying list of chunks."""
        return self.chunks

    def add_chunk(self, chunk: Chunk) -> None:
        """Append one chunk to the end of the collection."""
        self.chunks.append(chunk)

    def get_chunks_contents(self) -> list[str]:
        """Return the text of every chunk, in order."""
        contents = []
        for item in self.chunks:
            contents.append(item.get_content())
        return contents

    def get_chunks_embeddings(self) -> list[list[float]]:
        """Return the embedding vector of every chunk, in order."""
        vectors = []
        for item in self.chunks:
            vectors.append(item.get_embeddings())
        return vectors

In [11]:
class File:
    """One source file: its path, its extracted text, and its chunks.

    Indexing, `len()` and iteration all operate on the characters of
    `content`.
    """

    def __init__(self, file_path: Path) -> None:
        self.file_path = file_path
        # Filled in by whoever reads the file; starts empty.
        self.content: str = ""
        self.chunks: list[Chunk] = []

    def __len__(self) -> int:
        """Number of characters in the extracted text."""
        return len(self.content)

    def __iter__(self):
        """Iterate over the characters of the extracted text."""
        return iter(self.content)

    def __getitem__(self, idx: int) -> str:
        """Return the character at position `idx`.

        Raises:
            IndexError: if `idx` is at or beyond the end of the text.
        """
        text = self.content
        if idx >= len(text):
            raise IndexError("Index out of range")
        return text[idx]

In [12]:
class Files:
    """A list of `File` objects tagged with one `FileType`.

    Supports `len()`, iteration and indexing directly on the collection.
    """

    def __init__(self, files_type: FileType = FileType.NAF) -> None:
        self.files: list[File] = []
        self.files_type: FileType = files_type

    # --- container protocol ------------------------------------------------
    def __len__(self) -> int:
        """Number of files currently stored."""
        return len(self.files)

    def __iter__(self):
        """Iterate over the stored files in insertion order."""
        return iter(self.files)

    def __getitem__(self, index: int) -> File:
        """Return the file at `index`."""
        return self.files[index]

    # --- accessors -----------------------------------------------------------
    def get_type(self) -> FileType:
        """Return the file type this collection was created with."""
        return self.files_type

    def get_files(self) -> list[File]:
        """Return the underlying list of files."""
        return self.files

    def get_file(self, index: int) -> File:
        """Return the file at `index` (same lookup as `self[index]`)."""
        return self.files[index]

    # --- mutation ------------------------------------------------------------
    def add_files(self, files: list[File]) -> None:
        """Append every file from `files`, preserving their order."""
        self.files.extend(files)

    def add_file(self, file: File) -> None:
        """Append a single file."""
        self.files.append(file)

    

In [13]:
class AbstractReaderType(ABC):
    """Interface for format-specific readers used by `Reader`.

    A concrete reader reports which `FileType` it handles and knows how to
    load every file of that type from one directory.
    """

    @abstractmethod
    def get_type(self) -> FileType:
        """Return the `FileType` this reader handles."""
        raise NotImplementedError("This method should be overridden by subclasses")

    @abstractmethod
    def read_files(self, path: Path) -> Files:
        """Read the supported files found in directory `path`.

        The `path` parameter matches how `Reader.read_files` invokes readers
        (one call per sub-directory) and how `ReadPdf` implements this method.

        Args:
            path: Directory to scan.

        Returns:
            A `Files` collection with the files that were read.
        """
        raise NotImplementedError("This method should be overridden by subclasses")
    
class ReadPdf(AbstractReaderType):
    """Reader that extracts the text of every PDF in a directory."""

    def __init__(self) -> None:
        # Result of the most recent `read_files` call.
        self.files: Files = Files(FileType.PDF)

    def get_type(self) -> FileType:
        """Return `FileType.PDF`."""
        return FileType.PDF

    def read_files(self, path: Path) -> Files:
        """Extract text from each `.pdf` file directly inside `path`.

        Only regular files whose suffix equals the PDF suffix are read;
        everything else in the directory is skipped. The text of a document
        is the text of its pages, each followed by a newline.

        Args:
            path: Directory to scan (not recursive).

        Returns:
            A fresh `Files` collection containing one `File` per PDF that
            produced non-empty text.
        """
        self.files = Files(FileType.PDF)
        pdf_suffix = type_suffix_conversion[FileType.PDF]
        for pdf_file in path.iterdir():
            # Skip non-PDF entries *before* touching `text`. Previously the
            # `if text:` check ran for every entry, so a non-PDF entry either
            # raised UnboundLocalError (first entry) or was stored with the
            # text of the previously read PDF.
            if not (pdf_file.is_file() and pdf_file.suffix == pdf_suffix):
                continue
            reader = pypdf.PdfReader(pdf_file)
            # `or ""` guards against a page whose extraction yields None.
            text = "".join(
                f"{page.extract_text() or ''}\n" for page in reader.pages
            )
            if text:
                file = File(pdf_file)
                file.content = text
                self.files.add_file(file)
        return self.files

class Reader:
    """Walks a dataset directory and reads each sub-directory with one reader.

    The result is a mapping from sub-directory name to whatever the reader
    returned for that sub-directory.
    """

    def __init__(self, dir_path: Path, Reader: AbstractReaderType) -> None:
        # Populated by `read_files`; keyed by sub-directory name.
        self.content: dict[str, Files] = {}
        self.reader = Reader
        self.dir_path = dir_path

    def get_content(self) -> dict[str, Files]:
        """Return the mapping collected so far."""
        return self.content

    def read_files(self) -> None:
        """Run the reader on every entry of `dir_path` that is not a regular file."""
        for entry in self.dir_path.iterdir():
            if not entry.is_file():
                self.content[entry.name] = self.reader.read_files(entry)

In [14]:
# Read every PDF under the dataset directory, one sub-directory at a time.
read_pdf = Reader(dir_path=datasetPath, Reader=ReadPdf())
read_pdf.read_files()
# Mapping: sub-directory name -> Files collection read from it.
files = read_pdf.get_content()

In [15]:
# for key in files.keys():
#     print(f"Files in {key}: length={len(files[key])}")

In [16]:
# def save_texts_to_files(texts: dict[FileType, files:Files], out_dir: Path) -> None:
#     output_dir = out_dir
#     output_dir.mkdir(parents=True, exist_ok=True)
#     for file_type, text_list in texts.items():
#         suffix = type_suffix_conversion[file_type]
#         for idx, text in enumerate(text_list):
#             file_name = f"{file_type.name.lower()}_{idx}{suffix}"
#             with open(output_dir / file_name, "w", encoding="utf-8") as f:
#                 f.write(text)

In [17]:
# def save_files_dict_to_disk(files_dict: dict[str, Files], out_dir: Path) -> None:
# 	out_dir.mkdir(parents=True, exist_ok=True)
# 	for dir_name, files_obj in files_dict.items():
# 		dir = (out_dir / dir_name.lower())
# 		dir.mkdir(parents=True, exist_ok=True)
# 		# print(f"Saving files in directory: {dir_name}")
		
# 		for idx, file in enumerate(files_obj.get_files()):
# 			file_name = f"{dir_name.lower()}_{idx}.txt"
# 			with open(dir / file_name, "w", encoding="utf-8") as f:
# 				f.write(file.content)

# save_files_dict_to_disk(files, Path("../Datasets/Source1/processed"))

In [18]:
# def read_txt_files(texts_dir_path: Path) -> list[str]:
#     texts = []
#     for dir in texts_dir_path.iterdir():
#         for file in dir.glob("*.txt"):
#             with open(file, "r", encoding="utf-8") as f:
#                 texts.append(f.read())
#     return texts

In [19]:
# textsPath = Path("../Datasets/Source1/processed")
# texts = read_txt_files(textsPath)

In [26]:
# Module-level dictionary; chunk_texts() below builds and returns its own
# local `chunks_dict`, so this global is not filled by that function.
chunks_dict = {}

In [27]:
def chunk_sliding_window(text: str, window_size: int = 150, step_size: int = 100) -> Chunks:
    """Split `text` into overlapping character windows.

    A window starts every `step_size` characters and is at most
    `window_size` characters long. Windows are emitted until one reaches the
    end of the text, so the final chunk may be shorter and no trailing text
    is dropped. Sizes are measured in characters, not tokens.

    The previous tail handling lost text: an input shorter than the window
    could yield no chunk at all (or only its last partial slice), and inputs
    where `(len(text) - 1) % step_size == 0` never got their trailing
    characters emitted.

    Args:
        text: Text to split. An empty string yields an empty collection.
        window_size: Maximum number of characters per chunk; must be > 0.
        step_size: Distance between consecutive chunk starts; must be > 0.
            A value larger than `window_size` leaves gaps between chunks.

    Returns:
        A `Chunks` collection with one `Chunk` per window, in text order.

    Raises:
        ValueError: if `window_size` or `step_size` is not positive.
    """
    if window_size <= 0 or step_size <= 0:
        raise ValueError("window_size and step_size must be positive")

    pieces: list[Chunk] = []
    for start in range(0, len(text), step_size):
        pieces.append(Chunk(text[start:start + window_size]))
        # Once a window touches the end of the text, everything is covered;
        # further windows would only repeat the tail.
        if start + window_size >= len(text):
            break
    return Chunks(pieces)

In [28]:
def chunk_texts(
    files_dict: dict[str, Files],
    window_size: int = 150,
    step_size: int = 100,
) -> dict[str, list[Chunks]]:
    """Chunk the text of every file, grouped by the keys of `files_dict`.

    Args:
        files_dict: Mapping from a group name to the `Files` of that group.
        window_size: Chunk length forwarded to `chunk_sliding_window`
            (default 150, the value that used to be hard-coded here).
        step_size: Chunk stride forwarded to `chunk_sliding_window`
            (default 100, the value that used to be hard-coded here).

    Returns:
        A new dict with the same keys; each value is a list holding one
        `Chunks` collection per file, in the order of `get_files()`.
    """
    chunks_dict: dict[str, list[Chunks]] = {}
    for key, files_with_key in files_dict.items():
        chunks_dict[key] = [
            chunk_sliding_window(file.content, window_size, step_size)
            for file in files_with_key.get_files()
        ]
    return chunks_dict

In [29]:
# Chunk every file that was read; the result maps each group name to a list
# with one Chunks collection per file.
listed_chunked_texts = chunk_texts(files)
print(type(listed_chunked_texts))

<class 'dict'>


In [30]:
# List the group names (the keys of the chunked result).
for key in listed_chunked_texts.keys():
    print(f"Key: {key}")

Key: ACCOUNTANT
Key: ADVOCATE
Key: AGRICULTURE
Key: APPAREL
Key: ARTS
Key: AUTOMOBILE
Key: AVIATION
Key: BANKING
Key: BPO
Key: BUSINESS-DEVELOPMENT
Key: CHEF
Key: CONSTRUCTION
Key: CONSULTANT
Key: DESIGNER
Key: DIGITAL-MEDIA
Key: ENGINEERING
Key: FINANCE
Key: FITNESS
Key: HEALTHCARE
Key: HR
Key: INFORMATION-TECHNOLOGY
Key: PUBLIC-RELATIONS
Key: SALES
Key: TEACHER


In [31]:
# Sanity check: show the chunk texts of the first file under "ACCOUNTANT".
tst = listed_chunked_texts["ACCOUNTANT"][0].get_chunks_contents()
print(tst)

['ACCOUNTANT\nSummary\nFinancial Accountant specializing in financial planning, reporting and analysis within the Department of Defense.\nHighlights\nAccoun', 'ithin the Department of Defense.\nHighlights\nAccount reconciliations\nResults-oriented\nFinancial reporting\nCritical thinking\nAccounting operations profe', 'ting\nCritical thinking\nAccounting operations professional\nAnalysis of financial systems\nERP (Enterprise Resource Planning) software.\nExcellent facilit', 'ise Resource Planning) software.\nExcellent facilitator\nAccomplishments\nServed on a tiger team which identified and resolved General Ledger postings in', 'identified and resolved General Ledger postings in DEAMS totaling $360B in accounting adjustments. This allowed\nfor the first successful fiscal year-e', 'his allowed\nfor the first successful fiscal year-end close for 2012.\nIn collaboration with DFAS Europe, developed an automated tool that identified du', 'pe, developed an automated tool that identified dupli

In [33]:
# response = ollama.embed(model='dengcao/Qwen3-Embedding-4B:Q5_K_M', input=tst)

## 2. Vector Database Integration
- Store embeddings: Save generated embeddings in a vector database.
- Efficient retrieval: Implement top‑K similarity search for queries.

In [59]:
# Placeholder cell: it only prints a greeting; no vector-database code is present here.
print("Hello :D")

Hello :D


## 3. Retrieval-Augmented Generation (RAG) Chatbot
- Job description input: Accept a text description of the job role.
- Retrieve relevant chunks: Perform vector search to find top resume chunks.
- LLM-based matching: Use an LLM to generate conversational answers about
candidate fit, citing retrieved content.
- Conversational interface: Support follow-up questions for deeper insights.

In [3]:
# Placeholder cell: it only prints a greeting; no chatbot code is present here.
print("Hello :D")

Hello :D


# Optional Bonus Features
## A. Web User Interface
- Frontend (Bonus): Implement a simple web UI for file upload, job description entry, and
chat interaction.
- Tech Stack: Next.js with TypeScript
- Hosting: Vercel (Hobby/Free Tier)

In [4]:
# Placeholder cell: it only prints a greeting; no web-UI code is present here.
print("Hello :D")

Hello :D


## B. SQL-Based Metadata Search
- Extract metadata: Tag resumes with structured metadata (skills, titles, experience).
- Metadata storage: Save tags in a relational database (e.g., PostgreSQL alongside
vector store).
- Metadata API: Expose an endpoint for SQL queries (e.g., SELECT * FROM
resume_metadata WHERE skills @> ARRAY['TypeScript'] AND
years_experience >= 5).

In [5]:
# Placeholder cell: it only prints a greeting; no metadata-search code is present here.
print("Hello :D")

Hello :D
