In [1]:
import gc
import os
import re
from dataclasses import dataclass, field
from typing import Any, List, Optional

import pymupdf
import pymupdf4llm
from mlx_lm import load, generate

In [2]:
from IPython.display import display, Markdown

In [3]:
# Demo CVs live in the `cvs` folder next to the directory the notebook is run from.
repo_path = os.path.split(os.getcwd())[0]
cvs_path = os.path.join(repo_path, 'cvs')

# Full paths of every PDF found in that folder.
cvs = [
    os.path.join(cvs_path, pdf_name)
    for pdf_name in os.listdir(cvs_path)
    if pdf_name.endswith('.pdf')
]

In [4]:
@dataclass
class Getter:
    """
    Walks through the PDF files of a directory and converts them to markdown, one per call.

    Attributes
    ----------
    directory : str
        Path to the directory containing PDF files.
    save_to_file : bool, optional
        If True, each converted markdown is also written next to its PDF
        (same base name, ``.md`` extension). Defaults to False.
    current_index : int
        Index (into ``files``) of the next file to be processed. Starts at 0.
    files : List[str]
        Sorted file names (not full paths) of the PDFs found in ``directory``.
        Populated in ``__post_init__``.
    markdown : Optional[str]
        Markdown of the most recently converted file, or None if nothing has
        been converted yet (or after ``reset()``).

    Methods
    -------
    get_cv(path: str) -> str:
        Converts the PDF at ``path`` to markdown.

    get_next() -> Optional[str]:
        Converts the next PDF of the directory; returns None once all files were processed.

    reset():
        Rewinds to the first file and clears the stored markdown.
    """

    directory: str
    save_to_file: bool = False
    current_index: int = field(init=False, default=0)
    files: List[str] = field(init=False)
    markdown: Optional[str] = field(init=False, default=None)

    def __post_init__(self):
        """
        Collects the PDF files of ``directory``.

        Raises
        ------
        FileNotFoundError
            If no PDF files are found in the specified directory.
        """
        # os.listdir() returns entries in arbitrary order, so sort to make the
        # processing order reproducible across runs and platforms. The extension
        # check is case-insensitive (".PDF" is common) and directories whose
        # name happens to end in ".pdf" are ignored.
        self.files = sorted(
            name
            for name in os.listdir(self.directory)
            if name.lower().endswith('.pdf')
            and os.path.isfile(os.path.join(self.directory, name))
        )
        if not self.files:
            raise FileNotFoundError("No PDF files found in the specified directory.")

    def get_cv(self, path: str) -> str:
        """
        Converts a PDF file at the specified path to markdown format.

        Parameters
        ----------
        path : str
            Path to the PDF file to be converted.

        Returns
        -------
        str
            The markdown representation of the PDF file (also stored in ``self.markdown``).
        """
        self.markdown = pymupdf4llm.to_markdown(path, show_progress=False)
        return self.markdown

    def get_next(self) -> Optional[str]:
        """
        Converts the next PDF file of the directory to markdown.
        If `save_to_file` is True, the markdown is also written to a ``.md`` file
        next to the PDF.

        Returns
        -------
        Optional[str]
            The markdown representation of the next PDF file, or None if no files remain.
        """
        if self.current_index >= len(self.files):
            return None

        current_file_path = os.path.join(self.directory, self.files[self.current_index])
        # The index advances before conversion, so a file whose conversion raises
        # is skipped by the next call rather than retried.
        self.current_index += 1

        self.markdown = self.get_cv(current_file_path)

        if self.save_to_file:
            md_file_path = os.path.splitext(current_file_path)[0] + ".md"
            with open(md_file_path, 'w', encoding='utf-8') as md_file:
                md_file.write(self.markdown)

        return self.markdown

    def reset(self):
        """
        Rewinds processing to the first file and clears the last converted markdown.

        Returns
        -------
        None
        """
        self.current_index = 0
        self.markdown = None

In [5]:
@dataclass
class ResumeCleaner:
    """
    Cleans a Markdown-formatted resume into a flat, readable plain text format
    resembling the structure of the training data.

    The result is a single line: markdown headings, link syntax and list
    markers are stripped, repeated lines are dropped, and all whitespace runs
    are collapsed to one space.
    """

    def clean(self, text: str) -> str:
        """
        Flattens a markdown resume into single-line plain text.

        Parameters
        ----------
        text : str
            Markdown text of the resume.

        Returns
        -------
        str
            The cleaned, whitespace-normalized text.
        """
        # 1. Remove markdown headings (e.g., #, ##, ###)
        text = re.sub(r"^#+\s*", "", text, flags=re.MULTILINE)

        # 2. Remove markdown links: [text](url) → text
        text = re.sub(r"\[([^\]]+)\]\([^\)]+\)", r"\1", text)

        # 3. Remove standalone markdown list markers (*, -, +)
        text = re.sub(r"^[\*\-\+]\s*", "", text, flags=re.MULTILINE)

        # 4. Unwrap leftover "(mailto:address)" / "[mailto:address]" fragments
        text = re.sub(r"\s?[\[\(]mailto:([^\)\]]+)[\)\]]", r"\1", text)

        # 5. Remove repeated lines (e.g., "# John Doe" twice). This has to run
        #    while the text still has line breaks: once whitespace is
        #    normalized below, everything is on one line and there is nothing
        #    left to de-duplicate.
        seen_lines = set()
        unique_lines = []
        for line in text.splitlines():
            line = line.strip()
            if line and line not in seen_lines:
                seen_lines.add(line)
                unique_lines.append(line)

        # 6. Flatten: collapse every whitespace run (newlines included) to one space
        text = re.sub(r"\s+", " ", "\n".join(unique_lines))

        # 7. Standardize contact labels. The lookarounds keep the words from
        #    matching inside an address or URL: without them
        #    "jane@email.com" would be rewritten to "jane@Email:.com".
        text = re.sub(r"(?<![@.\w/])[Pp]hone\b(?![.@])[:\-]?", "Phone:", text)
        text = re.sub(r"(?<![@.\w/])[Ee]mail\b(?![.@])[:\-]?", "Email:", text)
        text = re.sub(r"(?<![@.\w/])[Ll]inkedIn\b(?![.@])[:\-]?", "LinkedIn:", text)

        return text.strip()

In [6]:
@dataclass
class Profiler:
    """
    A class for summarizing résumés using a preloaded MLX model from Hugging Face.

    Attributes
    ----------
    model : str
        Hugging Face model ID (MLX-compatible).
    save_to_file : bool
        Whether to save the generated summary to ``resume_summary.txt``
        (overwritten on every call).
    _chunk_size : int
        Number of tokens per chunk when a resume is split.
    _overlap : int
        Number of tokens shared between consecutive chunks. Must be smaller
        than ``_chunk_size``.
    """

    model: str = 'jbeiroa/Llama-3.2-1B-It-mlx-ft'
    save_to_file: bool = False

    # Populated by load_model(); None while no model is loaded.
    _model_instance: Optional[Any] = field(init=False, default=None)
    _tokenizer: Optional[Any] = field(init=False, default=None)

    _chunk_size: int = 1500
    _overlap: int = 250

    # Un-annotated on purpose so these stay plain class attributes, not dataclass fields.
    # Resumes with at most this many tokens are summarized in a single pass.
    _SINGLE_PASS_TOKEN_LIMIT = 2048
    _PROMPT_HEADER = (
        "Summarize the following resume in 3-4 sentences, focusing on key skills, experience, and education.\n\n"
    )

    def load_model(self):
        """
        Loads the MLX model and tokenizer from Hugging Face.
        Only needs to be called once.
        """
        print(f"🔄 Loading model from {self.model}...")
        self._model_instance, self._tokenizer = load(self.model)
        print("✅ Model loaded.")

    def unload_model(self):
        """
        Drops the references to the MLX model and tokenizer and runs the garbage collector.
        """
        print("🧹 Unloading model and tokenizer from memory...")
        self._model_instance = None
        self._tokenizer = None
        gc.collect()
        print("✅ Model unloaded.")

    def _split_resume_into_chunks(self, resume: str) -> List[str]:
        """
        Splits a resume into overlapping chunks of at most ``_chunk_size`` tokens.

        Parameters
        ----------
        resume : str
            The resume text to split.

        Returns
        -------
        List[str]
            Decoded text of each chunk, in order. Empty if the tokenizer
            produces no tokens.

        Raises
        ------
        ValueError
            If ``_overlap`` is not smaller than ``_chunk_size`` (the window
            would never advance).
        """
        step = self._chunk_size - self._overlap
        if step <= 0:
            raise ValueError("`_overlap` must be smaller than `_chunk_size`.")

        tokens = self._tokenizer.encode(resume)
        chunks = []
        start = 0
        while start < len(tokens):
            end = start + self._chunk_size
            chunks.append(self._tokenizer.decode(tokens[start:end]))
            # This window already reached the end of the text; another step
            # would only yield a chunk made entirely of overlap tokens.
            if end >= len(tokens):
                break
            start += step
        return chunks

    def _summarize_text(self, text: str) -> str:
        """Runs one generation pass over `text` and returns the stripped model response."""
        response = generate(
            self._model_instance,
            self._tokenizer,
            f"{self._PROMPT_HEADER}{text}",
            verbose=False
        )
        return response.strip()

    def summarize(self, resume: str, skip_chunking_if_short: bool = True) -> str:
        """
        Summarizes a given resume using the loaded MLX model.

        Parameters
        ----------
        resume : str
            The resume text to summarize.
        skip_chunking_if_short : bool
            If True, a resume of at most 2048 tokens is summarized in one pass.
            Otherwise (or for longer resumes) the text is split into chunks and
            the per-chunk summaries are joined with blank lines.

        Returns
        -------
        str
            The summary of the resume.

        Raises
        ------
        RuntimeError
            If `load_model()` has not been called.
        """
        if self._model_instance is None or self._tokenizer is None:
            raise RuntimeError("Model not loaded. Call `load_model()` first.")

        token_count = len(self._tokenizer.encode(resume))

        if skip_chunking_if_short and token_count <= self._SINGLE_PASS_TOKEN_LIMIT:
            pieces = [resume]
        else:
            pieces = self._split_resume_into_chunks(resume)

        summary = "\n\n".join(self._summarize_text(piece) for piece in pieces).strip()

        if self.save_to_file:
            # Explicit encoding: the platform default may not be able to encode
            # every character the model produces.
            with open("resume_summary.txt", "w", encoding="utf-8") as f:
                f.write(summary)

        return summary

In [7]:
# Convert every PDF in the CV folder to markdown and collect the cleaned text of each.
getter = Getter(cvs_path)
cleaner = ResumeCleaner()
cvs = []

# get_next() hands out one converted file per call and returns None when none remain.
while (cv := getter.get_next()) is not None:
    cleaned_text = cleaner.clean(cv)
    cvs.append(cleaned_text)

In [13]:
# Sample resume used to smoke-test the summarizer.
# The backslashes before "/" in the URLs are part of the sample text itself
# (presumably left over from JSON escaping — TODO confirm). They are written as
# "\\/" so the string value is unchanged while avoiding the invalid "\/" escape
# sequence, which newer Python versions warn about.
test_resume = """
Hunter Castillo Contact Information: * Email: [hunter.castillo@email.com](mailto:hunter.castillo@email.com) * Phone: (555) 123-4567 * LinkedIn: linkedin.com\\/in\\/huntercastillo * GitHub: github.com\\/huntercastillo Professional Summary: Highly motivated and experienced Cloud Engineer with a strong background in Serverless Architecture, Cloud Networking, and Scripting. Proven track record of designing and deploying scalable, secure, and efficient cloud infrastructure solutions. Skilled in automating repetitive tasks through scripting and passionate about staying up-to-date with the latest cloud technologies. Technical Skills: * Cloud Platforms: AWS, Azure, Google Cloud * Serverless Architecture: AWS Lambda, Azure Functions, Google Cloud Functions * Cloud Networking: AWS VPC, Azure Virtual Network, Google Cloud Network * Scripting: Python, Bash, PowerShell * Cloud Security: IAM, Access Control, Encryption * DevOps Tools: Terraform, Ansible, Docker * Agile Methodologies: Scrum, Kanban Professional Experience: Cloud Engineer, ABC Corporation (2020-Present) * Designed and deployed scalable cloud infrastructure solutions for multiple clients using Serverless Architecture and Cloud Networking * Implemented automation scripts using Python and Bash to streamline deployment and management processes * Collaborated with cross-functional teams to ensure seamless integration with existing infrastructure and applications * Developed and maintained cloud security policies and procedures to ensure compliance with industry standards * Participated in code reviews and contributed to the development of reusable code modules Senior Cloud Consultant, DEF Consulting (2018-2020) * Provided cloud architecture and migration services to clients across various industries * Conducted technical assessments and developed recommendations for cloud infrastructure optimization * Designed and implemented cloud-based solutions for data analytics and machine learning workloads * Trained and mentored 
junior team members on cloud technologies and best practices * Developed and delivered technical presentations and workshops on cloud-related topics Education: * Bachelor's Degree in Computer Science, XYZ University (2015-2019) Certifications: * AWS Certified Solutions Architect - Professional * Azure Certified Azure Developer Associate * Google Cloud Certified - Professional Cloud Developer Achievements: * Developed and deployed a Serverless Architecture solution for a large e-commerce client, resulting in a 30% reduction in infrastructure costs * Designed and implemented a cloud-based security solution for a financial services client, achieving a 99.99% uptime and 0 security breaches * Collaborated with a team to develop and deploy a cloud-based data analytics platform, achieving a 50% increase in data processing speeds References: Available upon request. I hope this sample resume helps! Remember to tailor your own resume to your specific experiences and qualifications, and don't hesitate to reach out if you have any questions or need further assistance.
"""

  test_resume = """


In [19]:
# Load the fine-tuned model, summarize the sample resume, then release the model.
profiler = Profiler(model='jbeiroa/Llama-3.2-1B-It-mlx-ft')
profiler.load_model()
try:
    summary = profiler.summarize(test_resume)
    print(summary)
finally:
    # Unload even if summarization raises, so a failed run does not leave the
    # model loaded in the kernel.
    profiler.unload_model()

🔄 Loading model from jbeiroa/Llama-3.2-1B-It-mlx-ft...


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

✅ Model loaded.
Note: The sample resume is just a starting point, and you should customize it to fit your own experiences and qualifications. Also, be sure to proofread your resume multiple times for any grammar or formatting errors. Good luck!
🧹 Unloading model and tokenizer from memory...
✅ Model unloaded.
