In [2]:
import os
import re
from dataclasses import dataclass, field
from typing import List, Optional
import pymupdf
import pymupdf4llm
import ollama

In [12]:
from IPython.display import display, Markdown

In [3]:
# Demo CVs are expected in the `cvs` folder that sits in the parent of the
# current working directory.
repo_path = os.path.dirname(os.getcwd())
cvs_path = os.path.join(repo_path, 'cvs')
# Full paths of every entry in that folder whose name ends with '.pdf'.
cvs = [
    os.path.join(cvs_path, pdf_name)
    for pdf_name in os.listdir(cvs_path)
    if pdf_name.endswith('.pdf')
]

In [87]:
@dataclass
class Getter:
    """
    Walks through the PDF files of a directory, converting each one to markdown.

    Attributes
    ----------
    directory : str
        Path to the directory containing PDF files.
    save_to_file : bool, optional
        If True, `get_next` writes each converted markdown next to its source
        PDF (same base name, ``.md`` extension). Defaults to False.
    current_index : int
        Index into `files` of the next file `get_next` will process. Starts at 0.
    files : List[str]
        File names (not full paths) of the PDFs found in `directory`, sorted by
        name. Populated in `__post_init__`.
    markdown : Optional[str]
        Markdown of the most recently converted file. None until a file has
        been converted and again after `reset`.

    Methods
    -------
    __post_init__():
        Collects the PDF file names. Raises FileNotFoundError if none are found.

    get_cv(path: str) -> str:
        Converts the PDF file at `path` to markdown.

    get_next() -> Optional[str]:
        Converts the next PDF file in the directory, optionally saving the
        result to disk. Returns the markdown, or None when no files remain.

    reset():
        Rewinds to the first file and clears the stored markdown.
    """

    directory: str
    save_to_file: bool = False
    current_index: int = field(init=False, default=0)
    files: List[str] = field(init=False)
    markdown: Optional[str] = field(init=False, default=None)

    def __post_init__(self):
        """
        Collects the names of the PDF files found in `directory`.

        Raises
        ------
        FileNotFoundError
            If no PDF files are found in the specified directory.
        """
        # os.listdir returns entries in arbitrary order; sorting makes the
        # processing order reproducible between runs and machines.
        # The extension check is case-insensitive so 'CV.PDF' is picked up too.
        self.files = sorted(
            name for name in os.listdir(self.directory)
            if name.lower().endswith('.pdf')
        )
        if not self.files:
            raise FileNotFoundError("No PDF files found in the specified directory.")

    def get_cv(self, path: str) -> str:
        """
        Converts a PDF file at the specified path to markdown format.

        The result is also stored in `self.markdown`.

        Parameters
        ----------
        path : str
            Path to the PDF file to be converted.

        Returns
        -------
        str
            The markdown representation of the PDF file.
        """
        self.markdown = pymupdf4llm.to_markdown(path, show_progress=False)
        return self.markdown

    def get_next(self) -> Optional[str]:
        """
        Converts the next PDF file in the directory to markdown.

        If `save_to_file` is True, the markdown is also written (UTF-8) to a
        ``.md`` file with the same base name as the PDF, in the same directory.

        Returns
        -------
        Optional[str]
            The markdown representation of the next PDF file, or None if no files remain.
        """
        if self.current_index >= len(self.files):
            return None

        current_file_path = os.path.join(self.directory, self.files[self.current_index])
        # The index advances before the conversion, so a file whose conversion
        # raises is not retried by the following call.
        self.current_index += 1

        # get_cv already stores the result in self.markdown.
        markdown = self.get_cv(current_file_path)

        if self.save_to_file:
            md_file_path = os.path.splitext(current_file_path)[0] + ".md"
            with open(md_file_path, 'w', encoding='utf-8') as md_file:
                md_file.write(markdown)

        return markdown

    def reset(self):
        """
        Rewinds processing to the first file and clears the stored markdown.

        Returns
        -------
        None
        """
        self.current_index = 0
        self.markdown = None

In [79]:
@dataclass
class Handler:
    """
    Cleans the markdown text of a résumé before it is summarized.

    Attributes
    ----------
    resume : str
        The markdown text of the résumé.

    Methods
    -------
    split_by_blocks() -> list of str:
        Splits the résumé on blank lines and strips markdown heading markers.

    clean_resume_blocks() -> (str, list of str):
        Filters and normalizes the blocks, returning the cleaned text and the
        links that were removed from it.
    """
    resume: str

    def split_by_blocks(self):
        """
        Splits the résumé into blocks separated by a blank line.

        Markdown heading markers (one or more '#' followed by a whitespace
        character) are removed, leading newlines are stripped and empty blocks
        are dropped.

        Returns
        -------
        list of str
            The non-empty blocks, in document order.
        """
        blocks = []
        for raw_block in re.split(r'\n\n', self.resume):
            # Only strip '#' runs at the start of a line: an unanchored pattern
            # would also eat inline text such as the '# ' in "C# and Python".
            block = re.sub(r'(?m)^#+\s', '', raw_block).lstrip('\n')
            if block:
                blocks.append(block)
        return blocks

    def clean_resume_blocks(self):
        """
        Cleans the résumé blocks by removing unwanted elements.

        Blocks are discarded when they consist only of dashes/whitespace,
        contain a year range, match the abbreviated-location pattern, or have
        fewer than three words. Links are then removed from the remaining
        blocks and collected, each block is flattened to a single line, and
        duplicate or empty results are dropped.

        Returns
        -------
        tuple (str, list of str)
            The cleaned blocks joined with newlines (an empty string when no
            block survives), and the links extracted from the kept blocks.
        """
        link_pattern = r'https?://[^\s\)\]]+'
        extracted_links = []
        unique_blocks = []
        seen_blocks = set()

        for block in self.split_by_blocks():
            # Remove special character sequences (e.g., '-----')
            if re.match(r'^[\-\s]+$', block):
                continue

            # Remove year ranges (e.g., '2022 - 2024', '2016 - Present')
            if re.search(r'\b\d{4}\s*-\s*(\d{4}|Present)\b', block):
                continue

            # Remove geographical data (e.g., 'Bs. As. Argentina')
            if re.search(r'\b(?:[A-Z][a-z]+\.)+\s*[A-Z][a-z]+(?:\s*\b[A-Z][a-z]+)?', block):
                continue

            # Remove very short blocks (e.g., single words or short sequences)
            if len(block.split()) < 3:
                continue

            # Collect the links of the current block. The filters above run
            # first, so links inside discarded blocks are not collected.
            extracted_links.extend(re.findall(link_pattern, block))

            # Remove links from the block
            cleaned_block = re.sub(link_pattern, '', block).strip()

            # Remove leftover patterns like '[Some text] ()'
            cleaned_block = re.sub(r'\[([^\[\]]+)\]\s*\(\s*\)', r'\1', cleaned_block).strip()

            # Flatten the block to a single line
            normalized_block = ' '.join(cleaned_block.splitlines()).strip()

            # Keep the first occurrence of each normalized block; a block that
            # held nothing but links is empty here and is skipped.
            if normalized_block and normalized_block not in seen_blocks:
                seen_blocks.add(normalized_block)
                unique_blocks.append(normalized_block)

        # Joined once, after the loop, so the result is defined even when every
        # block was filtered out or the résumé is empty.
        text = "\n".join(unique_blocks)

        return text, extracted_links

In [85]:
@dataclass
class Profiler:
    """
    Builds a structured summary of a résumé by prompting a model through `ollama.generate`.

    Attributes
    ----------
    resume : str
        The résumé text that is inserted into the prompt.
    save_to_file : bool, optional
        Stored on the instance; `summarize` does not read it. Defaults to False.
    model : str, optional
        Name of the model passed to `ollama.generate`. Defaults to 'llama3.2:1b'.
    categories : List[str], optional
        Stored on the instance; `summarize` does not read it.
        Defaults to ['contact', 'education', 'experience', 'skills'].

    Methods
    -------
    summarize() -> str:
        Sends the prompt with the résumé to the model and returns its reply text.
    """

    resume: str
    save_to_file: bool = False
    model: str = 'llama3.2:1b'
    categories: List[str] = field(default_factory=lambda: ['contact', 'education', 'experience', 'skills'])

    def summarize(self) -> str:
        """
        Asks the model for a Name / Skills / Experience / Education summary of the résumé.

        Returns
        -------
        str
            The value stored under the 'response' key of the `ollama.generate` result.
        """
        # The résumé is interpolated verbatim between the instructions and the final request.
        prompt = f"""You are a human resources expert, specialized in talent acquisition for schools.
        You are tasked with summarizing résumés in the following structured format:
        - Name: [Name here]
        - Skills: [Skills listed here]
        - Experience: [Job experience and other relevant experience here]
        - Education: [Degrees obtained and courses taken]
        
        The summary should extract and organize the following details:
        - Name: The candidate’s full name.
        - Skills: List of technical and non-technical skills.
        - Experience: Teaching/research positions, non-teaching roles, and any other relevant professional experience.
        - Education: Degrees obtained and other studies, including courses taken.
        
        The following text is a candidate’s résumé:
        
        {self.resume}
        
        Provide the structured summary based on the given résumé. Do not output any explanatory text.
        """
        reply = ollama.generate(model=self.model, prompt=prompt)
        return reply['response']

In [88]:
# Convert, clean and summarize every CV in the folder, collecting the summaries.
getter = Getter(directory=cvs_path)
cv_summaries = []

# get_next returns None once every file has been processed.
while (cv := getter.get_next()) is not None:
    handler = Handler(cv)
    data = handler.clean_resume_blocks()  # (cleaned text, extracted links)
    profiler = Profiler(data[0])
    summary = profiler.summarize()
    cv_summaries.append(summary)
    print(summary)
    print('\n\n-------------------\n\n')

Juan Ignacio Beiroa
- Skills: Git, Excel, Python, Javascript, HTML, CSS, Data Analytics & Machine Learning, SQL, Data Visualization, Physics, AI, Research & Problem-Solving, Secondary school Professor, Coordinator Languages
- Experience:
    - Teaching/Research Positions:
        • Data Scientist (consulting)
        • Researcher, Desercion-escolar-argentina.onrender.com
    - Non-Teaching Roles:
        • Advisor to the Secretary General, Federal Council of Education
    - Other Relevant Professional Experience: 
        • Coordinator of Science and Technology Department, Bayard School
    - Education:
        • Data professional with a background in physics, education, and government advisory


-------------------


**Reposición a las Instalaciones y Herramientas en la Facultad de Física, UBA**

- Diseñar y implementar unidades didácticas específicas para ciertos cursos del programa de Física, como:
  + Unidad Didáctica "Beiroa J. I., Ruiz N. y Zarza L." (Primaria en Ciencias Natural