In [1]:
import os
import re
from dataclasses import dataclass, field
from typing import List, Optional
import pymupdf
import pymupdf4llm
import ollama

In [2]:
# Paths to the demo files: the PDFs are read from the `cvs` folder that sits in
# the parent of the current working directory.
repo_path = os.path.dirname(os.getcwd())
cvs_path = os.path.join(repo_path, 'cvs')
# os.listdir() returns entries in arbitrary order; sort so that positional
# lookups such as cvs[2] select the same file on every machine and every run.
cvs = sorted(os.path.join(cvs_path, file) for file in os.listdir(cvs_path) if file.endswith('.pdf'))

In [3]:
@dataclass
class Getter:
    """Walks the PDF files of a directory and converts them to Markdown.

    Constructor arguments:
        directory: Folder that is scanned (non-recursively) for ``.pdf`` files.
        save_to_file: When true, ``get_next`` also writes the Markdown next to
            the source PDF, using the same base name and a ``.md`` extension.

    State kept on the instance:
        current_index: Position in ``files`` of the next PDF ``get_next`` reads.
        files: Names of the PDFs found in ``directory``, sorted alphabetically.
        markdown: Markdown of the most recently converted PDF, or ``None``.

    Raises:
        FileNotFoundError: On construction, if ``directory`` holds no PDF file.
    """
    directory: str
    save_to_file: bool = False
    current_index: int = field(init=False, default=0)
    files: List[str] = field(init=False)
    markdown: Optional[str] = field(init=False, default=None)

    def __post_init__(self):
        # os.listdir() gives no ordering guarantee; sort so that get_next()
        # visits the files in the same order on every run and platform.
        self.files = sorted(
            file for file in os.listdir(self.directory) if file.endswith('.pdf')
        )
        if not self.files:
            raise FileNotFoundError("No PDF files found in the specified directory.")

    def get_cv(self, path: str) -> str:
        """Converts the PDF at ``path`` to Markdown, stores it and returns it.

        This does not touch ``current_index``, so it can be used to read an
        arbitrary file without disturbing the ``get_next`` iteration.
        """
        # The context manager closes the document (and its file handle) even
        # when the conversion raises.
        with pymupdf.open(path) as doc:
            self.markdown = pymupdf4llm.to_markdown(doc, show_progress=False)

        return self.markdown

    def get_next(self) -> Optional[str]:
        """Processes the next PDF file in the directory.

        Returns:
            The Markdown of the next file, or ``None`` (after printing a
            notice) once every file has been processed.
        """
        if self.current_index >= len(self.files):
            print("No more files to process.")
            return None

        current_file_path = os.path.join(self.directory, self.files[self.current_index])
        # Advance before converting: a file that fails to convert is skipped
        # on the next call instead of being retried forever.
        self.current_index += 1

        self.markdown = self.get_cv(current_file_path)

        if self.save_to_file:
            md_file_path = os.path.splitext(current_file_path)[0] + ".md"
            with open(md_file_path, 'w', encoding='utf-8') as md_file:
                md_file.write(self.markdown)

        return self.markdown

    def reset(self):
        """Resets the processing index to the beginning."""
        self.current_index = 0
        self.markdown = None

In [46]:
@dataclass
class Profiler:
    """Queries a local Ollama model about a résumé given as Markdown text.

    Constructor arguments:
        resume: Full text of the résumé.
        save_to_file: Not read by any method of this class as written.
        model: Name of the Ollama model passed to ``ollama.generate``.
        categories: Section labels; not read by any method of this class as
            written.
    """
    resume: str
    save_to_file: bool = False
    model: str = 'llama3.2:1b'
    categories: List = field(default_factory=lambda: ['contact', 'education', 'experience', 'skills'])

    def split_by_blocks(self):
        """Splits the résumé into non-empty text blocks.

        A block ends at a blank line or at a period followed by a newline (the
        period itself is consumed by the split). Markdown heading markers at
        the start of a line are removed from each block.

        Returns:
            List of block strings; empty strings are dropped.
        """
        block_pattern = r'\.\n|\n\n'
        # Only strip hashes that open a line (Markdown headings). An unanchored
        # pattern would also eat the '#' in words such as "C# developer".
        heading_marker = re.compile(r'^#+\s', flags=re.MULTILINE)
        blocks = [
            heading_marker.sub('', block).lstrip('\n')
            for block in re.split(block_pattern, self.resume)
        ]
        return [block for block in blocks if block]

    def _generate(self, prompt):
        """Sends ``prompt`` to the configured model and returns its raw reply text."""
        response = ollama.generate(
            model=self.model,
            prompt=prompt
        )
        return response['response']

    def get_name(self):
        """Asks the model for the candidate's full name.

        Returns:
            The model's answer with surrounding whitespace and punctuation
            removed. The prompt asks for a bare name, but the model does not
            always comply (e.g. it may append a period).
        """
        prompt = f"""The following text is an excerpt from a job candidate's résumé for a teaching position.
        ---
        {self.resume}
        ---
        Extract the candidate's full name. Return only the name as your answer, formated in sentence case and without any other information nor special characters such as punctuation symbols. 
        """
        return self._generate(prompt).strip(' \t\r\n.,;:!?"\'')

    def summarize(self):
        """Asks the model for a 100 to 150 words summary of the résumé.

        Returns:
            The model's reply text, unmodified.
        """
        prompt = f"""You are a human resources expert, specialized in talent acquisition for schools. 
        You are tasked with summarizing résumés highlighting the following aspects:
            - Contact Information (phone number, email address and social network handles/personal web pages).
            - Education (degrees obtained and other studies such as courses made).
            - Job Experience (teaching/research positions, non teaching jobs, etc.).
            - Skills (technical or others).
        The following text is a candidate's résumé.
        ---
        {self.resume}
        ---
        Write a 100 to 150 words summary of the résumé given.
        """
        return self._generate(prompt)

In [47]:
# Single-résumé demo: convert one PDF to Markdown, then ask the model for the
# candidate's name followed by a short summary.
getter = Getter(directory=cvs_path)
md = getter.get_cv(cvs[2])
profiler = Profiler(resume=md)
# Each query is issued and printed in turn, name first.
for ask in (profiler.get_name, profiler.summarize):
    print(ask())

Juan Beiroa.
Juan Beiroa's résumé highlights his extensive experience in teaching and research positions. He has taught at various institutions, including Colegio Bayard Coordinador de Ciencias Exactas y Tecnología, Universidad de Buenos Aires XXI, and Ministerio de Educación de la Nación Asesor de políticas educativas. Beiroa's academic background includes a degree in physics from the University of Buenos Aires and a specialization in education policy at the Universidad Pedagógica Nacional.

He has also held various roles, including Ayudante de 1ª del materia Física e Introducción a la Biofísica at UBA XXI and Asesor de la Secretaría General del Consejo Federal de Educación. Additionally, Beiroa has worked in the fields of data science and technology, including as a Tecnico at Grupo de investigación orientado al estudio de propiedades termomecánicas en CuAlNi.

Throughout his career, Beiroa has demonstrated expertise in producing reports on salary analysis, facilitating negotiations w

In [48]:
# Rewind to the first PDF so that re-running this cell summarises every résumé
# again instead of finding the getter already exhausted.
getter.reset()
# get_next() returns None once all files are processed, which ends the loop.
for cv in iter(getter.get_next, None):
    profiler = Profiler(cv)
    summary = profiler.summarize()
    print(summary)
    print('\n\n-------------------\n\n')



-------------------


El recenso que se presentó revela una experiencia única en el ámbito educativo. Durante su vida, ha dedicado sus esfuerzos y energías a contribuir al desarrollo de contenidos, cursos y talleres para docentes sobre tecnología educativa. También han participado activamente en la design y dictado de actividades como un curso de primeros auxilios y RCP DEA.

En el ámbito específico de enseñanza media y superior, ha ocupado cargos relevantes como Representante del Claustro de Estudiantes. Además, ha sido Expositor en varias instancias académicas importantes, como la Estación Fluidos y Electromagnetismo, donde se han expuesto conceptos complejos a través de experimentos simples.

La experiencia de enseñanza no es el único campo al que ha dedicado su tiempo, ya que también ha participado en actividades de divulgación y popularización científica como la Estación Fluidos y Electromagnetismo.


-------------------


Juan Beiroa is a dedicated physics educator with an impr