In [31]:
from typing import List
import json
import glob
from docx import Document
import vertexai
from vertexai.generative_models import GenerativeModel, Part

In [32]:
def build_prompt_parts(document_part: str):
    """
    Assemble the ordered list of prompt parts that is sent to the model.

    The list contains the extraction instruction, the names of the fields to
    extract, and finally the resume content itself.

    Args:
        document_part (str): Resume content placed after the instructions.
            It is passed through unchanged; callers in this file supply either
            the plain text of a Word document or a ``Part`` built from PDF bytes.

    Returns:
        list: ``[instruction, requested field names, document_part]``.
    """
    instruction = "Extract the following details from this resume in JSON format:"
    requested_fields = "Name, Email, Skills"
    return [instruction, requested_fields, document_part]


In [33]:
def get_resume_details_from_pdf(pdf_file_path: str, model: GenerativeModel):
    """
    Extract resume details from a PDF file with the supplied generative model.

    The file is read as raw bytes, wrapped in a ``Part`` tagged with the
    ``application/pdf`` MIME type, combined with the extraction prompt and
    sent to the model.

    Args:
        pdf_file_path (str): path to the pdf file to be processed
        model (GenerativeModel): Generative model used to generate the answer

    Returns:
        The ``text`` attribute of the model response.
    """
    with open(pdf_file_path, "rb") as pdf_file:
        raw_bytes = pdf_file.read()
    resume_part = Part.from_data(data=raw_bytes, mime_type="application/pdf")
    return model.generate_content(build_prompt_parts(resume_part)).text


In [34]:
def get_resume_details_from_doc(docx_file_path: str, model: GenerativeModel):
    """
    Extract resume details from a Word document with the supplied generative model.

    The text of every paragraph is collected, each followed by a newline,
    and the resulting string is sent to the model together with the
    extraction prompt.

    Args:
        docx_file_path (str): path to the docx file to be processed
        model (GenerativeModel): Generative model used to generate the answer

    Returns:
        The ``text`` attribute of the model response.
    """
    # NOTE(review): only `paragraphs` is read here; text stored elsewhere in
    # the document (e.g. tables) is presumably not included - confirm.
    paragraphs = Document(docx_file_path).paragraphs
    resume_text = ''.join(f'{paragraph.text}\n' for paragraph in paragraphs)
    return model.generate_content(build_prompt_parts(resume_text)).text



In [35]:
def parse_string_to_json(response: str):
    """
    Parse the model's text response into a dictionary with the resume fields.

    The JSON object is located by its outermost braces, so the response may be
    wrapped in a Markdown code fence (with or without a ``json`` tag), be
    surrounded by extra text, or be plain JSON. Characters inside the JSON
    values (including backticks) are left untouched.

    Args:
        response (str): text response from generate_content function by the generative AI model

    Returns:
        dict: ``{'Name': ..., 'Email': ..., 'Skills': [...]}`` built from the
        parsed object; any other keys in the response are dropped.

    Raises:
        ValueError: if the response contains no ``{...}`` span or the span is
            not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
        KeyError: if 'Name', 'Email' or 'Skills' is missing from the object.
    """
    start = response.find('{')
    end = response.rfind('}')
    if start == -1 or end < start:
        raise ValueError('No JSON object found in model response')
    # strict=False tolerates raw control characters (e.g. line breaks) inside
    # string values, which the model may emit in multi-line fields.
    dict_output = json.loads(response[start:end + 1], strict=False)
    return {
        'Name': dict_output['Name'],
        'Email': dict_output['Email'],
        'Skills': list(dict_output['Skills']),
    }


In [36]:
def write_json(json_file_name: str, json_output: dict):
    """
    Write the resulting JSON object into a JSON file

    The parent directory of the target file is created when it does not exist
    yet, so writing into a fresh output folder does not fail.

    Args:
        json_file_name (str): file name to be assigned to the JSON file being saved.
        json_output (dict): content of the JSON file being saved
    """
    import os

    parent_dir = os.path.dirname(json_file_name)
    # A bare file name has no directory component; nothing to create then.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(json_file_name, "w", encoding="utf-8") as f:
        json.dump(json_output, f, indent=4)


In [37]:
def main(docx_paths: List[str], pdf_paths: List[str], model: GenerativeModel):
    """
    Parse every docx and pdf resume with the model and save each result as a JSON file

    For each input path the output path is derived by replacing '/data/' with
    '/reports/' and the source extension with '.json'.

    Args:
        docx_paths (List[str]): list of paths to docx files.
        pdf_paths (List[str]): list of paths to pdf files.
        model (GenerativeModel): Generative model to be used
    """
    # (paths, extractor, source extension): Word documents first, then PDFs.
    batches = (
        (docx_paths, get_resume_details_from_doc, '.docx'),
        (pdf_paths, get_resume_details_from_pdf, '.pdf'),
    )
    for resume_paths, extract_details, extension in batches:
        for resume_path in resume_paths:
            parsed = parse_string_to_json(extract_details(resume_path, model))
            report_path = resume_path.replace('/data/', '/reports/')
            report_path = report_path.replace(extension, '.json')
            write_json(report_path, parsed)


In [38]:
# Execute the main function: process every .docx and .pdf resume in ../data
if __name__ == "__main__":
    vertexai.init()
    model = GenerativeModel("gemini-2.5-flash")
    # Select by file extension rather than by substring, so names such as
    # "cv.pdf.bak" are skipped and no file lands in both lists.
    docx_paths = glob.glob('../data/*.docx')
    pdf_paths = glob.glob('../data/*.pdf')
    main(docx_paths, pdf_paths, model)
