In [1]:
# I choose to use notebook because I think this task is relatively straightforward. 
# There are five objectives in this assignments:
#  1. PDF Parsing: this task is straightforward, as using a AI tool to parsing is unnecessary. I would just use a python library.
#  2. Field Identification: This task is very suitable for a language model, this will be our first agent.
#  3. Consistent Output: This can be done with program or a language model, it could be the second agent.
#  4. Data Storage: Use a few python code to store the file is sufficent.
#  5. (optional) Analysis the CVs: 

In [None]:
# dependencies
import fitz # PyMuPDF parsing tool
from smolagents import OpenAIServerModel, CodeAgent, DuckDuckGoSearchTool, AssistantAgent

import os
import json
from dotenv import load_dotenv

ImportError: cannot import name 'AssistantAgent' from 'smolagents' (C:\Users\juhen\AppData\Roaming\Python\Python312\site-packages\smolagents\__init__.py)

In [12]:
import smolagents
print(smolagents.__version__)

1.13.0


In [3]:
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    doc.close()
    return text

raw_text = extract_text_from_pdf("example.pdf")

In [5]:
''' The following code calls the API and run the prompts'''


load_dotenv()

model = OpenAIServerModel(
    model_id="gpt-4o",
    api_base="https://api.openai.com/v1", # Leave this blank to query OpenAI servers.
    api_key=os.environ["OPENAI_API_KEY"], # Switch to the API key for the server you're targeting.
)
agent = CodeAgent(tools=[DuckDuckGoSearchTool()], model=model)


In [31]:
def extract_basic_info(text, agent):
    prompt = f"""
You are a professional resume parsing assistant.  
Your job is to read raw resume text and extract **only** the following information as JSON:

{{
  "name": "",         // Full name of the person
  "email": "",        // Email address
  "phone": "",        // Phone number
  "location": ""      // City or general location, like "San Francisco, CA"
}}

⚠️ Rules:
- Return a valid JSON object, with double quotes.
- If something is not present in the resume, use an empty string for that field.
- Do NOT write any explanations or code. Just return the python code that generate the information in JSON.
- Do NOT import json
- Read the resume **as a human would** — focus on actual identity info (not company names or project titles).
- You are **not allowed to write regular expressions**.
- You have to use final_answer() function to return the final answer
Resume:
\"\"\"
{text}
\"\"\"
"""
    return agent.run(prompt)

In [32]:
def extract_work_experience(text, agent):
    prompt = f"""
You are an AI assistant tasked with extracting work experience details from resumes. 
Please extract the following information for each position listed in the resume:

- Position Title
- Company Name
- Start Date
- End Date (or indicate if currently employed)
- Responsibilities (a brief description)

Return the information in the following JSON format:

[
  {{
    "position": "Position Title",
    "company": "Company Name",
    "start_date": "Start Date",
    "end_date": "End Date",
    "responsibilities": "Responsibilities"
  }},
  ...
]

⚠️ Rules:
- Return a valid JSON object, with double quotes.
- If something is not present in the resume, use an empty string for that field.
- Do NOT write any explanations or code. Just return the python code that generate the information in JSON.
- Do NOT import json
- Read the resume **as a human would** — focus on actual identity info (not company names or project titles).
- You are **not allowed to write regular expressions**.
- You have to use final_answer() function to return the final answer

Resume Text:
\"\"\"
{text}
\"\"\"
"""
    return agent.run(prompt)

In [33]:
def extract_education(text, agent):
    prompt = f"""
You are a resume parsing assistant. Extract education history from the given resume text.

For each entry, return the following fields:
- degree: e.g. "B.S. in Computer Science", "Master of Science"
- institution: e.g. "University of California, San Diego"
- start_date: e.g. "2019-09"
- end_date: e.g. "2023-06" (or "Present" if still studying)

Return the result as a **JSON list**, like this:

[
  {{
    "degree": "B.S. in Computer Science",
    "institution": "University of California, San Diego",
    "start_date": "2019-09",
    "end_date": "2023-06"
  }},
  ...
]

⚠️ Rules:
- Return a valid JSON object, with double quotes.
- If something is not present in the resume, use an empty string for that field.
- Do NOT write any explanations or code. Just return the python code that generate the information in JSON.
- Do NOT import json
- Read the resume **as a human would** — focus on actual identity info (not company names or project titles).
- You are **not allowed to write regular expressions**.
- You have to use final_answer() function to return the final answer

Resume text:
\"\"\"
{text}
\"\"\"
"""
    return agent.run(prompt)


In [27]:
def extract_project_experience(text, agent):
    prompt = f"""
You are a resume parser AI.

Extract project experience from the resume text below. For each project, provide the following:

- project_name: Name or title of the project
- description: 1–2 sentence summary of what the project does
- tech_stack: A list of technologies/tools used in the project (e.g. Python, React, Docker)
- start_date: If available (format like "2023-01")
- end_date: If available (format like "2023-06" or "Present")

Return only JSON in this format:

[
  {{
    "project_name": "Your Portfolio Website",
    "description": "A personal website showcasing my projects and skills, with animations and dynamic layout.",
    "tech_stack": ["Jekyll", "CSS3", "GitHub Pages", "Bootstrap"],
    "start_date": "2023-01",
    "end_date": "2023-03"
  }},
  ...
]

⚠️ Rules:
- Return a valid JSON object, with double quotes.
- If something is not present in the resume, use an empty string for that field.
- Do NOT write any explanations or code. Just return the python code that generate the information in JSON.
- Do NOT import json
- Read the resume **as a human would** — focus on actual identity info (not company names or project titles).
- You are **not allowed to write regular expressions**.
- You have to use final_answer() function to return the final answer

Resume:
\"\"\"
{text}
\"\"\"
"""
    return agent.run(prompt)

In [26]:
def extract_skills_and_other(text, agent):
    prompt = f"""
You are an AI resume analysis agent.

From the resume text below, extract two kinds of information:

1. **keywords**: All **skills**, **tools**, **technologies**, **languages**, and **domain-specific keywords** mentioned anywhere in the resume. Return them as a list of lowercase strings, no duplicates.

2. **other**: Any **additional relevant information** from the resume that doesn't fall under work experience, education, or projects. This may include:
   - Awards and recognitions
   - Publications
   - Volunteer experience
   - Leadership roles
   - Extracurricular activities
   - Personal statements or highlights

Return ONLY valid JSON in the following format:

{{
  "keywords": ["python", "react", "docker", "machine learning", ...],
  "other": [
    "Dean's List 2021–2023",
    "Volunteer at Code for Good",
    "Published in SIGGRAPH Asia 2023",
    ...
  ]
}}

⚠️ Rules:
- Return a valid JSON object, with double quotes.
- If something is not present in the resume, use an empty string for that field.
- Do NOT write any explanations or code. Just return the python code that generate the information in JSON.
- Do NOT import json
- Read the resume **as a human would** — focus on actual identity info (not company names or project titles).
- You are **not allowed to write regular expressions**.
- You have to use final_answer() function to return the final answer

Resume text:
\"\"\"
{text}
\"\"\"
"""
    return agent.run(prompt)

In [None]:
def analyze_resume_commentary(text, agent):
    prompt = f"""
You are an experienced recruiter or HR reviewer.

Please read the following resume text and provide a **short professional critique** from a hiring perspective. Your comment should be useful and constructive.

Include:
- Overall impression (professional? clear? relevant?)
- What’s done well (e.g. layout, projects, clarity)
- What could be improved (e.g. vague descriptions, formatting, lack of numbers)

Keep it short and objective (2–4 sentences max).

Return only JSON like this:
{{
  "sentiment": "Your short HR-style review goes here."
}}
⚠️ Rules:
- Return a valid JSON object, with double quotes.
- If something is not present in the resume, use an empty string for that field.
- Do NOT write any explanations or code. Just return the python code that generate the information in JSON.
- Do NOT import json
- Read the resume **as a human would** — focus on actual identity info (not company names or project titles).
- You are **not allowed to write regular expressions**.
- You have to use final_answer() function to return the final answer
Resume text:
\"\"\"
{text}
\"\"\"
"""
    return agent.run(prompt)

In [28]:
def assert_field_consistency(field_name, item_list, required_keys):
    assert isinstance(item_list, list), f"'{field_name}' must be a list."
    for idx, item in enumerate(item_list):
        assert isinstance(item, dict), f"Item {idx} in '{field_name}' must be a dict."
        for key in required_keys:
            assert key in item, f"Missing required key '{key}' in '{field_name}' item {idx}."
        for key in item:
            assert key in required_keys, f"Unexpected key '{key}' in '{field_name}' item {idx}."

In [44]:
def CV_analysis(pdf_path, agent):
    
    # === Extract text ===
    text = extract_text_from_pdf(pdf_path)
    
    # === Agent Parsing ===
    basic_info = extract_basic_info(text, agent)
    work_experience = extract_work_experience(text, agent)
    education_info = extract_education(text, agent)
    project_experience = extract_project_experience(text, agent)
    keywords_and_other = extract_skills_and_other(text, agent)
    sentiment_comments = analyze_resume_commentary(text, agent)
    
    # === Assertions ===
    assert_field_consistency("work_experience", work_experience, ["position", "company", "start_date", "end_date", "responsibilities"])
    assert_field_consistency("education", education_info, ["degree", "institution", "start_date", "end_date"])
    assert_field_consistency("project_experience", project_experience, ["project_name", "description", "tech_stack", "start_date", "end_date"])
    assert isinstance(keywords_and_other.get("keywords", []), list), "'keywords' must be a list"
    assert isinstance(keywords_and_other.get("other", []), list), "'other' must be a list"

    # === Combine Everything ===
    final_resume_json = {
        **basic_info,
        "work_experience": work_experience,
        "education": education_info,
        "project_experience": project_experience,
        "keywords": keywords_and_other["keywords"],
        "other": keywords_and_other["other"],
        "sentiment": sentiment_comments
    }

    return final_resume_json

In [46]:
# Test on robustness
def batch_test_resumes(agent, input_folder="test", output_folder="output"):
    os.makedirs(output_folder, exist_ok=True)

    pdf_files = [f for f in os.listdir(input_folder) if f.lower().endswith(".pdf")]

    for pdf_file in pdf_files:
        pdf_path = os.path.join(input_folder, pdf_file)
        output_path = os.path.join(output_folder, os.path.splitext(pdf_file)[0] + ".json")

        print(f"🚀 Analyzing: {pdf_path}")
        try:
            result = CV_analysis(pdf_path, agent)
            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(result, f, ensure_ascii=False, indent=2)
            print(f"✅ Saved to: {output_path}")
        except Exception as e:
            print(f"❌ Failed to analyze {pdf_file}: {e}")


In [47]:
batch_test_resumes(agent)

🚀 Analyzing: test\New-York-Resume-Template-Creative.pdf


✅ Saved to: output\New-York-Resume-Template-Creative.json
🚀 Analyzing: test\Stockholm-Resume-Template-Simple.pdf


✅ Saved to: output\Stockholm-Resume-Template-Simple.json
🚀 Analyzing: test\Sydney-Resume-Template-Modern.pdf


✅ Saved to: output\Sydney-Resume-Template-Modern.json
🚀 Analyzing: test\Vienna-Modern-Resume-Template.pdf


✅ Saved to: output\Vienna-Modern-Resume-Template.json
