In [3]:
"""
Resume Parser using AWS Bedrock Claude with Pydantic Validation
Accepts PDF files only; text is extracted with pdfplumber before parsing
"""

import json
import boto3
import pdfplumber
from pathlib import Path
from pydantic import BaseModel, Field, EmailStr, ValidationError
from typing import List, Optional, Literal, Union
import instructor

# =============================================================================
# PYDANTIC MODELS FOR RESUME STRUCTURE
# =============================================================================

class Education(BaseModel):
    """Education entry in resume"""
    # Required identifying fields; empty strings are rejected (min_length=1).
    degree: str = Field(min_length=1, description="Degree or certification name")
    institution: str = Field(min_length=1, description="School/University name")
    field_of_study: Optional[str] = None
    # Bounded to reject obviously wrong years.
    graduation_year: Optional[int] = Field(None, ge=1950, le=2030)
    # Grading scales differ between institutions (4.0 GPA, 10-point CGPA, ...).
    # A 4.0 cap made resumes that list a 10-point CGPA fail validation, so the
    # bound covers both scales and the value is kept on its original scale.
    gpa: Optional[float] = Field(
        None,
        ge=0.0,
        le=10.0,
        description=(
            "GPA/CGPA exactly as stated on the resume, on its original scale "
            "(e.g. 3.7 on a 4.0 scale or 8.5 on a 10-point scale); do not rescale"
        ),
    )
    location: Optional[str] = None

class WorkExperience(BaseModel):
    """Work experience entry"""
    # Required: position and employer must be non-empty strings.
    job_title: str = Field(
        min_length=1,
        description="Job title/position",
    )
    company: str = Field(
        min_length=1,
        description="Company name",
    )

    # Optional details; dates are free-form strings with no format enforced.
    location: Optional[str] = Field(default=None)
    start_date: Optional[str] = Field(
        default=None,
        description="Start date (e.g., 'Jan 2020' or '2020-01')",
    )
    end_date: Optional[str] = Field(
        default=None,
        description="End date or 'Present'",
    )
    duration: Optional[str] = Field(default=None)

    # Defaults to a fresh empty list per instance.
    responsibilities: List[str] = Field(
        default_factory=list,
        description="Key responsibilities and achievements",
    )
    
class Skill(BaseModel):
    """Skill with optional proficiency level"""
    # Skill label; must be a non-empty string.
    name: str = Field(min_length=1)

    # Both classifiers are optional and restricted to the listed literals.
    category: Optional[
        Literal["technical", "soft", "language", "tool", "framework", "other"]
    ] = Field(default=None)
    proficiency: Optional[
        Literal["beginner", "intermediate", "advanced", "expert"]
    ] = Field(default=None)

class Certification(BaseModel):
    """Professional certification"""
    # Only the certification name is required (non-empty).
    name: str = Field(min_length=1)

    # Everything else is optional free-form text; dates are not parsed.
    issuing_organization: Optional[str] = Field(default=None)
    issue_date: Optional[str] = Field(default=None)
    expiry_date: Optional[str] = Field(default=None)
    credential_id: Optional[str] = Field(default=None)

class Project(BaseModel):
    """Project or portfolio item"""
    # Title and description are both required and must be non-empty.
    title: str = Field(min_length=1)
    description: str = Field(min_length=1)

    # Defaults to a fresh empty list per instance.
    technologies: List[str] = Field(default_factory=list)

    # Optional free-form strings; the URL is not validated as a URL.
    url: Optional[str] = Field(default=None)
    date: Optional[str] = Field(default=None)

class ParsedResume(BaseModel):
    """Complete structured resume data"""
    # ---- Personal information -------------------------------------------
    # The candidate's name is the only required field in the whole model.
    full_name: str = Field(
        min_length=1,
        description="Candidate's full name",
    )
    email: Optional[EmailStr] = Field(default=None)
    phone: Optional[str] = Field(default=None)
    location: Optional[str] = Field(
        default=None,
        description="City, State/Country",
    )
    # Profile links are stored as plain strings (no URL validation).
    linkedin_url: Optional[str] = Field(default=None)
    github_url: Optional[str] = Field(default=None)
    portfolio_url: Optional[str] = Field(default=None)

    # ---- Professional summary -------------------------------------------
    summary: Optional[str] = Field(
        default=None,
        description="Professional summary or objective",
    )

    # ---- Experience and education ---------------------------------------
    # Every list field below defaults to a fresh empty list.
    work_experience: List[WorkExperience] = Field(default_factory=list)
    education: List[Education] = Field(default_factory=list)

    # ---- Skills and certifications --------------------------------------
    skills: List[Skill] = Field(default_factory=list)
    certifications: List[Certification] = Field(default_factory=list)

    # ---- Additional -----------------------------------------------------
    projects: List[Project] = Field(default_factory=list)
    languages: List[str] = Field(
        default_factory=list,
        description="Spoken languages",
    )

    # ---- Metadata -------------------------------------------------------
    # Whole years, constrained to the range 0..50.
    years_of_experience: Optional[int] = Field(default=None, ge=0, le=50)
    current_job_title: Optional[str] = Field(default=None)


# =============================================================================
# AWS BEDROCK CLAUDE CLIENT
# =============================================================================

class BedrockResumeParser:
    """Resume parser using AWS Bedrock Claude with tool-based structured output and Pydantic validation"""

    def __init__(self, region_name: str = "eu-west-2", model_id: str = "anthropic.claude-3-7-sonnet-20250219-v1:0"):
        """
        Initialize Bedrock client

        Args:
            region_name: AWS region where Bedrock is available
            model_id: Bedrock model ID to use
        """
        bedrock_client = boto3.client(
            service_name='bedrock-runtime',
            region_name=region_name
        )

        # Wrap Bedrock client with Instructor using Claude tool mode
        self.client = instructor.from_bedrock(
            client=bedrock_client,
            mode=instructor.Mode.BEDROCK_TOOLS,
        )

        self.model_id = model_id

    def extract_text_from_pdf(self, pdf_path: Union[str, Path]) -> str:
        """
        Extract text from PDF file using pdfplumber

        Progress is reported on stdout. Pages for which pdfplumber returns no
        text are reported and skipped.

        Args:
            pdf_path: Path to PDF file

        Returns:
            Text of all pages that yielded text, one page per newline-terminated
            chunk, with leading/trailing whitespace stripped

        Raises:
            Exception: Any error raised while opening or reading the PDF is
                printed and then re-raised unchanged
        """
        print(f"üìÑ Extracting text from PDF: {pdf_path}")

        try:
            # Collect per-page text and join once instead of growing a string
            # with += inside the loop.
            page_texts = []
            with pdfplumber.open(pdf_path) as pdf:
                print(f"   Total pages: {len(pdf.pages)}")
                for i, page in enumerate(pdf.pages, 1):
                    page_text = page.extract_text()
                    if page_text:
                        page_texts.append(page_text)
                        print(f"   ‚úì Extracted page {i}")
                    else:
                        print(f"   ‚ö† Page {i} has no extractable text")

            # Each page is followed by a newline; the reported length is
            # measured before stripping.
            text = "".join(f"{chunk}\n" for chunk in page_texts)
            print(f"‚úÖ Extracted {len(text)} characters from PDF")
            return text.strip()

        except Exception as e:
            print(f"‚ùå Error extracting text from PDF: {e}")
            raise

    def _create_prompt(self, resume_text: str) -> str:
        """
        Create structured prompt with JSON schema

        The JSON schema of ``ParsedResume`` is embedded in the prompt together
        with the raw resume text.

        Args:
            resume_text: Raw resume text

        Returns:
            Formatted prompt with schema
        """
        schema = json.dumps(ParsedResume.model_json_schema(), indent=2)

        prompt = f"""Extract information from this resume and return as JSON.

IMPORTANT:
1. Be CONCISE - use short descriptions and summaries
2. For responsibilities, extract only key points (max 3-5 per job)
3. Combine similar skills into categories
4. Return ONLY valid JSON - no markdown, no preamble
5. Ensure the JSON is complete and properly closed

JSON SCHEMA:
{schema}

RESUME TEXT:
{resume_text}

Return the complete JSON object:"""

        return prompt

    def parse_resume(self, pdf_path: Union[str, Path], max_retries: int = 2) -> "tuple[Optional[ParsedResume], Optional[str]]":
        """
        Parse resume from PDF file only

        The path is validated, its text is extracted, and the text is sent to
        the model through the Instructor-wrapped Bedrock client. Errors are
        reported through the returned tuple rather than raised.

        Args:
            pdf_path: Path to PDF file
            max_retries: Forwarded to Instructor as its ``max_retries`` option,
                which limits how often the request is re-attempted when the
                model output fails validation

        Returns:
            Tuple of (ParsedResume object, error_message)
            - On success: (ParsedResume, None)
            - On failure: (None, error_message)
        """
        # Validate input is a PDF file
        path = Path(pdf_path)

        if not path.exists():
            return None, f"File not found: {pdf_path}"

        if not path.is_file():
            return None, f"Path is not a file: {pdf_path}"

        if path.suffix.lower() != '.pdf':
            return None, f"Only PDF files are supported. Got: {path.suffix}"

        # Extract text from PDF
        try:
            resume_text = self.extract_text_from_pdf(path)
        except Exception as e:
            return None, f"Failed to extract text from PDF: {str(e)}"

        if not resume_text or len(resume_text.strip()) < 10:
            return None, "Extracted text is empty or too short. PDF may be image-based or corrupted."

        # Now parse the extracted text
        prompt = self._create_prompt(resume_text)

        try:
            parsed_resume: ParsedResume = self.client.create(
                model=self.model_id,
                response_model=ParsedResume,
                # Previously this argument was accepted but never used, so the
                # caller's retry setting had no effect.
                max_retries=max_retries,
                messages=[
                    {
                        "role": "system",
                        "content": (
                            "You extract structured resume data using tool calls. "
                            "Always follow the provided instructions."
                        ),
                    },
                    {
                        "role": "user",
                        "content": prompt,
                    },
                ],
            )

            print(f"üìä Extracted: {parsed_resume.full_name}")
            print(f"   - Education entries: {len(parsed_resume.education)}")
            print(f"   - Work experiences: {len(parsed_resume.work_experience)}")
            print(f"   - Skills: {len(parsed_resume.skills)}")
            print(f"   - Certifications: {len(parsed_resume.certifications)}")

            return parsed_resume, None

        except ValidationError as e:
            # Tool output did not satisfy schema (rare, but possible)
            return None, f"Schema validation failed: {str(e)}"

        except Exception as e:
            # Bedrock / Instructor / runtime error
            return None, f"Parsing failed: {str(e)}"


# =============================================================================
# EXAMPLE USAGE
# =============================================================================

def main():
    """Example usage of the resume parser

    Builds a ``BedrockResumeParser``, parses one hard-coded PDF path and
    prints either the parsed result (as JSON and as a field-by-field summary)
    or the error message returned by the parser.
    """

    # Initialize parser with your region and model
    parser = BedrockResumeParser(
        region_name="eu-west-2",
        model_id="anthropic.claude-3-7-sonnet-20250219-v1:0"
    )

    banner = "=" * 80

    print(banner)
    print("RESUME PARSER - PDF ONLY")
    print(banner)

    # Parse from PDF file
    print("\n### Parsing PDF Resume ###\n")

    # Example: Replace with your actual PDF path
    pdf_path = "/home/ec2-user/SageMaker/resume_parser/resume.pdf"

    print(f"üí° To use: parser.parse_resume('{pdf_path}')")
    print("   Only PDF files are accepted.\n")

    parsed_resume, error = parser.parse_resume(pdf_path)

    # Guard clause: the parser reports failures as (None, message).
    if parsed_resume is None:
        print(f"\n‚ùå Parsing failed: {error}")
        return

    # Get JSON output
    json_output = parsed_resume.model_dump_json(indent=2)

    print("\n" + banner)
    print("PARSED RESUME (JSON)")
    print(banner)
    # Write in chunks to avoid truncation, but without a newline after each
    # chunk: a line break at an arbitrary offset can fall inside a string
    # value and make the printed JSON invalid.
    chunk_size = 1000
    for i in range(0, len(json_output), chunk_size):
        print(json_output[i:i+chunk_size], end="")
    print()

    # Access structured data
    print("\n" + banner)
    print("STRUCTURED DATA ACCESS")
    print(banner)
    print(f"Name: {parsed_resume.full_name}")
    print(f"Email: {parsed_resume.email}")
    print(f"Phone: {parsed_resume.phone}")
    print(f"Location: {parsed_resume.location}")
    print(f"Years of Experience: {parsed_resume.years_of_experience}")

    print(f"\nEducation ({len(parsed_resume.education)} entries):")
    for edu in parsed_resume.education:
        print(f"  - {edu.degree} in {edu.field_of_study or 'N/A'}")
        print(f"    {edu.institution} ({edu.graduation_year or 'N/A'})")

    print(f"\nWork Experience ({len(parsed_resume.work_experience)} entries):")
    for exp in parsed_resume.work_experience:
        print(f"  - {exp.job_title} at {exp.company}")
        print(f"    Duration: {exp.start_date} to {exp.end_date}")
        print(f"    Responsibilities: {len(exp.responsibilities)} items")

    print(f"\nSkills ({len(parsed_resume.skills)} total):")
    for skill in parsed_resume.skills[:10]:  # Show first 10
        category = f" ({skill.category})" if skill.category else ""
        print(f"  - {skill.name}{category}")
    if len(parsed_resume.skills) > 10:
        print(f"  ... and {len(parsed_resume.skills) - 10} more")

    print(f"\nCertifications ({len(parsed_resume.certifications)} total):")
    for cert in parsed_resume.certifications:
        org = f" - {cert.issuing_organization}" if cert.issuing_organization else ""
        date = f" ({cert.issue_date})" if cert.issue_date else ""
        print(f"  - {cert.name}{org}{date}")

if __name__ == "__main__":
    main()

RESUME PARSER - PDF ONLY

### Parsing PDF Resume ###

üí° To use: parser.parse_resume('/home/ec2-user/SageMaker/resume_parser/resume.pdf')
   Only PDF files are accepted.

üìÑ Extracting text from PDF: /home/ec2-user/SageMaker/resume_parser/resume.pdf
   Total pages: 2
   ‚úì Extracted page 1
   ‚úì Extracted page 2
‚úÖ Extracted 6948 characters from PDF
üìä Extracted: Hargurjeet Singh Ganger
   - Education entries: 3
   - Work experiences: 3
   - Skills: 28
   - Certifications: 1

PARSED RESUME (JSON)
{
  "full_name": "Hargurjeet Singh Ganger",
  "email": "gurjeet333@gmail.com",
  "phone": "+91 9035828125",
  "location": "Bangalore, India",
  "linkedin_url": "linkedin.com/in/hargurjeet/",
  "github_url": "github.com/hargurjeet",
  "portfolio_url": "gurjeet333.medium.com",
  "summary": "Experienced IT professional with 15+ years in the industry, specializing in data science, statistical analysis, machine learning and Generative AI. Expert in LLMs, AI model development. Proficient in