## Resume extraction: converting a resume file to structured JSON

In [21]:
!pip install -qU "langchain[groq]"
!pip install tiktoken PyMuPDF docx2txt pandas markdown pydantic
!pip install langchain-text-splitters

print("All packages installed successfully!")


[notice] A new release of pip is available: 23.1.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.1.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


All packages installed successfully!



[notice] A new release of pip is available: 23.1.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [36]:

import os
import json
import tiktoken
import mimetypes
from typing import Dict, List, Any, Optional, Union
from datetime import datetime
import logging
from pathlib import Path

import fitz  # PyMuPDF
import docx2txt
import pandas as pd
import markdown

from langchain_core.tools import tool
from langchain_core.messages import HumanMessage, AIMessage, ToolMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers.openai_tools import PydanticToolsParser
from langchain_core.runnables import RunnablePassthrough
from langchain.chat_models import init_chat_model
from langchain.text_splitter import RecursiveCharacterTextSplitter

from pydantic import BaseModel, Field, ValidationError

print("All libraries imported successfully!")


All libraries imported successfully!


In [48]:

# Credentials must not be hardcoded in the notebook. Use the key already present
# in the environment and only prompt (without echo) when it is missing, so a real
# key exported in the shell is never overwritten by a placeholder.
if not os.environ.get("GROQ_API_KEY"):
    from getpass import getpass

    os.environ["GROQ_API_KEY"] = getpass("Enter your Groq API key: ")

# Kept as a module-level name for any cell that still refers to it.
GROQ_API_KEY = os.environ["GROQ_API_KEY"]

print("Groq API key configured!")

Groq API key configured!


In [38]:

def identify_file_type(file_path):
    """Identify a supported file type from the path's extension.

    The comparison is case-insensitive, so ``Resume.PDF`` and ``resume.pdf``
    are both recognised as PDF.

    Args:
        file_path: Path to the file, as a string or a path-like object.

    Returns:
        One of ``"pdf"``, ``"docx"``, ``"csv"``, ``"md"``, ``"txt"``, ``"json"``.

    Raises:
        ValueError: If the path does not end in one of the supported extensions.
    """
    supported_types = ("pdf", "docx", "csv", "md", "txt", "json")

    # str() lets pathlib.Path objects through; lower() makes the match
    # independent of how the extension was capitalised on disk.
    normalized = str(file_path).lower()

    for file_type in supported_types:
        if normalized.endswith("." + file_type):
            return file_type

    raise ValueError("Unsupported file type")

def parse_file(file_path, file_type):
    """Load a file from disk and return its content as a single string.

    Handling per ``file_type``:
      - "pdf":  text of every page (via PyMuPDF) concatenated, then stripped
      - "docx": output of ``docx2txt.process``, stripped
      - "csv":  the parsed DataFrame rendered with ``to_string(index=False)``
      - "md":   the file content passed through ``markdown.markdown``
      - "txt":  the file content, stripped
      - "json": the parsed document re-serialised with two-space indentation

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If ``file_type`` is not one of the types listed above.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    if file_type == "pdf":
        with fitz.open(file_path) as pdf_doc:
            page_texts = [page.get_text() for page in pdf_doc]
        return "".join(page_texts).strip()

    if file_type == "docx":
        return docx2txt.process(file_path).strip()

    if file_type == "csv":
        return pd.read_csv(file_path).to_string(index=False)

    if file_type == "md":
        with open(file_path, "r", encoding="utf-8") as handle:
            raw_markdown = handle.read()
        return markdown.markdown(raw_markdown)

    if file_type == "txt":
        with open(file_path, "r", encoding="utf-8") as handle:
            return handle.read().strip()

    if file_type == "json":
        with open(file_path, "r", encoding="utf-8") as handle:
            parsed = json.load(handle)
        return json.dumps(parsed, indent=2)

    raise ValueError(f"Unsupported file type: {file_type}")

print("File processing functions defined!")


File processing functions defined!


In [39]:
def count_tokens(text, model_name="gpt-3.5-turbo"):
    """Count the tokens in ``text`` using tiktoken, with a word-based fallback.

    Args:
        text: The string to measure.
        model_name: Model whose tiktoken encoding is used for counting.

    Returns:
        The number of tokens produced by the model's encoding. If the encoding
        cannot be obtained or applied, an estimate of 1.3 tokens per
        whitespace-separated word (truncated to an int) is returned instead.
    """
    try:
        enc = tiktoken.encoding_for_model(model_name)
        return len(enc.encode(text))
    except Exception:
        # Best-effort fallback estimation. Catching Exception (not a bare
        # `except:`) keeps the fallback but lets KeyboardInterrupt/SystemExit
        # propagate, so interrupting the kernel still works.
        return int(len(text.split()) * 1.3)

def chunk_text(text, chunk_size=15000, chunk_overlap=500):
    """Break ``text`` into overlapping pieces using a recursive character splitter.

    ``chunk_size`` and ``chunk_overlap`` are measured with ``count_tokens``
    (it is passed as the splitter's length function), not in characters.

    Returns:
        The list of chunk strings produced by the splitter.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": count_tokens,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

print("Token counting and chunking functions defined!")


Token counting and chunking functions defined!


In [40]:
# JSON Schema (draft-04) describing the target resume structure.
# Top-level sections: "basics" (object) plus "work", "education", "skills" and
# "projects" (arrays of objects).
# In the cells of this notebook the schema is handed to analyze_schema_complexity()
# and convert_text_to_json(); it is not used to validate the extracted data.
# NOTE(review): the layout resembles the JSON Resume schema — confirm the source.
RESUME_SCHEMA = {
    "$schema": "http://json-schema.org/draft-04/schema#",
    "additionalProperties": False,
    "definitions": {
        "iso8601": {
            "type": "string",
            "description": "e.g. 2014-06-29",
            # Accepts three shapes: YYYY-MM-DD, YYYY-MM, or YYYY.
            "pattern": "^([1-2][0-9]{3}-[0-1][0-9]-[0-3][0-9]|[1-2][0-9]{3}-[0-1][0-9]|[1-2][0-9]{3})$"
        }
    },
    "properties": {
        # Personal details; the only section that allows additional properties.
        "basics": {
            "type": "object",
            "additionalProperties": True,
            "properties": {
                "name": {"type": "string"},
                "label": {"type": "string", "description": "e.g. Web Developer"},
                "image": {"type": "string", "description": "URL to image"},
                "email": {"type": "string", "format": "email"},
                "phone": {"type": "string"},
                "url": {"type": "string", "format": "uri"},
                "summary": {"type": "string", "description": "Write a short 2-3 sentence biography"},
                "location": {
                    "type": "object",
                    "properties": {
                        "address": {"type": "string"},
                        "postalCode": {"type": "string"},
                        "city": {"type": "string"},
                        "countryCode": {"type": "string"},
                        "region": {"type": "string"}
                    }
                },
                "profiles": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "network": {"type": "string"},
                            "username": {"type": "string"},
                            "url": {"type": "string", "format": "uri"}
                        }
                    }
                }
            }
        },
        # Employment history entries.
        "work": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "location": {"type": "string"},
                    "description": {"type": "string"},
                    "position": {"type": "string"},
                    "url": {"type": "string", "format": "uri"},
                    "startDate": {"$ref": "#/definitions/iso8601"},
                    "endDate": {"$ref": "#/definitions/iso8601"},
                    "summary": {"type": "string"},
                    "highlights": {"type": "array", "items": {"type": "string"}}
                }
            }
        },
        # Education entries.
        "education": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "institution": {"type": "string"},
                    "url": {"type": "string", "format": "uri"},
                    "area": {"type": "string"},
                    "studyType": {"type": "string"},
                    "startDate": {"$ref": "#/definitions/iso8601"},
                    "endDate": {"$ref": "#/definitions/iso8601"},
                    "score": {"type": "string"},
                    "courses": {"type": "array", "items": {"type": "string"}}
                }
            }
        },
        # Skill groups: a group name, an optional level, and keyword strings.
        "skills": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "level": {"type": "string"},
                    "keywords": {"type": "array", "items": {"type": "string"}}
                }
            }
        },
        # Project entries.
        "projects": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "description": {"type": "string"},
                    "highlights": {"type": "array", "items": {"type": "string"}},
                    "keywords": {"type": "array", "items": {"type": "string"}},
                    "startDate": {"$ref": "#/definitions/iso8601"},
                    "endDate": {"$ref": "#/definitions/iso8601"},
                    "url": {"type": "string", "format": "uri"},
                    "roles": {"type": "array", "items": {"type": "string"}},
                    "entity": {"type": "string"},
                    # Here "type" is a property *named* type, not a schema keyword.
                    "type": {"type": "string"}
                }
            }
        }
    },
    "title": "Resume Schema",
    "type": "object"
}

print("Resume schema loaded successfully!")
print(f"Schema has {len(RESUME_SCHEMA['properties'])} main sections")


Resume schema loaded successfully!
Schema has 5 main sections


In [41]:
from typing import Optional, List
from pydantic import BaseModel, Field

# Tool schema for the candidate's contact and summary details. The class is listed
# in `tools` and bound to the LLM; merge_chunk_results() copies every non-empty
# field into the "basics" section under the same field name.
# NOTE(review): the docstring and Field descriptions are presumably what the model
# receives as the tool/parameter descriptions — treat them as prompt text (confirm).
# NOTE(review): `location` is a plain string here, while RESUME_SCHEMA declares
# basics.location as an object, so the merged output does not match that shape.
class BasicInfo(BaseModel):
    """Extract basic personal information from resume."""
    name: Optional[str] = Field(default="", description="Full name of the person")
    email: Optional[str] = Field(default="", description="Email address")
    phone: Optional[str] = Field(default="", description="Phone number")
    location: Optional[str] = Field(default="", description="Location/Address")
    summary: Optional[str] = Field(default="", description="Professional summary or objective")

# Tool schema for one job entry; the model is asked to call it once per position.
# merge_chunk_results() renames the fields for the "work" section:
#   company_name -> name, start_date -> startDate, end_date -> endDate,
#   description -> summary; position and highlights keep their names.
# NOTE(review): the end_date description allows 'Present', which the iso8601
# pattern in RESUME_SCHEMA would not accept.
class WorkExperience(BaseModel):
    """Extract work experience information."""
    company_name: Optional[str] = Field(default="", description="Name of the company")
    position: Optional[str] = Field(default="", description="Job title/position")
    start_date: Optional[str] = Field(default="", description="Start date in YYYY-MM format")
    end_date: Optional[str] = Field(default="", description="End date in YYYY-MM format or 'Present'")
    description: Optional[str] = Field(default="", description="Job description and responsibilities")
    highlights: List[str] = Field(default=[], description="Key achievements or highlights")

# Tool schema for one education entry.
# merge_chunk_results() renames the fields for the "education" section:
#   degree -> studyType, field_of_study -> area,
#   start_date -> startDate, end_date -> endDate; institution and score are kept.
class Education(BaseModel):
    """Extract education information."""
    institution: Optional[str] = Field(default="", description="Name of educational institution")
    degree: Optional[str] = Field(default="", description="Degree or study type")
    field_of_study: Optional[str] = Field(default="", description="Field of study or area")
    start_date: Optional[str] = Field(default="", description="Start date in YYYY format")
    end_date: Optional[str] = Field(default="", description="End date in YYYY format")
    score: Optional[str] = Field(default="", description="GPA or score if mentioned")

# Tool schema for one group of skills.
# merge_chunk_results() maps category -> name and skills_list -> keywords in the
# "skills" section; the schema's optional "level" field is never populated.
class Skills(BaseModel):
    """Extract skills information."""
    category: Optional[str] = Field(default="", description="Skill category (e.g., Technical, Programming)")
    skills_list: List[str] = Field(default=[], description="List of skills in this category")

# Tool schema for one project entry.
# merge_chunk_results() maps technologies -> keywords, start_date -> startDate and
# end_date -> endDate in the "projects" section; name, description and highlights
# keep their names.
class Projects(BaseModel):
    """Extract project information."""
    name: Optional[str] = Field(default="", description="Project name")
    description: Optional[str] = Field(default="", description="Project description")
    technologies: List[str] = Field(default=[], description="Technologies used")
    highlights: List[str] = Field(default=[], description="Key achievements or features")
    start_date: Optional[str] = Field(default="", description="Start date")
    end_date: Optional[str] = Field(default="", description="End date")

# Tool schemas offered to the model; each class may be called several times per
# response (one call per work entry, education entry, and so on).
tools = [BasicInfo, WorkExperience, Education, Skills, Projects]

print("Fixed Pydantic models defined for structured extraction!")

# The tools are bound to the model in the following cell, right after `llm` is
# created. Binding here referenced `llm` before it existed and raised NameError
# on a fresh kernel (Restart & Run All).

Fixed Pydantic models defined for structured extraction!
LLM re-initialized with fixed tool calling capabilities!


In [42]:
# Create the Groq-hosted chat model, then attach the extraction tool schemas so
# responses can carry structured tool calls.
llm = init_chat_model(
    "meta-llama/llama-4-maverick-17b-128e-instruct",
    model_provider="groq",
)
llm_with_tools = llm.bind_tools(tools)

print("LLM initialized with tool calling capabilities!")

LLM initialized with tool calling capabilities!


In [43]:
def analyze_schema_complexity(schema: Dict[str, Any]) -> Dict[str, Any]:
    """Measure how large and deeply nested a JSON schema is.

    The schema is walked recursively. Every dict whose ``"type"`` value is
    ``"object"`` counts as one nested object, and the depth is the deepest
    level of dict/list nesting reached (so wrapper dicts such as
    ``"properties"`` add to the depth too).

    Returns:
        A dict with ``nested_objects``, ``max_nesting_level``,
        ``schema_token_count`` (tokens in the JSON-serialised schema),
        ``complexity`` ("simple", "medium" or "complex") and
        ``recommended_chunks`` (1, 2 or 3 respectively).
    """

    def _walk(node, depth=0):
        # Returns (number of "object" dicts at or below node, deepest level seen).
        if isinstance(node, dict):
            object_count = 1 if node.get('type') == 'object' else 0
            children = node.values()
        elif isinstance(node, list):
            object_count = 0
            children = node
        else:
            return 0, depth

        deepest = depth
        for child in children:
            if isinstance(child, (dict, list)):
                child_count, child_depth = _walk(child, depth + 1)
                object_count += child_count
                deepest = max(deepest, child_depth)

        return object_count, deepest

    object_total, deepest_level = _walk(schema)
    token_total = count_tokens(json.dumps(schema))

    if object_total < 20 and deepest_level < 4:
        label, chunk_hint = "simple", 1
    elif object_total < 50 and deepest_level < 6:
        label, chunk_hint = "medium", 2
    else:
        label, chunk_hint = "complex", 3

    return {
        "nested_objects": object_total,
        "max_nesting_level": deepest_level,
        "schema_token_count": token_total,
        "complexity": label,
        "recommended_chunks": chunk_hint
    }

# Run the analysis on the resume schema and print one "- label: value" line per metric.
complexity_analysis = analyze_schema_complexity(RESUME_SCHEMA)
print("Schema Complexity Analysis:")
print("\n".join(
    f"- {label}: {complexity_analysis[field]}"
    for label, field in (
        ("Nested objects", "nested_objects"),
        ("Max nesting level", "max_nesting_level"),
        ("Schema token count", "schema_token_count"),
        ("Complexity", "complexity"),
        ("Recommended chunks", "recommended_chunks"),
    )
))

Schema Complexity Analysis:
- Nested objects: 8
- Max nesting level: 7
- Schema token count: 850
- Complexity: complex
- Recommended chunks: 3


In [44]:
def convert_text_to_json(text: str, schema: Dict[str, Any], confidence_threshold: float = 0.7):
    """
    Extract structured resume data from unstructured text via LLM tool calls.

    The text is sent to ``llm_with_tools`` in one piece, or in chunks produced
    by ``chunk_text`` when it exceeds ``max_context`` tokens. The arguments of
    every tool call are grouped by tool name per chunk, and the per-chunk
    groups are combined by ``merge_chunk_results``.

    Args:
        text: Input text to convert
        schema: JSON schema; in this function it is only used for the
            complexity analysis that is printed. It is not sent to the model
            and the result is not validated against it.
        confidence_threshold: Currently unused by this function.

    Returns:
        Dict with the keys "basics", "work", "education", "skills" and
        "projects". No confidence scores are computed. When nothing could be
        extracted, the same keys are returned with empty values.
    """
    
    # Informational only: the analysis result does not change the processing below.
    complexity_analysis = analyze_schema_complexity(schema)
    print(f"Processing with {complexity_analysis['complexity']} complexity...")
    
    text_tokens = count_tokens(text)
    print(f"Input text tokens: {text_tokens}")
    
    # Token threshold above which the text is split before being sent to the model.
    max_context = 25000  
    
    if text_tokens > max_context:
        print(f"Text too large ({text_tokens} tokens), chunking into smaller parts...")
        chunks = chunk_text(text, chunk_size=15000, chunk_overlap=500)
        print(f"Created {len(chunks)} chunks")
    else:
        chunks = [text]
        print("Processing as single chunk")
    
    # One dict per successfully processed chunk: {tool name: [tool args, ...]}.
    all_results = []
    
    for i, chunk in enumerate(chunks):
        print(f"\nProcessing chunk {i+1}/{len(chunks)}...")
        
        # The prompt text (including its indentation) is sent to the model verbatim.
        extraction_prompt = f"""
        You are an expert at extracting structured information from unstructured text.
        
        Extract information from the following text and structure it according to the provided tools.
        
        IMPORTANT INSTRUCTIONS:
        1. Extract ALL relevant information you can find
        2. Use empty string ("") for missing text fields, not null
        3. Use empty array ([]) for missing list fields, not null
        4. Format dates as YYYY-MM-DD, YYYY-MM, or YYYY when available
        5. If information is not available in the text, provide empty string/array as appropriate
        6. Be thorough and accurate - extract everything you can see
        7. For work experience, extract company name, position, dates, and all bullet points
        8. For education, extract institution, degree, field of study, and scores
        9. For projects, extract name, description, technologies, and achievements
        10. For skills, group them into logical categories
        
        Text to extract from:
        {chunk}
        
        Use the provided tools to extract information systematically. Call each tool type multiple times if you find multiple instances (e.g., multiple work experiences, education entries, etc.).
        """
        
        messages = [HumanMessage(extraction_prompt)]
        
        try:
            ai_response = llm_with_tools.invoke(messages)
            
            if hasattr(ai_response, 'tool_calls') and ai_response.tool_calls:
                print(f"Found {len(ai_response.tool_calls)} tool calls")
                chunk_results = {}
                
                # Group the raw tool arguments by tool name; the arguments are
                # stored as returned and are not validated against the models.
                for tool_call in ai_response.tool_calls:
                    tool_name = tool_call['name']
                    tool_args = tool_call['args']
                    print(f"Extracted {tool_name}: {len(str(tool_args))} characters")
                    
                    if tool_name not in chunk_results:
                        chunk_results[tool_name] = []
                    chunk_results[tool_name].append(tool_args)
                
                all_results.append(chunk_results)
            else:
                print("No structured data extracted from this chunk")
                
        except Exception as e:
            # Best effort: a failing chunk is reported and skipped so the
            # remaining chunks are still processed.
            print(f"Error processing chunk {i+1}: {str(e)}")
            continue
    
    if all_results:
        final_result = merge_chunk_results(all_results)
        return final_result
    else:
        print("No results extracted from any chunks")
        return {
            "basics": {},
            "work": [],
            "education": [],
            "skills": [],
            "projects": []
        }

def _text_or_empty(record: Dict[str, Any], key: str) -> Any:
    """Return ``record[key]``, substituting "" when the key is missing or None."""
    value = record.get(key)
    return "" if value is None else value


def _list_or_empty(record: Dict[str, Any], key: str) -> Any:
    """Return ``record[key]``, substituting [] when the key is missing or None."""
    value = record.get(key)
    return [] if value is None else value


def merge_chunk_results(chunk_results: List[Dict]) -> Dict:
    """Merge results from multiple chunks into final structured format.

    Args:
        chunk_results: One dict per processed chunk, mapping a tool name
            ("BasicInfo", "WorkExperience", "Education", "Skills",
            "Projects") to the list of argument dicts extracted for it.

    Returns:
        A dict with the keys "basics", "work", "education", "skills" and
        "projects". "basics" collects every non-blank BasicInfo field (a later
        chunk overwrites an earlier value for the same field); the other
        sections are lists with one entry per tool call, in input order, with
        field names translated to the resume-schema names. Missing or ``None``
        values become "" for text fields and [] for list fields.
    """
    merged = {
        "basics": {},
        "work": [],
        "education": [],
        "skills": [],
        "projects": []
    }

    for chunk_result in chunk_results:
        for basic_info in chunk_result.get("BasicInfo", []):
            for key, value in basic_info.items():
                # Skip None, empty and whitespace-only values so they never
                # overwrite something useful found in another chunk.
                if value and str(value).strip():
                    merged["basics"][key] = value

        # Models may send explicit nulls despite the prompt; dict.get(key, default)
        # would pass those through, so the helpers normalise None as well.
        for work in chunk_result.get("WorkExperience", []):
            merged["work"].append({
                "name": _text_or_empty(work, "company_name"),
                "position": _text_or_empty(work, "position"),
                "startDate": _text_or_empty(work, "start_date"),
                "endDate": _text_or_empty(work, "end_date"),
                "summary": _text_or_empty(work, "description"),
                "highlights": _list_or_empty(work, "highlights")
            })

        for edu in chunk_result.get("Education", []):
            merged["education"].append({
                "institution": _text_or_empty(edu, "institution"),
                "studyType": _text_or_empty(edu, "degree"),
                "area": _text_or_empty(edu, "field_of_study"),
                "startDate": _text_or_empty(edu, "start_date"),
                "endDate": _text_or_empty(edu, "end_date"),
                "score": _text_or_empty(edu, "score")
            })

        for skill in chunk_result.get("Skills", []):
            merged["skills"].append({
                "name": _text_or_empty(skill, "category"),
                "keywords": _list_or_empty(skill, "skills_list")
            })

        for project in chunk_result.get("Projects", []):
            merged["projects"].append({
                "name": _text_or_empty(project, "name"),
                "description": _text_or_empty(project, "description"),
                "keywords": _list_or_empty(project, "technologies"),
                "highlights": _list_or_empty(project, "highlights"),
                "startDate": _text_or_empty(project, "start_date"),
                "endDate": _text_or_empty(project, "end_date")
            })

    return merged

print("Improved conversion function defined!")

Improved conversion function defined!


In [45]:
def process_resume_file(file_path: str) -> Dict[str, Any]:
    """
    Complete pipeline to process a resume file and convert to structured JSON.

    Steps: detect the file type from the extension, extract the text, run the
    LLM-based conversion, then check that each expected section is non-empty.
    Progress is printed along the way.

    Args:
        file_path: Path to the resume file

    Returns:
        On success, a dict with ``status`` ("success"), ``data`` (the
        structured result), ``validation`` (per-section presence status) and
        ``metadata`` (file type, text length, token count, ISO timestamp).
        On any exception, a dict with ``status`` ("error"), ``error`` (the
        exception message) and ``data`` (None); no exception is re-raised.
    """

    print(f"Processing resume: {file_path}")
    print("=" * 50)

    try:
        file_type = identify_file_type(file_path)
        text = parse_file(file_path, file_type)

        # Tokenize the full text once and reuse the count for both the
        # progress report and the returned metadata.
        token_count = count_tokens(text)

        print("File parsed successfully!")
        print(f"   - File type: {file_type}")
        print(f"   - Text length: {len(text)} characters")
        print(f"   - Token count: {token_count}")

        structured_data = convert_text_to_json(text, RESUME_SCHEMA)

        print("\nConversion completed!")

        print("\n🔍 Step 3: Basic validation...")
        required_sections = ["basics", "work", "education", "skills", "projects"]

        # A section counts as present only if it exists and is non-empty.
        validation_results = {
            section: "✅ Present" if structured_data.get(section) else "⚠️ Missing or empty"
            for section in required_sections
        }

        print("Validation Results:")
        for section, status in validation_results.items():
            print(f"   - {section}: {status}")

        return {
            "status": "success",
            "data": structured_data,
            "validation": validation_results,
            "metadata": {
                "file_type": file_type,
                "original_length": len(text),
                "token_count": token_count,
                "processing_timestamp": datetime.now().isoformat()
            }
        }

    except Exception as e:
        # Deliberate catch-all: callers branch on result["status"] instead of
        # handling exceptions themselves.
        print(f"❌ Error processing file: {str(e)}")
        return {
            "status": "error",
            "error": str(e),
            "data": None
        }


In [46]:

# Resume to process. Set the RESUME_FILE_PATH environment variable to run the
# notebook on another file (e.g. "data\\deedy-resume-reversed.pdf") without
# editing this cell; the original hardcoded location remains the default.
resume_file_path = os.environ.get(
    "RESUME_FILE_PATH",
    "C:\\Users\\Admin\\OneDrive\\Desktop\\JankiGabaniResume_.pdf",
)

result = process_resume_file(resume_file_path)

if result["status"] == "success":
    print("\nSUCCESS! Resume processed successfully!")
    print("\n EXTRACTED DATA:")
    print("=" * 50)

    # ensure_ascii=False keeps non-ASCII characters readable in the printout.
    # NOTE: this prints personal data from the resume — clear the cell output
    # before sharing the notebook.
    structured_json = json.dumps(result["data"], indent=2, ensure_ascii=False)
    print(structured_json)

    metadata = result["metadata"]
    print("\nMETADATA:")
    print(f"   - File type: {metadata['file_type']}")
    print(f"   - Original length: {metadata['original_length']} chars")
    print(f"   - Token count: {metadata['token_count']}")
    # The value printed here is the timestamp of the run, not a duration.
    print(f"   - Processing time: {metadata['processing_timestamp']}")

else:
    print("\nFAILED to process resume:")
    print(f"   Error: {result['error']}")


Processing resume: C:\Users\Admin\OneDrive\Desktop\JankiGabaniResume_.pdf
File parsed successfully!
   - File type: pdf
   - Text length: 4587 characters
   - Token count: 996
Processing with complex complexity...
Input text tokens: 996
Processing as single chunk

Processing chunk 1/1...
Found 11 tool calls
Extracted BasicInfo: 119 characters
Extracted Skills: 473 characters
Extracted Skills: 186 characters
Extracted WorkExperience: 1049 characters
Extracted WorkExperience: 729 characters
Extracted WorkExperience: 497 characters
Extracted Education: 169 characters
Extracted Education: 138 characters
Extracted Education: 143 characters
Extracted Projects: 348 characters
Extracted Projects: 474 characters

Conversion completed!

🔍 Step 3: Basic validation...
Validation Results:
   - basics: ✅ Present
   - work: ✅ Present
   - education: ✅ Present
   - skills: ✅ Present
   - projects: ✅ Present

SUCCESS! Resume processed successfully!

 EXTRACTED DATA:
{
  "basics": {
    "email": "jankig

In [47]:
# Persist the extraction: first the structured data alone, then the complete
# result dict (data, validation and metadata). Nothing is written on failure.
if result["status"] != "success":
    print("No results to save due to processing error")
else:
    output_filename = "extracted_resume.json"
    with open(output_filename, 'w', encoding='utf-8') as data_file:
        json.dump(result["data"], data_file, indent=2, ensure_ascii=False)

    print(f"Results saved to: {output_filename}")

    metadata_filename = "extraction_metadata.json"
    with open(metadata_filename, 'w', encoding='utf-8') as full_file:
        json.dump(result, full_file, indent=2, ensure_ascii=False)

    print(f"Full results with metadata saved to: {metadata_filename}")

Results saved to: extracted_resume.json
Full results with metadata saved to: extraction_metadata.json
