In [9]:
import zipfile
import os

zip_path = r"C:\Users\Ganesh\Downloads\Dataset.zip"
extract_path = r"C:\Users\Ganesh\Downloads\Dataset_Extracted"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)


In [10]:
root_folder = r"C:\Users\Ganesh\Downloads\Dataset_Extracted"

print("Subfolders:")
for f in os.listdir(root_folder):
    full_path = os.path.join(root_folder, f)
    if os.path.isdir(full_path):
        print(f"'{f}'")


Subfolders:
'Dataset'


In [11]:
all_pdfs = glob.glob(os.path.join(root_folder, "**", "*.pdf"), recursive=True)
print(f"Found {len(all_pdfs)} PDFs:")
for path in all_pdfs:
    print(path)



Found 200 PDFs:
C:\Users\Ganesh\Downloads\Dataset_Extracted\Dataset\[Usecase 5] AI-Powered Job Application Screening System​\CVs1\C1061.pdf
C:\Users\Ganesh\Downloads\Dataset_Extracted\Dataset\[Usecase 5] AI-Powered Job Application Screening System​\CVs1\C1070.pdf
C:\Users\Ganesh\Downloads\Dataset_Extracted\Dataset\[Usecase 5] AI-Powered Job Application Screening System​\CVs1\C1080.pdf
C:\Users\Ganesh\Downloads\Dataset_Extracted\Dataset\[Usecase 5] AI-Powered Job Application Screening System​\CVs1\C1161.pdf
C:\Users\Ganesh\Downloads\Dataset_Extracted\Dataset\[Usecase 5] AI-Powered Job Application Screening System​\CVs1\C1164.pdf
C:\Users\Ganesh\Downloads\Dataset_Extracted\Dataset\[Usecase 5] AI-Powered Job Application Screening System​\CVs1\C1191.pdf
C:\Users\Ganesh\Downloads\Dataset_Extracted\Dataset\[Usecase 5] AI-Powered Job Application Screening System​\CVs1\C1212.pdf
C:\Users\Ganesh\Downloads\Dataset_Extracted\Dataset\[Usecase 5] AI-Powered Job Application Screening System​\CVs1\C1

In [12]:
# Run this in a Jupyter Notebook cell
!pip install langchain langchain-community langchain-openai




In [13]:
from pydantic import BaseModel, Field
from typing import List, Optional
# Education Section
class Education(BaseModel):
    university_name: str = Field(..., description='Name of the university')
    degree: str = Field(..., description='Degree Obtained')
    gpa: Optional[float] = Field(None, ge=0, le=10.0, description='GPA')
        

In [14]:
# Experience Section
class Experience(BaseModel):
    company_name: Optional[str] = Field(..., description='Name of the company')
    n_years: Optional[int] = Field(..., ge=0, description='Years of experience in the company')
    project_name: Optional[str] = Field(..., description='Main project name')
    project_description: Optional[str] = Field(..., description='Project description & role')
    tech_stack: Optional[str] = Field(..., description='Technologies/tools used')

In [15]:
# Resume Schema
class Resume(BaseModel):
    name: str = Field(..., description='Candidate full name')
    age: Optional[int] = Field(None, ge=0, description='Age of the candidate')
    email: str = Field(..., description='Email address')
    phone_number: str = Field(..., description='Phone number')
    experience: Optional[List[Experience]] = Field(..., description='Work experience')
    education: Optional[List[Education]] = Field(..., description='Education background')
    languages: Optional[str] = Field(..., description='Languages known')


In [16]:
from langchain.prompts import PromptTemplate

resume_template = """
You are an AI assistant tasked with extracting structured information from a technical resume.

Only Extract the information that's present in the Resume class.

Resume Context:
{resume_text}
"""

prompt_template = PromptTemplate(
    template=resume_template,
    input_variables=['resume_text']
)

In [17]:
import os

# Set your API key securely (DO NOT share it publicly)
os.environ["OPENAI_API_KEY"] = "-your api secret key-"

# Now initialize your model
from langchain.chat_models import init_chat_model

model = init_chat_model(
    model='gpt-4o-mini',
    model_provider='openai'
).with_structured_output(Resume, method="function_calling")


In [25]:
!pip install pypdf

from langchain_community.document_loaders import PyPDFLoader

pdf_path = r"C:\Users\Ganesh\Downloads\Dataset_Extracted\Dataset\[Usecase 5] AI-Powered Job Application Screening System​\CVs1\C1161.pdf"

loader = PyPDFLoader(pdf_path)
docs = loader.load()
resume_text = "\n".join([doc.page_content for doc in docs])

print(resume_text[:500])  # Optional: print first 500 characters for inspection


Candidate Resume (ID: C1161)
Name: Richard Molina
Email: richardmolina72@gmail.com
Phone: +1-908-7110
Education
Bachelor of Engineering in Information Technology (2014-2018)
Concentrated on database management, networking, and cybersecurity.
Master of Business Administration (2017-2019)
Focused on Business Strategy, Financial Analysis, and Operations Management.
Bachelor of Science in Computer Science (2015-2019)
Specialized in Software Development and AI, with projects in deep learning and clou


In [26]:
pip install langchain langchain-community langchain-groq pypdf


Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
import glob
import zipfile
from typing import List, Optional
from pydantic import BaseModel, Field
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from langchain_groq import ChatGroq
from langchain_community.document_loaders import PyPDFLoader

# ----------- Step 1: Extract ZIP File -----------
zip_path = r"C:\Users\Ganesh\Downloads\Dataset.zip"
extract_path = r"C:\Users\Ganesh\Downloads\Dataset_Extracted"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

root_folder = extract_path
print("Subfolders:")
for f in os.listdir(root_folder):
    full_path = os.path.join(root_folder, f)
    if os.path.isdir(full_path):
        print(f"'{f}'")

# ----------- Step 2: Enhanced Resume Schema -----------
class Education(BaseModel):
    university_name: str
    degree: str
    gpa: Optional[float] = Field(None, ge=0, le=10.0)

class Experience(BaseModel):
    company_name: Optional[str]
    n_years: Optional[int] = Field(None, ge=0)
    project_name: Optional[str]
    project_description: Optional[str]
    tech_stack: Optional[str]

class Resume(BaseModel):
    name: str
    age: Optional[int] = Field(None, ge=0)
    email: str
    phone_number: str
    experience: Optional[List[Experience]]
    education: Optional[List[Education]]
    languages: Optional[str]
    skills: Optional[str]
    address: Optional[str]
    linkedin: Optional[str]

# ----------- Step 3: Groq API Key -----------
os.environ["GROQ_API_KEY"] = "your api secret key"

# ----------- Step 4: Initialize Model via Groq -----------
llm = ChatGroq(
    temperature=0,
    model_name="llama3-70b-8192"

)

parser = PydanticOutputParser(pydantic_object=Resume)

prompt = PromptTemplate(
    template="""
You are an AI assistant extracting structured data from a resume.

Only extract information that is present in the resume and relevant to the following schema:
{format_instructions}

Resume Content:
{resume_text}
""",
    input_variables=["resume_text"],
    partial_variables={"format_instructions": parser.get_format_instructions()}
)

# ----------- Step 5: Load All PDFs and Parse -----------

pdf_paths = glob.glob(os.path.join(root_folder, "**", "*.pdf"), recursive=True)
print(f"Found {len(pdf_paths)} PDFs.")

for pdf_path in pdf_paths:
    print(f"\nProcessing: {os.path.basename(pdf_path)}")
    try:
        loader = PyPDFLoader(pdf_path)
        docs = loader.load()
        resume_text = "\n".join([doc.page_content for doc in docs])

        formatted_prompt = prompt.format_prompt(resume_text=resume_text)
        result = llm.invoke(formatted_prompt.to_string())
        parsed = parser.invoke(result)

        print(parsed.json(indent=2))
    except Exception as e:
        print(f"Failed to process {pdf_path}: {e}")


Subfolders:
'Dataset'
Found 200 PDFs.

Processing: C1061.pdf
Failed to process C:\Users\Ganesh\Downloads\Dataset_Extracted\Dataset\[Usecase 5] AI-Powered Job Application Screening System​\CVs1\C1061.pdf: Failed to parse Resume from completion {"name": "Alyssa Chavez", "email": "alyssachavez88@gmail.com", "phone_number": "+1-465-3587", "experience": [{"company_name": "ABC Inc.", "n_years": 4, "project_name": null, "project_description": "Built predictive models that enhanced decision-making processes, reducing operational costs by 25%.", "tech_stack": null}], "education": [{"university_name": null, "degree": "Diploma in Software Engineering", "gpa": null}], "languages": null, "skills": "Cybersecurity", "address": null, "linkedin": null}. Got: 1 validation error for Resume
education.0.university_name
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.10/v/string_type
For troubleshootin

In [None]:
job_description = """
We are hiring a Data Analyst with 2+ years experience in SQL, Python, and Power BI.
Familiarity with data cleaning, exploratory analysis, and dashboarding is a must.
Experience in financial or e-commerce domains preferred.
"""


In [2]:
pip install langchain langchain-community langchain-groq pypdf


Note: you may need to restart the kernel to use updated packages.


In [5]:
import os
os.environ["GROQ_API_KEY"] = "your api secret key"

In [6]:
from pydantic import BaseModel, Field
from typing import List, Optional

class Education(BaseModel):
    university_name: str
    degree: str
    gpa: Optional[float] = Field(None, ge=0, le=10.0)

class Experience(BaseModel):
    company_name: Optional[str]
    n_years: Optional[int]
    project_name: Optional[str]
    project_description: Optional[str]
    tech_stack: Optional[str]

class Resume(BaseModel):
    name: str
    age: Optional[int]
    email: str
    phone_number: str
    experience: Optional[List[Experience]]
    education: Optional[List[Education]]
    languages: Optional[str]
    skills: Optional[str]
    address: Optional[str]
    linkedin: Optional[str]


In [2]:
import os
import json
from typing import List, Optional
from pydantic import BaseModel, Field
from dotenv import load_dotenv

# Setup GROQ API Key (either via .env or hardcoded for testing only)
os.environ["GROQ_API_KEY"] = "your api secret key"

# Define schema
class Education(BaseModel):
    university_name: Optional[str] = None
    degree: Optional[str] = None
    gpa: Optional[float] = Field(None, ge=0, le=10.0)

class Experience(BaseModel):
    company_name: Optional[str]
    n_years: Optional[int]
    project_name: Optional[str]
    project_description: Optional[str]
    tech_stack: Optional[str]

class Resume(BaseModel):
    name: str
    age: Optional[int]
    email: str
    phone_number: str
    experience: Optional[List[Experience]]
    education: Optional[List[Education]]
    languages: Optional[str]
    skills: Optional[str]
    address: Optional[str]
    linkedin: Optional[str]

# LLM setup
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from langchain_community.document_loaders import PyPDFLoader

llm = ChatGroq(
    model_name="llama3-70b-8192",
    temperature=0
)

# Output parser
parser = PydanticOutputParser(pydantic_object=Resume)

# Prompt template
prompt = PromptTemplate(
    template="""
You are an AI assistant that extracts structured data from resumes.

Only return fields based on the schema:
{format_instructions}

⚠️ Ensure all important fields like university name and degree are not null. If not available, return "Unknown".

Respond ONLY with a valid JSON object without explanation, markdown, or commentary.

Resume Text:
{resume_text}
""",
    input_variables=["resume_text"],
    partial_variables={"format_instructions": parser.get_format_instructions()}
)

# Load PDF
pdf_path = r"C:\Users\Ganesh\Downloads\Dataset_Extracted\Dataset\[Usecase 5] AI-Powered Job Application Screening System​\CVs1\C1161.pdf"
loader = PyPDFLoader(pdf_path)
docs = loader.load()
resume_text = "\n".join([doc.page_content for doc in docs])

# Format and invoke prompt
formatted_prompt = prompt.format_prompt(resume_text=resume_text)
response = llm.invoke(formatted_prompt.to_string())

# Extract clean JSON from response
response_text = response.content if hasattr(response, 'content') else str(response)

# Remove code fences and extract valid JSON
def extract_json_from_response(text: str) -> str:
    if "```" in text:
        parts = text.split("```")
        for part in parts:
            try:
                return json.loads(part.strip())
            except json.JSONDecodeError:
                continue
        raise ValueError("No valid JSON block found in code fences.")
    else:
        return json.loads(text.strip())

try:
    raw_json = extract_json_from_response(response_text)
except Exception as e:
    print("❌ Could not parse JSON from LLM response. Raw response:\n", response_text)
    raise e

# Parse into Pydantic model
parsed_resume = Resume(**raw_json)

# Pretty print output
print(parsed_resume.model_dump_json(indent=2))



{
  "name": "Richard Molina",
  "age": null,
  "email": "richardmolina72@gmail.com",
  "phone_number": "+1-908-7110",
  "experience": [
    {
      "company_name": "DEF Ltd.",
      "n_years": 4,
      "project_name": "Unknown",
      "project_description": "Led cross-functional teams to develop innovative solutions, increasing product adoption by 40%.",
      "tech_stack": "Unknown"
    },
    {
      "company_name": "XYZ Corp",
      "n_years": 4,
      "project_name": "Unknown",
      "project_description": "Developed scalable backend applications, improved system efficiency by 30%, and led agile development sprints.",
      "tech_stack": "Unknown"
    }
  ],
  "education": [
    {
      "university_name": "Unknown",
      "degree": "Bachelor of Engineering in Information Technology",
      "gpa": null
    },
    {
      "university_name": "Unknown",
      "degree": "Master of Business Administration",
      "gpa": null
    },
    {
      "university_name": "Unknown",
      "degree"

In [24]:
!pip install sentence-transformers scikit-learn






In [1]:
pip install --force-reinstall sentence-transformers


Collecting sentence-transformers
  Using cached sentence_transformers-5.0.0-py3-none-any.whl (470 kB)
Collecting torch>=1.11.0
  Using cached torch-2.7.1-cp39-cp39-win_amd64.whl (216.0 MB)
Collecting huggingface-hub>=0.20.0
  Using cached huggingface_hub-0.33.2-py3-none-any.whl (515 kB)
Collecting scipy
  Using cached scipy-1.13.1-cp39-cp39-win_amd64.whl (46.2 MB)
Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Collecting Pillow
  Using cached pillow-11.3.0-cp39-cp39-win_amd64.whl (7.0 MB)
Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp39-cp39-win_amd64.whl (11.2 MB)
Collecting transformers<5.0.0,>=4.41.0
  Using cached transformers-4.53.1-py3-none-any.whl (10.8 MB)
Collecting typing_extensions>=4.5.0
  Using cached typing_extensions-4.14.1-py3-none-any.whl (43 kB)
Collecting filelock
  Using cached filelock-3.18.0-py3-none-any.whl (16 kB)
Collecting pyyaml>=5.1
  Using cached PyYAML-6.0.2-cp39-cp39-win_amd64.whl (162 kB)
Collecting packaging>=20.9
  Us

ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\Ganesh\\anaconda3\\Lib\\site-packages\\~.mpy.libs\\libscipy_openblas64_-caad452230ae4ddb57899b8b3a33c55c.dll'
Consider using the `--user` option or check the permissions.



In [1]:
import os
import json
from typing import List, Optional
from pydantic import BaseModel, Field
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from langchain_community.document_loaders import PyPDFLoader
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# ---- SETUP ----
os.environ["GROQ_API_KEY"] = "your api secret key"

# ---- SCHEMA ----
class Education(BaseModel):
    university_name: Optional[str] = None
    degree: Optional[str] = None
    gpa: Optional[float] = Field(None, ge=0, le=10.0)

class Experience(BaseModel):
    company_name: Optional[str]
    n_years: Optional[int]
    project_name: Optional[str]
    project_description: Optional[str]
    tech_stack: Optional[str]

class Resume(BaseModel):
    name: str
    age: Optional[int]
    email: str
    phone_number: str
    experience: Optional[List[Experience]]
    education: Optional[List[Education]]
    languages: Optional[str]
    skills: Optional[str]
    address: Optional[str]
    linkedin: Optional[str]

# ---- LLM SETUP ----
llm = ChatGroq(model_name="llama3-70b-8192", temperature=0)
parser = PydanticOutputParser(pydantic_object=Resume)

prompt = PromptTemplate(
    template="""
You are an AI assistant that extracts structured data from resumes.

Only return fields based on the schema:
{format_instructions}

⚠️ Ensure all important fields like university name and degree are not null. If not available, return "Unknown".

Respond ONLY with a valid JSON object without explanation, markdown, or commentary.

Resume Text:
{resume_text}
""",
    input_variables=["resume_text"],
    partial_variables={"format_instructions": parser.get_format_instructions()}
)

# ---- LOAD RESUME ----
pdf_path = r"C:\Users\Ganesh\Downloads\Dataset_Extracted\Dataset\[Usecase 5] AI-Powered Job Application Screening System​\CVs1\C1161.pdf"
loader = PyPDFLoader(pdf_path)
docs = loader.load()
resume_text = "\n".join([doc.page_content for doc in docs])

# ---- GET RESPONSE ----
formatted_prompt = prompt.format_prompt(resume_text=resume_text)
response = llm.invoke(formatted_prompt.to_string())
response_text = response.content if hasattr(response, 'content') else str(response)

# ---- CLEAN JSON FROM RESPONSE ----
def extract_json_from_response(text: str) -> str:
    if "```" in text:
        parts = text.split("```")
        for part in parts:
            try:
                return json.loads(part.strip())
            except json.JSONDecodeError:
                continue
        raise ValueError("No valid JSON block found in code fences.")
    else:
        return json.loads(text.strip())

try:
    raw_json = extract_json_from_response(response_text)
except Exception as e:
    print("❌ Could not parse JSON from LLM response.\n", response_text)
    raise e

parsed_resume = Resume(**raw_json)

# ---- PRINT STRUCTURED RESUME ----
print("📄 Structured Resume:")
print(parsed_resume.model_dump_json(indent=2))

# ============================
# ✅ JOB MATCHING USING EMBEDDINGS
# ============================
# 1. Define Job Description
job_description = """
We are hiring a Python developer experienced in backend systems, data processing, and machine learning.
Preferred experience with Flask, REST APIs, and working with PostgreSQL or MongoDB.
"""

# 2. Prepare Resume Summary Text (combine fields)
resume_summary = resume_text  # Or use extracted fields like skills + experience

# 3. Load Embedding Model
embedder = SentenceTransformer('all-MiniLM-L6-v2')  # lightweight and effective

# 4. Compute Embeddings
resume_emb = embedder.encode([resume_summary])
job_emb = embedder.encode([job_description])

# 5. Compute Similarity
similarity_score = cosine_similarity(resume_emb, job_emb)[0][0]

# 6.Print Results
print("\n📊 Job Match Similarity Score:", round(similarity_score, 3))
if similarity_score > 0.8:
    print("✅ Excellent Match")
elif similarity_score > 0.6:
    print("👍 Good Match")
else:
    print("⚠️ Weak Match - Consider different job or resume updates.")



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "C:\Users\Ganesh\anaconda3\lib\runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\Ganesh\anaconda3\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "C:\Users\Ganesh\anaconda3\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\Ganesh\anaconda3\lib\site-packages\traitlets\config\application.py", line 846, in launch_instance
    app.start()
  File "C:\Users\Ganesh\anaconda3\lib\site

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "C:\Users\Ganesh\anaconda3\lib\runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\Ganesh\anaconda3\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "C:\Users\Ganesh\anaconda3\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\Ganesh\anaconda3\lib\site-packages\traitlets\config\application.py", line 846, in launch_instance
    app.start()
  File "C:\Users\Ganesh\anaconda3\lib\site

AttributeError: _ARRAY_API not found

📄 Structured Resume:
{
  "name": "Richard Molina",
  "age": null,
  "email": "richardmolina72@gmail.com",
  "phone_number": "+1-908-7110",
  "experience": [
    {
      "company_name": "DEF Ltd.",
      "n_years": 4,
      "project_name": "Unknown",
      "project_description": "Led cross-functional teams to develop innovative solutions, increasing product adoption by 40%.",
      "tech_stack": "Unknown"
    },
    {
      "company_name": "XYZ Corp",
      "n_years": 4,
      "project_name": "Unknown",
      "project_description": "Developed scalable backend applications, improved system efficiency by 30%, and led agile development sprints.",
      "tech_stack": "Unknown"
    }
  ],
  "education": [
    {
      "university_name": "Unknown",
      "degree": "Bachelor of Engineering in Information Technology",
      "gpa": null
    },
    {
      "university_name": "Unknown",
      "degree": "Master of Business Administration",
      "gpa": null
    },
    {
      "university_name": "Unk

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


📊 Job Match Similarity Score: 0.437
⚠️ Weak Match - Consider different job or resume updates.
