<a href="https://colab.research.google.com/github/jagrit-goyal/Deep-Learning/blob/main/Job_Resume_Matcher.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install -q pymupdf
!pip install -q docx2txt

In [4]:
import pymupdf  # PyMuPDF for PDF
import docx2txt  # for DOCX
import os

def extract_text_from_pdf(file_path):
    """Extract text from PDF using PyMuPDF"""
    text = ""
    pdf_doc = pymupdf.open(file_path)
    for page in pdf_doc:
        text += page.get_text("text")
    pdf_doc.close()
    return text

def extract_text_from_docx(file_path):
    """Extract text from DOCX using docx2txt"""
    return docx2txt.process(file_path)

def extract_resume_text(file_path):
    """Auto-detect file type and extract text"""
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".pdf":
        return extract_text_from_pdf(file_path)
    elif ext == ".docx":
        return extract_text_from_docx(file_path)
    else:
        raise ValueError("Unsupported file format. Use PDF or DOCX.")

if __name__ == "__main__":
    # Example usage
    resume_file = "/content/resume.pdf"
    text = extract_resume_text(resume_file)
    print("Extracted Resume Text:\n")
    print(text[:1000])  # print first 1000 chars

Extracted Resume Text:

Jagrit Goyal
+91-7814768600 | jagritgoyal31@gmail.com | linkedin.com/in/jagrit31/ | github.com/jagrit-goyal
Education
Thapar Institute of Engineering and Technology
Patiala, Punjab
B.E. in Computer Science (Minor: Data Science) — CGPA: 8.43
Sep 2022 – May 2026
Experience
ELC — PCB Component Detection and Layout Automation
Jun 2025 – Aug 2025
• Led the end-to-end development of a YOLO-based deep learning model, achieving 93% accuracy on
high-complexity PCB images.
• Implemented OCR to extract PCB component labels, automating data extraction for 92% of cases.
• Collaborated with a 5-member team and streamlined the PCB layout generation process by integrating object
detection and OCR into a unified pipeline, resulting in a 45% reduction in processing time.
Projects
ELC Connect – Smart Room Booking System | Next.js, Mongo DB, REST APIs, JWT Authentication
• Engineered a full-stack web application integrating room booking, inventory, attendance, and security modules,

In [5]:
!pip install -q spacy



In [6]:
SKILLS = [
    # Programming Languages
    "python", "java", "c", "c++", "c#", "go", "r", "scala", "kotlin",
    "swift", "javascript", "typescript", "html", "css", "php", "ruby",
    "shell scripting", "bash", "matlab",

    # Data Science & Machine Learning
    "machine learning", "deep learning", "natural language processing", "nlp",
    "computer vision", "data analysis", "data visualization", "feature engineering",
    "model deployment", "model evaluation", "predictive modeling", "statistics",
    "tensorflow", "keras", "pytorch", "scikit-learn", "xgboost", "lightgbm",
    "pandas", "numpy", "matplotlib", "seaborn", "plotly", "openai", "huggingface",
    "transformers", "bert", "llm", "gensim", "spacy",

    # Databases
    "sql", "mysql", "postgresql", "mongodb", "oracle", "sqlite",
    "redis", "cassandra", "elasticsearch", "firebase",

    # Web Development
    "react", "react.js", "next.js", "node.js", "express.js", "angular",
    "vue.js", "django", "flask", "fastapi", "spring boot", "laravel", "bootstrap",
    "api development", "rest api", "graphql",

    # DevOps & Cloud
    "docker", "kubernetes", "aws", "azure", "gcp", "google cloud", "jenkins",
    "git", "github", "gitlab", "bitbucket", "ci/cd", "terraform", "ansible",
    "linux", "nginx",

    # Data Engineering & Big Data
    "hadoop", "spark", "pyspark", "airflow", "kafka", "snowflake", "databricks",
    "etl", "data pipeline", "big data", "data warehouse",

    # Tools & Other Tech
    "excel", "tableau", "power bi", "jupyter", "colab", "visual studio code",
    "jira", "notion", "microsoft office", "slack", "figma", "canva",

    # Soft Skills / General
    "communication", "teamwork", "leadership", "problem solving",
    "project management", "time management", "critical thinking"
]

In [7]:
# skill_extractor.py

import spacy
import re

# Load spaCy pre_trained model
nlp = spacy.load("en_core_web_sm") # small, general-purpose English model

def extract_entities(text):
    """Extract entities like education, organizations, names, dates"""
    doc = nlp(text)
    entities = {"ORG": [], "PERSON": [], "DATE": [], "GPE": []}
    for ent in doc.ents:
        if ent.label_ in entities:
            entities[ent.label_].append(ent.text)
    return entities

def extract_skills(text):
    """Extract skills by simple dictionary matching"""
    text_lower = text.lower()
    found_skills = []
    for skill in SKILLS:
        # use regex for whole-word matching
        if re.search(r"\b" + re.escape(skill.lower()) + r"\b", text_lower):
            found_skills.append(skill)
    return list(set(found_skills))  # remove duplicates

if __name__ == "__main__":
    # Extract skills & entities
    skills = extract_skills(text)
    entities = extract_entities(text)

    print("Extracted Skills:", skills)
    print("Extracted Entities:", entities)


Extracted Skills: ['next.js', 'python', 'c', 'git', 'data analysis', 'react.js', 'tensorflow', 'sql', 'feature engineering', 'seaborn', 'matplotlib', 'express.js', 'css', 'deep learning', 'keras', 'machine learning', 'node.js', 'leadership', 'r', 'github', 'numpy', 'html', 'javascript', 'data visualization', 'scikit-learn', 'react', 'mongodb', 'pandas']
Extracted Entities: {'ORG': ['Thapar Institute of Engineering and Technology\nPatiala', 'PCB Component Detection', 'YOLO', 'PCB', '• Implemented OCR', 'PCB', 'PCB', 'OCR', 'JWT Authentication\n• Engineered', '• Implemented', '• Built', 'BiteXpress', 'Express.js', 'MERN', 'Thapar University', 'Express.js', 'Linear Regression', 'Random Forest', 'Extra Curricular\n', 'TCS', '• Crafted', 'Technical Skills\nProgramming: Python', 'C/C++', 'SQL', 'SQL', 'JavaScript', 'HTML/CSS\nLibraries', 'TensorFlow', 'Keras\nTools & Frameworks: React.js', 'MERN Stack', 'DBMS\nAchievements & Certifications\n• Awarded', '• Mastered', 'Cisco Data Analytics Ess

In [8]:
!pip install -q sentence-transformers

In [9]:
# paste job descriptio here :-
# Example Job Description
job_description = """
Responsibilities:
Design, develop, and deploy end-to-end web applications using the MERN stack (MongoDB, Express.js, React.js, Node.js).
Collaborate with cross-functional teams to integrate APIs, authentication systems (JWT), and analytics dashboards.
Work on data preprocessing, feature engineering, and predictive modeling using Python and Scikit-learn.
Contribute to AI-powered automation projects, such as object detection and OCR pipelines using TensorFlow/Keras.
Optimize backend APIs and frontend performance for scalability, maintainability, and responsiveness.
Leverage version control (Git/GitHub) for collaborative development and CI/CD pipelines.
Translate business or research requirements into clean, maintainable, and efficient code.

Required Skills:
Programming: Python, C/C++, JavaScript, SQL
Web Development: React.js, Node.js, Express.js, MongoDB
Machine Learning: Pandas, NumPy, Scikit-learn, TensorFlow, Keras
Data Visualization: Matplotlib, Seaborn, Power BI
Core CS Fundamentals: DSA, OOP, OS, CN, DBMS
Tools: Git, VS Code, Postman, Jupyter
"""

In [10]:
from sentence_transformers import SentenceTransformer, util

# Load BERT model (MiniLM is lightweight & fast)
model = SentenceTransformer('all-MiniLM-L6-v2')

def bert_resume_match(text, job_description):
    # Create embeddings
    embeddings = model.encode([text, job_description], convert_to_tensor=True)

    # Cosine similarity
    similarity = util.cos_sim(embeddings[0], embeddings[1])
    return float(similarity)



# Resume text = your extracted text
score = bert_resume_match(text, job_description)
print(f"BERT Resume–Job Match Score: {score:.2f}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

BERT Resume–Job Match Score: 0.42


In [11]:
#getting complete report

def get_resume_match_report(resume_text, job_description):
    # Extract skills from both
    resume_skills = extract_skills(resume_text)
    jd_skills = extract_skills(job_description)

    # Find matched and missing skills
    matched_skills = list(set(resume_skills) & set(jd_skills))
    missing_skills = list(set(jd_skills) - set(resume_skills))

    # Compute semantic similarity
    bert_score = bert_resume_match(resume_text, job_description)

    # Compute skill match ratio
    skill_score = len(matched_skills) / len(jd_skills) if jd_skills else 0

    # Weighted final score (70% BERT + 30% Skill overlap)
    final_score = round((0.7 * bert_score) + (0.3 * skill_score), 2)

    # Return complete report
    return {
        "BERT_Similarity": round(bert_score, 2),
        "Skill_Match_Ratio": round(skill_score, 2),
        "Final_Weighted_Score": final_score,
        "Matched_Skills": matched_skills,
        "Missing_Skills": missing_skills
    }

  # Generate report
report = get_resume_match_report(text, job_description)

# Display results
print("Resume–Job Match Report:")
for k, v in report.items():
    print(f"{k}: {v}")

Resume–Job Match Report:
BERT_Similarity: 0.42
Skill_Match_Ratio: 0.84
Final_Weighted_Score: 0.54
Matched_Skills: ['python', 'c', 'git', 'react.js', 'tensorflow', 'sql', 'feature engineering', 'seaborn', 'matplotlib', 'express.js', 'keras', 'machine learning', 'node.js', 'github', 'numpy', 'javascript', 'data visualization', 'scikit-learn', 'react', 'mongodb', 'pandas']
Missing_Skills: ['jupyter', 'predictive modeling', 'ci/cd', 'power bi']
