In [None]:
# Resume Ranking Application

This notebook explains the components of a Resume Ranking application that automates the process of screening and ranking candidate resumes against job descriptions.

## Overview

This application:
1. Accepts a job description and multiple resume files (PDF format)
2. Extracts text from the resume documents
3. Validates both the job description and resumes
4. Ranks resumes based on their similarity to the job description
5. Analyzes key skills and qualifications
6. Provides an enhanced ranking based on semantic similarity of sentence-transformer embeddings
7. Visualizes results through a Streamlit web interface

The application streamlines the initial resume screening process, helping recruiters identify the most promising candidates.

In [None]:
# Install required packages
import sys
import subprocess
import importlib


def check_and_install_packages():
    """
    Check if required packages are installed, and install them if not.

    Each requirement is described by its pip distribution name, the module
    name used to import it, and a short purpose string. The two names differ
    for several packages (e.g. ``scikit-learn`` is imported as ``sklearn``),
    so the import check must use the module name while ``pip install`` must
    use the distribution name.

    After the packages are handled, the NLTK resources ``punkt``,
    ``stopwords`` and ``wordnet`` are downloaded and the spaCy model
    ``en_core_web_sm`` is downloaded if it cannot be loaded.

    Raises:
    ------
    subprocess.CalledProcessError
        If ``pip install`` fails for one of the required packages.
        Failures while setting up the spaCy model are reported with a
        warning instead of being raised (best-effort step).
    """
    # pip distribution name -> (importable module name, purpose)
    required_packages = {
        'streamlit': ('streamlit', 'For creating the web application interface'),
        'pandas': ('pandas', 'For data manipulation and analysis'),
        'PyPDF2': ('PyPDF2', 'For extracting text from PDF files'),
        'scikit-learn': ('sklearn', 'For machine learning utilities including TF-IDF'),
        'nltk': ('nltk', 'For natural language processing tasks'),
        'matplotlib': ('matplotlib', 'For data visualization'),
        'plotly': ('plotly', 'For interactive visualizations'),
        'spacy': ('spacy', 'For advanced NLP processing'),
        'textract': ('textract', 'For extracting text from various document formats'),
        'python-docx': ('docx', 'For handling docx files'),
        'docx2txt': ('docx2txt', 'For converting docx to text'),
        'PyMuPDF': ('fitz', 'Alternative PDF processing library'),
        'gensim': ('gensim', 'For topic modeling and document similarity'),
        'sentence-transformers': ('sentence_transformers', 'For embeddings and semantic search')
    }

    for package, (module_name, purpose) in required_packages.items():
        try:
            importlib.import_module(module_name)
            print(f"✓ {package} is already installed: {purpose}")
        except ImportError:
            print(f"Installing {package}: {purpose}")
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
            # Let the running interpreter see the freshly installed package.
            importlib.invalidate_caches()

    # Additional NLTK resources
    import nltk
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('wordnet')

    # Load spaCy model (best-effort: report problems instead of hiding them)
    try:
        import spacy
        try:
            spacy.load('en_core_web_sm')
            print("✓ spaCy model 'en_core_web_sm' is already installed")
        except Exception:
            print("Installing spaCy model 'en_core_web_sm'")
            subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
    except Exception as exc:
        print(f"Warning: could not set up spaCy model 'en_core_web_sm': {exc}")


# Run the dependency check before the third-party imports below, so that
# any missing package is installed before it is imported.
check_and_install_packages()

# Import required libraries
import streamlit as st
import pandas as pd
import numpy as np
import PyPDF2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import plotly.express as px
import spacy
import re
import io
import os
from sentence_transformers import SentenceTransformer

# Load the spaCy English pipeline once at module level; `nlp` is the shared
# pipeline object that rank_resumes() and analyze_key_skills() call.
nlp = spacy.load('en_core_web_sm')

# Reaching this line means every import above and the model load succeeded.
print("All necessary packages are imported and ready to use!")

In [None]:
def extract_text_from_pdf(pdf_file):
    """
    Extract text from a PDF file.

    PyPDF2 is tried first. If it yields fewer than 100 non-whitespace-trimmed
    characters, PyMuPDF (``fitz``) is tried as a fallback and whichever
    extractor produced more text wins (the two outputs are never
    concatenated, which would duplicate content).

    Parameters:
    ----------
    pdf_file : file object
        The PDF file to extract text from. Must support ``seek`` and
        ``read`` for the fallback path.

    Returns:
    -------
    str
        The extracted text from the PDF. If extraction fails and no text
        could be recovered, a string of the form
        ``"Error extracting text: <reason>"`` is returned instead of raising.
    """
    try:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # extract_text() may return None for pages without a text layer;
        # treat that as an empty page rather than failing the whole file.
        text = "".join((page.extract_text() or "") + "\n" for page in pdf_reader.pages)
    except Exception as e:
        return f"Error extracting text: {str(e)}"

    if len(text.strip()) >= 100:
        return text

    # PyPDF2 returned empty or very little text: try PyMuPDF (fitz).
    try:
        import fitz

        pdf_file.seek(0)  # PyPDF2 has already consumed the stream
        doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
        try:
            fallback_text = "".join(page.get_text() + "\n" for page in doc)
        finally:
            doc.close()
    except Exception as e:
        # Keep whatever PyPDF2 managed to extract; only report an error
        # when there is nothing at all to return.
        if text.strip():
            return text
        return f"Error extracting text: {str(e)}"

    if len(fallback_text.strip()) > len(text.strip()):
        return fallback_text
    return text

# Example usage
# with open('example_resume.pdf', 'rb') as pdf_file:
#     text = extract_text_from_pdf(pdf_file)
#     print(f"Extracted {len(text)} characters from the PDF")

In [None]:
def validate_resume(resume_text):
    """
    Decide whether a piece of extracted text plausibly is a resume.

    Three checks are applied in order: minimum length (100 characters after
    trimming), presence of at least two of the standard section keywords,
    and presence of an email address or phone number.

    Parameters:
    ----------
    resume_text : str
        The text extracted from the resume file

    Returns:
    -------
    tuple
        (is_valid, error_message) - error_message is "" when valid
    """
    trimmed_length = len(resume_text.strip()) if resume_text else 0
    if trimmed_length < 100:
        return False, "Resume text is too short or empty"

    # Count how many of the usual resume headings occur anywhere in the text
    lowered = resume_text.lower()
    section_hits = 0
    for heading in ("experience", "education", "skills"):
        if heading in lowered:
            section_hits += 1

    if section_hits < 2:
        return False, "Resume doesn't appear to have standard sections (experience, education, skills)"

    # An email address or a phone number counts as contact information
    contact_patterns = (
        r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
        r'(\+\d{1,3}\s?)?(\(\d{1,4}\)|\d{1,4})[\s.-]?\d{3}[\s.-]?\d{4}',
    )
    if any(re.search(pattern, resume_text) for pattern in contact_patterns):
        return True, ""

    return False, "No contact information (email or phone) found"

In [None]:
def validate_job_description(job_description):
    """
    Decide whether a piece of text plausibly is a job description.

    The text must be at least 100 characters long after trimming and must
    mention at least two of the typical job-posting terms.

    Parameters:
    ----------
    job_description : str
        The text of the job description

    Returns:
    -------
    tuple
        (is_valid, error_message) - error_message is "" when valid
    """
    trimmed = job_description.strip() if job_description else ""
    if len(trimmed) < 100:
        return False, "Job description is too short or empty"

    # Typical headings / terms found in job postings
    expected_terms = ("responsibilities", "qualifications", "requirements", "skills", "experience")
    lowered = job_description.lower()
    term_hits = sum(1 for term in expected_terms if term in lowered)

    if term_hits >= 2:
        return True, ""

    return False, "Job description doesn't have standard sections (responsibilities, qualifications, requirements, etc.)"

In [None]:
def _remove_stopwords_and_punctuation(text):
    """Lower-case `text`, run it through the spaCy pipeline and drop stop words and punctuation."""
    doc = nlp(text.lower())
    return " ".join(token.text for token in doc if not token.is_stop and not token.is_punct)


def rank_resumes(job_description, resume_texts):
    """
    Ranks resumes based on their similarity to the job description using TF-IDF and cosine similarity.

    Parameters:
    ----------
    job_description : str
        The job description text
    resume_texts : list
        List of texts extracted from resume files

    Returns:
    -------
    list
        List of dictionaries sorted by descending similarity. Each entry has:
        'similarity_score' (float), 'resume_text' (the original text),
        'processed_text' (text after stop-word/punctuation removal) and
        'resume_index' (position of the resume in `resume_texts`, so callers
        can map a ranked entry back to its input after sorting).
        An empty list is returned when `resume_texts` is empty.
    """
    if not resume_texts:
        return []

    # Preprocess the job description and the resumes the same way
    processed_job = _remove_stopwords_and_punctuation(job_description)
    processed_resumes = [_remove_stopwords_and_punctuation(resume) for resume in resume_texts]

    # Combine job description and resumes for vectorization
    all_documents = [processed_job] + processed_resumes

    # Calculate TF-IDF vectors
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    try:
        tfidf_matrix = vectorizer.fit_transform(all_documents)
    except ValueError:
        # Raised when no document contributes any term (empty vocabulary):
        # nothing can be compared, so every resume scores zero.
        scores = [0.0] * len(resume_texts)
    else:
        # Row 0 is the job description, the remaining rows are the resumes
        scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]

    # Create ranking results, remembering where each resume came from
    results = [
        {
            'similarity_score': float(score),
            'resume_text': resume_texts[index],
            'processed_text': processed_resumes[index],
            'resume_index': index
        }
        for index, score in enumerate(scores)
    ]

    # Sort by similarity score in descending order (stable for ties)
    results.sort(key=lambda entry: entry['similarity_score'], reverse=True)

    return results

In [None]:
def process_resumes(job_description, uploaded_files):
    """
    Process uploaded resume files and rank them against the job description.

    Parameters:
    ----------
    job_description : str
        The job description text
    uploaded_files : list
        List of uploaded resume files (file-like objects with a `name`)

    Returns:
    -------
    tuple
        (valid_job, error_message, results, valid_resumes, valid_names)
        `results` is the ranked list from rank_resumes() with a 'file_name'
        key added to every entry. `valid_resumes` and `valid_names` are
        parallel lists in upload order (not in ranked order).
    """
    # Validate job description
    is_valid_job, error_msg = validate_job_description(job_description)

    if not is_valid_job:
        return is_valid_job, error_msg, [], [], []

    valid_resumes = []
    valid_names = []

    # Process each resume file
    for file in uploaded_files:
        # Resolve the name up front so error reporting can never fail on it
        file_name = getattr(file, 'name', '<unnamed file>')
        try:
            # Extract text from PDF
            text = extract_text_from_pdf(file)

            # Validate resume
            is_valid, reason = validate_resume(text)

            if is_valid:
                valid_resumes.append(text)
                valid_names.append(file_name)
            else:
                # Report why a file was dropped instead of skipping silently
                print(f"Skipping {file_name}: {reason}")

        except Exception as e:
            print(f"Error processing {file_name}: {str(e)}")

    if not valid_resumes:
        return is_valid_job, "No valid resumes found", [], [], []

    # Rank the valid resumes
    ranked_results = rank_resumes(job_description, valid_resumes)

    # rank_resumes() returns entries sorted by score, so position i in
    # ranked_results does NOT correspond to valid_names[i]. Map every entry
    # back to its upload position through its resume text. Identical texts
    # are handed out in upload order so each file name is used exactly once.
    pending_indices = {}
    for index, text in enumerate(valid_resumes):
        pending_indices.setdefault(text, []).append(index)

    for result in ranked_results:
        original_index = pending_indices[result['resume_text']].pop(0)
        result['file_name'] = valid_names[original_index]

    return is_valid_job, "", ranked_results, valid_resumes, valid_names

In [None]:
def _skill_pattern(skill):
    """Compile a regex matching `skill` as a whole term, tolerating any whitespace between its words."""
    words = [re.escape(word) for word in skill.split()]
    # (?<!\w) / (?!\w) instead of \b so terms ending in symbols ("c++") still work
    return re.compile(r'(?<!\w)' + r'\s+'.join(words) + r'(?!\w)')


def analyze_key_skills(job_description, resume_texts):
    """
    Analyzes key skills mentioned in the job description and identifies them in resumes.

    Skill candidates come from two sources: phrases proposed by spaCy (noun
    chunks of 2-4 words and ORG/PRODUCT/WORK_OF_ART entities), which are
    filtered for length and common filler words, and a curated list of
    technical terms, which are matched as whole terms and always kept.
    Whole-term matching prevents "java" from being reported for
    "javascript" or "sql" for "mysql".

    Parameters:
    ----------
    job_description : str
        The job description text
    resume_texts : list
        List of texts extracted from resume files

    Returns:
    -------
    dict
        'identified_skills': alphabetically sorted list of skills,
        'skill_matches': one dict per resume ('resume_index',
        'matched_skills', 'match_percentage' in [0, 1], 'skill_count'),
        sorted by descending match percentage,
        'total_skills': number of identified skills
    """
    job_lower = job_description.lower()
    job_doc = nlp(job_lower)

    # Get noun chunks and named entities as potential skills
    heuristic_skills = set()
    for chunk in job_doc.noun_chunks:
        if 2 <= len(chunk.text.split()) <= 4:  # 2-4 word phrases likely to be skills
            heuristic_skills.add(chunk.text)

    for ent in job_doc.ents:
        if ent.label_ in ["ORG", "PRODUCT", "WORK_OF_ART"]:
            heuristic_skills.add(ent.text.lower())

    # Filter out very common words or short terms. This applies only to the
    # heuristic candidates; curated terms such as "sql" or "aws" are short
    # but meaningful and must not be dropped by the length filter.
    skill_stopwords = {'experience', 'year', 'work', 'team', 'use', 'using', 'with', 'and', 'the', 'our', 'we'}
    filtered_skills = {
        skill for skill in heuristic_skills
        if len(skill) > 3
        and skill.strip()
        and not any(w in skill_stopwords for w in skill.split())
    }

    # Add common technical skills by regex pattern matching
    tech_patterns = [
        r'python|java|c\+\+|javascript|react|node\.js|sql|aws|azure|docker|kubernetes',
        r'machine learning|deep learning|natural language processing|computer vision',
        r'tensorflow|pytorch|scikit-learn|pandas|numpy',
        r'agile|scrum|kanban|project management|leadership'
    ]

    for pattern in tech_patterns:
        # Without boundaries "java" would win the alternation inside
        # "javascript" and "aws" would match inside "laws".
        bounded = r'(?<!\w)(?:' + pattern + r')(?!\w)'
        for match in re.finditer(bounded, job_lower):
            filtered_skills.add(match.group(0))

    # Sorted so the reported order is reproducible between runs
    skills = sorted(filtered_skills)
    compiled_skills = [(skill, _skill_pattern(skill)) for skill in skills]

    # Check each resume for the identified skills
    resume_skill_matches = []

    for i, resume_text in enumerate(resume_texts):
        resume_lower = resume_text.lower()
        matched_skills = [skill for skill, pattern in compiled_skills if pattern.search(resume_lower)]

        match_percentage = len(matched_skills) / len(skills) if skills else 0

        resume_skill_matches.append({
            'resume_index': i,
            'matched_skills': matched_skills,
            'match_percentage': match_percentage,
            'skill_count': len(matched_skills)
        })

    # Sort by match percentage
    resume_skill_matches.sort(key=lambda x: x['match_percentage'], reverse=True)

    return {
        'identified_skills': skills,
        'skill_matches': resume_skill_matches,
        'total_skills': len(skills)
    }

In [None]:
def enhanced_resume_ranking(job_description, resume_texts):
    """
    Provides enhanced ranking of resumes using sentence transformers for semantic similarity.

    If the sentence-transformer model cannot be loaded, the function falls
    back to rank_resumes() and converts its results to the schema described
    below, so callers can treat both paths the same way.

    Parameters:
    ----------
    job_description : str
        The job description text
    resume_texts : list
        List of texts extracted from resume files

    Returns:
    -------
    list
        List of dictionaries sorted by descending score. Every entry has
        'enhanced_similarity', 'resume_text' and 'resume_index' (position of
        the resume in `resume_texts`). Entries produced by the fallback also
        keep the keys returned by rank_resumes(). An empty list is returned
        when `resume_texts` is empty.
    """
    if not resume_texts:
        # Nothing to rank; avoid loading the model for no reason
        return []

    # Load pre-trained sentence transformer model
    try:
        model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    except Exception:
        # Fallback to standard ranking if model fails to load
        print("Advanced model couldn't be loaded, falling back to standard ranking")
        fallback_results = rank_resumes(job_description, resume_texts)

        # The fallback results are sorted by score; recover each entry's
        # original position through its text. Identical texts are handed
        # out in input order so every index is used exactly once.
        pending_indices = {}
        for index, text in enumerate(resume_texts):
            pending_indices.setdefault(text, []).append(index)

        similarities = []
        for result in fallback_results:
            entry = dict(result)
            entry['enhanced_similarity'] = result['similarity_score']
            entry['resume_index'] = pending_indices[result['resume_text']].pop(0)
            similarities.append(entry)
        return similarities

    # Create embeddings for job description and resumes
    job_embedding = model.encode(job_description)
    resume_embeddings = model.encode(resume_texts)

    # One call compares the job description against every resume embedding
    scores = cosine_similarity(job_embedding.reshape(1, -1), resume_embeddings)[0]

    similarities = [
        {
            'enhanced_similarity': float(score),
            'resume_text': resume_texts[index],
            'resume_index': index
        }
        for index, score in enumerate(scores)
    ]

    # Sort by similarity score in descending order
    similarities.sort(key=lambda entry: entry['enhanced_similarity'], reverse=True)

    return similarities

In [None]:
# This is how the Streamlit UI would be implemented

import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import io


def main():
    """
    Build the Streamlit page: collect a job description and PDF resumes,
    then show the TF-IDF ranking, a bar chart of the top matches, the
    enhanced (embedding-based) ranking and the key-skills analysis.

    All work happens when the "Process Resumes" button is pressed. Missing
    inputs, an invalid job description, or the absence of any valid resume
    are reported with `st.error`.
    """
    st.title("Resume Ranking Application")
    st.write("Upload a job description and candidate resumes to rank candidates based on job requirements.")

    # Job Description Input
    job_description = st.text_area("Enter Job Description:", height=200)

    # Resume Upload
    uploaded_files = st.file_uploader("Upload Resumes (PDF files):",
                                      type="pdf",
                                      accept_multiple_files=True)

    if st.button("Process Resumes"):
        if job_description and uploaded_files:
            # Validate job description and process resumes
            is_valid_job, error_msg, processed_results, valid_resumes, valid_names = process_resumes(
                job_description, uploaded_files
            )

            if not is_valid_job:
                st.error(f"Invalid job description: {error_msg}")
            elif not processed_results:
                st.error("No valid resumes found. Please check your uploaded files.")
            else:
                # Keep the scores numeric: the chart below needs real numbers
                # for bar heights and the colour scale. Strings would be
                # treated as categories.
                results_df = pd.DataFrame(
                    [{
                        'File Name': r['file_name'],
                        'Similarity Score': float(r['similarity_score'])
                    } for r in processed_results]
                )

                # Display results in a table, formatted to two decimals
                st.write("### Ranking Results")
                st.dataframe(results_df.assign(**{
                    'Similarity Score': results_df['Similarity Score'].map("{:.2f}".format)
                }))

                # Visualize top results
                top_n = min(5, len(processed_results))

                fig = px.bar(
                    results_df.head(top_n),
                    x='File Name',
                    y='Similarity Score',
                    title=f'Top {top_n} Resume Matches',
                    color='Similarity Score'
                )
                st.plotly_chart(fig)

                # Perform enhanced ranking
                enhanced_results = enhanced_resume_ranking(job_description, valid_resumes)

                # Create dataframe for enhanced results; 'resume_index'
                # points into valid_names (upload order)
                enhanced_df = pd.DataFrame(
                    [{
                        'File Name': valid_names[r['resume_index']],
                        'Enhanced Score': f"{r['enhanced_similarity']:.2f}"
                    } for r in enhanced_results]
                )

                st.write("### Enhanced Ranking Results")
                st.dataframe(enhanced_df)

                # Skills analysis
                st.write("### Key Skills Analysis")
                skill_analysis = analyze_key_skills(job_description, valid_resumes)

                st.write(f"**Identified Skills in Job Description:** {len(skill_analysis['identified_skills'])}")
                st.write(", ".join(skill_analysis['identified_skills']))

                # Display skill matches
                st.write("**Skill Matches by Resume:**")
                for match in skill_analysis['skill_matches']:
                    resume_name = valid_names[match['resume_index']]
                    st.write(f"{resume_name}: {match['skill_count']} skills matched " +
                             f"({match['match_percentage']:.1%})")

        else:
            # Report every missing input, not just the first one
            if not job_description:
                st.error("Please enter a job description")
            if not uploaded_files:
                st.error("Please upload at least one resume")


# Script entry point: verify dependencies, then build the Streamlit page.
if __name__ == "__main__":
    # Check and install packages
    # NOTE(review): check_and_install_packages() is also called at module
    # level right after its definition, so this second call looks redundant
    # when the whole notebook is saved as one script - confirm before removing.
    check_and_install_packages()

    # Run the Streamlit app
    main()

In [None]:
# Running the Application

To run this resume ranking application:

1. Make sure all required packages are installed (run the installation cell above)
2. Save the complete code in a file named `app.py`
3. Run the application using the command:
   ```
   streamlit run app.py
   ```

## Example Usage:

1. Enter a job description in the text area
2. Upload multiple resume PDF files
3. Click "Process Resumes" to analyze and rank the candidates
4. View the ranking results, visualizations, and skills analysis
5. Use the information to identify the most promising candidates for further review

## Tips:

- For best results, use detailed job descriptions that specify required skills and qualifications
- Make sure resume PDFs are properly formatted and contain text that can be extracted
- The application works best with a reasonable number of resumes (5-20) to compare at once
- Both basic TF-IDF and enhanced semantic similarity rankings are provided for comparison