## Library imports

In [131]:
import spacy
from spacy.pipeline import EntityRuler
!pip install PyMuPDF
import fitz  # For PDF handling
!python -m spacy download en_core_web_md
!pip install docx
import os
!pip install tabulate
from tabulate import tabulate
import warnings
import json


Collecting en-core-web-md==3.7.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [135]:
!pip install python-docx
import docx

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-docx
Successfully installed python-docx-1.1.2


In [140]:
import warnings
warnings.filterwarnings('ignore')

## Loading and setting up the pipeline

In [None]:
# Load spaCy Model
nlp = spacy.load("en_core_web_md")
ruler = EntityRuler(nlp)

In [None]:
warnings.filterwarnings("ignore", message="[W036] The component 'entity_ruler' does not have any patterns defined.")

In [None]:
patterns = [
    {"label": "NAME", "pattern": [{"POS": "PROPN"}, {"POS": "PROPN"}]},  # Capture two-word names
    {"label": "PHONE", "pattern": [{"SHAPE": "dddd ddd dddd"}]},        # Indian phone format
    {"label": "EMAIL", "pattern": [{"LIKE_EMAIL": True}]},
    {"label": "LOCATION", "pattern": [{"ENT_TYPE": "GPE"}]},            # GPE = Geopolitical Entity

    # Education (more specific)
    {"label": "DEGREE", "pattern": [{"LOWER": "bachelor"}, {"LOWER": "of"}, {"LOWER": "technology"}]},
    {"label": "UNIVERSITY", "pattern": [{"LOWER": "mahindra"}, {"LOWER": "university"}]},
    {"label": "CGPA", "pattern": [{"LOWER": "cgpa"}, {"TEXT": ":"}, {"IS_DIGIT": True}, {"TEXT": "/"}, {"IS_DIGIT": True}]},  # CGPA: 8/10 format
    {"label": "PERCENTAGE", "pattern": [{"TEXT": "Percentage"}, {"TEXT": ":"}, {"LIKE_NUM": True}, {"TEXT": "%"}]},

    # Projects
    {"label": "PROJECT_NAME", "pattern": [{"IS_TITLE": True}, {"POS": "PROPN"}]},  # Capture project names
    {"label": "PROJECT_MONTH_YEAR", "pattern": [{"TEXT": {"IN": ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]}}, {"SHAPE": "dddd"}]},

    # Skills (more comprehensive list)
    {"label": "SKILL", "pattern": [{"LOWER": {"IN": ["python", "java", "c", "cpp", "matlab", "aws", "html", "css", "javascript", "nodejs", "nextjs", "reactjs", "expressjs", "prisma", "swift", "linux", "c++", "flask", "django", "faker", "pysynthetic", "mimesis", "json", "web", "tokens", "jwt", "parquet", "densenet-169", "arkit", "swift", "kepler's", "laws", "gravitation", "machine", "learning"]}}]},

    # Internship/Training Programs (adjust based on common company names)
    {"label": "COMPANY_NAME", "pattern": [{"LOWER": {"IN": ["dell", "cornell", "university", "national", "university", "of", "singapore"]}}]},

    # Awards and Achievements
    {"label": "AWARD", "pattern": [{"LOWER": "smart"}, {"LOWER": "india"}, {"LOWER": "hackathon"}]},  # Example specific award

    # Extracurricular Activities
    {"label": "INTEREST", "pattern": [{"LOWER": {"IN": ["football", "badminton", "e-sports", "movies", "music"]}}]},
    {"label": "SOFT_SKILL", "pattern": [{"LOWER": {"IN": ["leadership", "teamwork", "communication", "problem-solving", "critical thinking", "time management", "adaptability", "creativity"]}}]},
    {"label": "COURSE", "pattern": [{"LOWER": {"IN": ["operating", "systems", "data", "structures", "algorithms", "database", "management", "systems", "machine", "learning"]}}]},
]
ruler.add_patterns(patterns)
nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})


<spacy.pipeline.entityruler.EntityRuler at 0x7c9f5224a480>

## Data extraction and score calculation

### Data extraction methods

In [None]:
def extract_text_from_file(file_path):
    _, file_extension = os.path.splitext(file_path)
    if file_extension.lower() == ".pdf":
        return extract_text_from_pdf(file_path)
    elif file_extension.lower() == ".docx":
        return extract_text_from_docx(file_path)
    elif file_extension.lower() == ".doc":
        return extract_text_from_doc(file_path)
    elif file_extension.lower() == ".txt":
        return extract_text_from_txt(file_path)
    else:
        raise ValueError("Unsupported file format")

In [None]:
def extract_text_from_pdf(pdf_file_path):

    if os.path.splitext(pdf_file_path)[1].lower() == '.pdf':

      text = ""
      with fitz.open(pdf_file_path) as pdf_document:
          for page_num in range(len(pdf_document)):
              page = pdf_document.load_page(page_num)
              text += page.get_text()
      return text

In [None]:
def extract_text_from_docx(docx_file_path):
    doc = docx.Document(docx_file_path)
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text + "\n"
    return text

In [None]:
def extract_text_from_doc(doc_file_path):
    text = docx2txt.process(doc_file_path)
    return text

In [None]:
def extract_text_from_txt(txt_file_path):
    with open(txt_file_path, "r") as file:
        text = file.read()
    return text

### Methods

In [None]:
def lower_entities(entities_dict):
    lowercased_entities = {}
    for key, values in entities_dict.items():
        lowercased_entities[key] = [value.lower() for value in values]
    return lowercased_entities

In [None]:
def load_requirements(json_file_path):
    with open(json_file_path, "r") as file:
        return json.load(file)

In [136]:
def process_resumes(resumes_folder, requirements_path):
    results = {}
    # Load requirements from JSON file
    requirements = load_requirements(requirements_path)

    # Iterate through the files in the resumes folder
    for filename in os.listdir(resumes_folder):
        if filename.endswith('.pdf') or filename.endswith('.docx'):
            candidate_name = os.path.splitext(filename)[0]
            file_path = os.path.join(resumes_folder, filename)

            # Extract text based on file type
            if filename.endswith('.pdf'):
                pdf_text = extract_text_from_pdf(file_path)
            elif filename.endswith('.docx'):
                pdf_text = extract_text_from_docx(file_path)
            else:
                continue  # Skip processing for unsupported file types

            doc = nlp(pdf_text)

            # Extract entities from the NLP output
            extracted_entities = {}
            for ent in doc.ents:
                if ent.label_ not in extracted_entities:
                    extracted_entities[ent.label_] = []
                extracted_entities[ent.label_].append(ent.text)

            # Lowercase the extracted entities
            extracted_entities = lower_entities(extracted_entities)

            # Calculate score for the candidate
            score, matched, unmatched = calculate_score(requirements, extracted_entities)

            # Store the results
            results[candidate_name] = {
                "score": score,
                "matched": matched,
                "unmatched": unmatched
            }

    return results


In [153]:
def calculate_score(requirements, extracted_entities, required_score=20, additional_score=5):
    matched = {"required": [], "additional": []}
    unmatched = {"required": [], "additional": []}

    # Track the count of satisfied required and additional skills
    satisfied_required_count = 0
    satisfied_additional_count = 0

    # Calculate the original number of required and additional skills
    original_required_count = len(requirements["required"])
    original_additional_count = len(requirements["additional"])

    # Calculate the total possible score for required and additional skills
    total_possible_required_score = original_required_count * required_score
    total_possible_additional_score = original_additional_count * additional_score

    for requirement in requirements["required"]:
        found = False
        for key, values in extracted_entities.items():
            if requirement in values and requirement not in matched["required"]:
                found = True
                matched["required"].append(requirement)
                satisfied_required_count += 1
                break
        if not found:
            unmatched["required"].append(requirement)

    for additional in requirements["additional"]:
        found = False
        for key, values in extracted_entities.items():
            if additional in values and additional not in matched["additional"]:
                found = True
                matched["additional"].append(additional)
                satisfied_additional_count += 1
                break
        if not found:
            unmatched["additional"].append(additional)

    actual_required_score = satisfied_required_count * required_score
    actual_additional_score = satisfied_additional_count * additional_score

    # Check if the percentage of satisfied required skills is less than 85% of the original required count
    if satisfied_required_count < 0.10 * original_required_count:
        final_score = 0
    else:
        total_possible_score_achieved = actual_required_score + actual_additional_score
        final_score = round((total_possible_score_achieved / (total_possible_required_score + total_possible_additional_score)) * 100,2)

    return final_score, matched, unmatched


In [None]:
def generate_candidate_details(results):
    candidate_details = []

    for candidate_name, result in results.items():
        score = result["score"]
        satisfied_required_skills = result["matched"]["required"]
        satisfied_additional_skills = result["matched"]["additional"]
        not_satisfied_required_skills = result["unmatched"]["required"]
        not_satisfied_additional_skills = result["unmatched"]["additional"]

        # Append candidate details to the list as a tuple
        candidate_details.append((candidate_name, score, satisfied_required_skills, satisfied_additional_skills, not_satisfied_required_skills, not_satisfied_additional_skills))

    # Sort the list of tuples based on score in descending order
    candidate_details.sort(key=lambda x: x[1], reverse=True)

    table_headers = ["Candidate Name", "Score", "Satisfied Required Skills", "Satisfied Additional Skills", "Not Satisfied Required Skills", "Not Satisfied Additional Skills"]
    table_data = []

    for candidate_detail in candidate_details:
        candidate_name, score, satisfied_required_skills, satisfied_additional_skills, not_satisfied_required_skills, not_satisfied_additional_skills = candidate_detail

        table_row = [
            candidate_name,
            score,
            ", ".join(satisfied_required_skills),
            ", ".join(satisfied_additional_skills),
            ", ".join(not_satisfied_required_skills),
            ", ".join(not_satisfied_additional_skills)
        ]
        table_data.append(table_row)

    # Print candidate details in a pretty table format
    print(tabulate(table_data, headers=table_headers, tablefmt="pretty"))

    # Export details to a text file
    with open("candidate_details.txt", "w") as file:
        file.write(tabulate(table_data, headers=table_headers, tablefmt="plain"))


## Main

In [154]:
resumes_folder = "/content/resumes"
# requirements_path = "/content/requirements_python.json"
requirements_path = "/content/requirements_web.json"

results = process_resumes(resumes_folder,requirements_path)
generate_candidate_details(results)

+-------------------------------+-------+-------------------------------------+-----------------------------+--------------------------------------------------+---------------------------------+
|        Candidate Name         | Score |      Satisfied Required Skills      | Satisfied Additional Skills |          Not Satisfied Required Skills           | Not Satisfied Additional Skills |
+-------------------------------+-------+-------------------------------------+-----------------------------+--------------------------------------------------+---------------------------------+
|        MV Sai Gowtham         | 62.5  | javascript, sql, node, express, css |                             |                   react, html                    |   word, excel, github, docker   |
|         Basava Laxmi          | 53.12 |     javascript, sql, node, css      |           github            |               react, express, html               |       word, excel, docker       |
|         Adepu Ashvith  