In [2]:
%%capture
# ---------------------------
# IMPORTS AND SETUP
# ---------------------------
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util
import json
import spacy
from functools import lru_cache
import torch

# Sentence-embedding model shared by every similarity helper in this notebook.
# NOTE(review): trust_remote_code=True allows custom model code from the hub repository to run
# locally — confirm this is acceptable and consider pinning a revision.
sentence_model = SentenceTransformer("jinaai/jina-embeddings-v3", trust_remote_code=True)

# Set pandas options
# None removes the display limit, so full frames are rendered when displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Data files are resolved relative to the directory the kernel was started in.
current_wd = os.getcwd()

# Load job descriptions and resumes
job_desc_data = pd.read_csv(os.path.join(current_wd, "data", "job_descriptions_latest.csv"))
resume_data = pd.read_csv(os.path.join(current_wd, "data", "resume_extraction_samples_3.1.25.xlsx - Sheet1.csv"))

# Convert JSON strings to dicts
# Both frames carry the raw JSON text in an 'extracted' column; json.loads raises on the
# first value that is not valid JSON text.
job_desc_data["job_desc_parsed"] = job_desc_data["extracted"].apply(json.loads)
resume_data["resume_parsed"] = resume_data["extracted"].apply(json.loads)

# Select JSON columns
job_desc_json = job_desc_data['job_desc_parsed']
resume_json = resume_data['resume_parsed']

# Move the model to the GPU when one is available, otherwise keep it on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sentence_model.to(device)

In [7]:
def extract_all_strings_from_json(j):
    """
    Recursively extracts all non-empty strings from a JSON-like object.

    Strings are collected wherever they occur as values: inside lists, directly as
    dict values, or as the top-level object itself. Dict keys are not collected.
    Each string is stripped of surrounding whitespace; strings that are empty after
    stripping are dropped. Non-string scalars (numbers, booleans, None) are ignored.

    Parameters:
        j: A dict, list, string, or any other JSON scalar.

    Returns:
        list[str]: The stripped strings in traversal order (duplicates are kept).
    """
    # Handling the string case here (instead of only inside the list branch) is what
    # lets strings stored directly as dict values be collected too.
    if isinstance(j, str):
        s = j.strip()
        return [s] if s else []

    result = []
    if isinstance(j, dict):
        for v in j.values():
            result.extend(extract_all_strings_from_json(v))
    elif isinstance(j, list):
        for item in j:
            result.extend(extract_all_strings_from_json(item))
    return result

# --- Precompute Embeddings for a DataFrame Column ---
def precompute_embeddings_for_df(df, extracted_col='extracted', batch_size=64):
    """
    Embeds every unique string found in the JSON text stored in one DataFrame column.

    Each value of `extracted_col` is parsed with json.loads, all strings are pulled out
    with extract_all_strings_from_json, and the de-duplicated strings are encoded in
    batches with the notebook-level `sentence_model`.

    Parameters:
        df (pd.DataFrame): DataFrame holding the JSON text (job descriptions or resumes).
        extracted_col (str): Name of the column that contains the JSON string.
        batch_size (int): Batch size for encoding.

    Returns:
        dict: Mapping from string to its embedding; empty when no string was found.

    Rows whose value cannot be parsed (invalid JSON, NaN/None, non-text) are reported
    and skipped. Any other error is allowed to propagate so that a real problem - for
    example a helper that has not been defined yet - is not silently turned into an
    empty result.
    """
    unique_strings = set()

    # Only one column is needed, so iterate over it directly instead of building a
    # Series per row with iterrows(); the index label is still available for logging.
    for idx, json_str in df[extracted_col].items():
        try:
            # json.JSONDecodeError is a ValueError; a non-text value raises TypeError.
            parsed = json.loads(json_str)
            unique_strings.update(extract_all_strings_from_json(parsed))
        except (TypeError, ValueError, RecursionError) as e:
            print(f"Error parsing JSON in row {idx}: {e}")

    if not unique_strings:
        # Nothing to encode: skip the model entirely.
        return {}

    # Sort so the batching order is the same on every run (set order is not).
    all_strings = sorted(unique_strings)
    embeddings = sentence_model.encode(all_strings, batch_size=batch_size, convert_to_tensor=False)

    # Create a dictionary mapping each string to its embedding
    return dict(zip(all_strings, embeddings))


# Embed every unique string found in the job descriptions' raw 'extracted' JSON.
# NOTE(review): get_embedding() in the next cell keeps its own `embedding_cache`; nothing
# visible in this file reads `embeddings` back — confirm whether it should seed that cache.
embeddings = precompute_embeddings_for_df(job_desc_data, extracted_col='extracted', batch_size=64)
# Disabled: embed the resume strings as well and merge both lookups.
# NOTE(review): the merge line refers to `job_embeddings`, which is never assigned in this cell.
# resume_embeddings = precompute_embeddings_for_df(resume_data, extracted_col='extracted', batch_size=64)
# embeddings = {**job_embeddings, **resume_embeddings}

In [19]:
# ---------------------------
# NLP SIMILARITY FUNCTIONS
# ---------------------------
# Global dictionary for caching embeddings
# Maps stripped text -> embedding; filled lazily by get_embedding().
embedding_cache = {}

def get_embedding(text: str):
    """
    Looks up (or computes and stores) the embedding of `text`.

    The text is stripped first and the stripped form is the cache key. On a miss the
    text is encoded with `sentence_model` (convert_to_tensor=True) and the result is
    kept in the global `embedding_cache` for later calls.
    """
    key = text.strip()
    if key not in embedding_cache:
        embedding_cache[key] = sentence_model.encode(key, convert_to_tensor=True)
    return embedding_cache[key]

def cosine_similarity(vec1, vec2):
    """
    Thin wrapper that forwards both arguments to sentence-transformers' util.cos_sim.

    NOTE(review): this definition replaces the `cosine_similarity` name imported from
    sklearn.metrics.pairwise in the setup cell — confirm the sklearn version is not
    needed after this cell runs.
    """
    similarity = util.cos_sim(vec1, vec2)
    return similarity

def nlp_similarity_cached(text1, text2):
    """
    Cosine similarity between two texts, computed on their cached embeddings.

    Both embeddings come from get_embedding(), so a text that was seen before is not
    encoded again. The value returned is whatever cosine_similarity() returns.
    """
    first_vector, second_vector = get_embedding(text1), get_embedding(text2)
    return cosine_similarity(first_vector, second_vector)

# ---------------------------
# SECTION 1: SKILL MATCHING
# ---------------------------
def extract_job_mandatory_skills(job_json):
    """
    Returns the list stored under job_json["mandatory"]["hard_skills"].

    A missing "mandatory" section or a missing "hard_skills" key yields an empty list.
    """
    mandatory_section = job_json.get("mandatory", {})
    return mandatory_section.get("hard_skills", [])

def extract_job_preferred_skills(job_json):
    """
    Returns the list stored under job_json["preferred"]["hard_skills"].

    A missing "preferred" section or a missing "hard_skills" key yields an empty list.
    """
    preferred_section = job_json.get("preferred", {})
    return preferred_section.get("hard_skills", [])

def extract_resume_skills(resume_json):
    """
    Returns the value of the resume's top-level "skills" key, or an empty list when absent.
    """
    skills = resume_json.get("skills", [])
    return skills

def _as_term_groups(terms):
    """
    Splits a list of skill terms into groups (a list of lists).

    A flat list is returned as a single group. If any element is itself a list, the
    value is treated as nested and every bare element is wrapped into its own
    single-term group, so a stray string is never iterated character by character.
    """
    if any(isinstance(item, list) for item in terms):
        return [item if isinstance(item, list) else [item] for item in terms]
    return [terms]


def compute_required_skill_similarity(candidate_skill, job_required_skill, perfect_tol=1e-6):
    """
    Given a candidate skill (which may be a string, a list of strings, or a nested list)
    and a job-required skill value (a flat or nested list of terms), compute a similarity score.

    Logic:
      - Normalize candidate_skill into candidate groups. If candidate_skill contains inner lists,
        each inner list is considered an alternative (OR condition).
      - Normalize job_required_skill into job requirement groups. If it's a flat list,
        wrap it into one group.
      - For each job requirement group, loop over each candidate group and, for each candidate term,
        compute its similarity to every term in the required group; average these similarities per candidate term.
      - Then average across the candidate terms for the group.
      - If any candidate group yields a perfect average, return 1.0 immediately.
      - Otherwise, return the maximum average similarity among all job-required groups.

    Parameters:
      - candidate_skill: str, list of str, or list of lists of str. Anything else scores 0.0.
      - job_required_skill: non-empty list (flat or nested). Anything else scores 0.0.
      - perfect_tol: an average within this distance below 1.0 (or above it) counts as a
        perfect match. Cosine similarities of identical texts are floating-point values
        that frequently land a hair away from 1.0, so an exact equality test is unreliable.

    Returns:
      - float: exactly 1.0 on a perfect match, otherwise the best group average (0.0 when
        nothing could be compared).

    Progress is printed for every comparison.

    Update 3/7/2025: Added detailed print statements and early exits on perfect matches.
    """
    # Normalize candidate_skill into candidate groups
    if isinstance(candidate_skill, str):
        candidate_groups = [[candidate_skill]]
    elif isinstance(candidate_skill, list):
        candidate_groups = _as_term_groups(candidate_skill)
    else:
        candidate_groups = []

    # Normalize job_required_skill into job requirement groups
    if isinstance(job_required_skill, list) and job_required_skill:
        job_required_groups = _as_term_groups(job_required_skill)
    else:
        job_required_groups = []

    if not candidate_groups or not job_required_groups:
        return 0.0

    best_overall = 0.0

    # Iterate over each job requirement group
    for req_group in job_required_groups:
        print(f"\nProcessing Job Requirement Group: {req_group}")
        best_for_req = 0.0
        best_candidate_group = None
        for cand_group in candidate_groups:
            print(f"  Evaluating Candidate Group: {cand_group}")
            candidate_term_avgs = []
            for cand_term in cand_group:
                sims = []
                for req_term in req_group:
                    sim = nlp_similarity_cached(cand_term, req_term).item()
                    print(f"    Candidate term '{cand_term}' vs. Required term '{req_term}': similarity = {sim}")
                    sims.append(sim)
                if sims:
                    avg_sim = sum(sims) / len(sims)
                    print(f"    => Average similarity for candidate term '{cand_term}': {avg_sim}")
                    candidate_term_avgs.append(avg_sim)
            if candidate_term_avgs:
                group_avg = sum(candidate_term_avgs) / len(candidate_term_avgs)
                print(f"  => Candidate Group {cand_group} average similarity: {group_avg}")
                # Tolerant comparison; returning the literal 1.0 keeps callers' `== 1.0` checks working.
                if group_avg >= 1.0 - perfect_tol:
                    print("  => Perfect match found for this candidate group. Exiting job requirement processing early.\n")
                    return 1.0
                if group_avg > best_for_req:
                    best_for_req = group_avg
                    best_candidate_group = cand_group
        print(f"=> Best match for required group {req_group} is candidate group {best_candidate_group} with average similarity {best_for_req}")
        # A perfect group has already returned above, so only the running maximum is tracked here.
        if best_for_req > best_overall:
            best_overall = best_for_req

    print(f"\n   --> Best overall similarity among job-required groups: {best_overall}")
    return best_overall

def _first_number(value, default=0):
    """
    Reduces a 'years' field to a single number.

    Accepts a scalar, a list/tuple (its first element is used), an empty list, or None;
    an empty list, None, or a None first element yield `default`.
    """
    if isinstance(value, (list, tuple)):
        value = value[0] if value else default
    return default if value is None else value


def calculate_skill_match_score(job_json, resume_json, skill_type='mandatory'):
    """
    Calculates an overall match score for skills.

    For each job-required skill requirement:
      - Print the requirement and minimum years.
      - For each candidate skill (that meets the minimum years), compute the similarity
        using compute_required_skill_similarity.
      - If a candidate skill returns a perfect match (1.0), break early for that requirement.
      - Append the best similarity for each requirement and average them.

    Parameters:
      - job_json: job description dict; requirements are read from the "mandatory" section
        when skill_type == 'mandatory', otherwise from the "preferred" section.
      - resume_json: resume dict; candidate skills are read from its "skills" list.
      - skill_type: 'mandatory' selects mandatory hard skills, any other value selects preferred.

    Returns:
      - float: mean of the best similarity per requirement, or 0.0 when the job lists no
        requirement of this type.

    The "minyears" and "years" fields are read defensively: a missing value, None, or an
    empty list counts as 0, and a bare number is accepted where a one-element list is usual.

    Update 3/7/2025: Added an early exit if a perfect match is found.
    """
    if skill_type == 'mandatory':
        job_skills = extract_job_mandatory_skills(job_json)
    else:
        job_skills = extract_job_preferred_skills(job_json)

    resume_skills = extract_resume_skills(resume_json)
    requirement_scores = []

    for req in job_skills:
        job_required_skill = req.get("skill", [])
        min_years_required = _first_number(req.get("minyears", [0]))

        print("----------------------------------------------------")
        print("Processing Job Skill Requirement:")
        print(f"  Job Requirement: {job_required_skill}")
        print(f"  Minimum Years Required: {min_years_required}")

        best_match = 0.0
        best_candidate_skill = None
        perfect_found = False

        for candidate in resume_skills:
            candidate_years = _first_number(candidate.get("years", 0))
            # Skills held for less than the required time are not compared at all.
            if candidate_years >= min_years_required:
                for candidate_skill in candidate.get("skill", []):
                    sim = compute_required_skill_similarity(candidate_skill, job_required_skill)
                    print(f"   Candidate Skill: {candidate_skill} (Years: {candidate_years}) -> Similarity: {sim}")
                    if sim > best_match:
                        best_match = sim
                        best_candidate_skill = candidate_skill
                    if sim == 1.0:
                        print("=> Perfect match found for this candidate skill. Skipping remaining candidate skills for this requirement.\n")
                        perfect_found = True
                        break
                if perfect_found:
                    # Leave the outer candidate loop as well.
                    break
        print(f"  Best match for requirement {job_required_skill}: {best_match} from candidate skill: {best_candidate_skill}")
        requirement_scores.append(best_match)
        if best_match == 1.0:
            print("=> Perfect match achieved for this job requirement; moving on to the next requirement.\n")

    overall_skill_score = sum(requirement_scores) / len(requirement_scores) if requirement_scores else 0.0
    print("----------------------------------------------------")
    print(f"Overall Skill Match Score for '{skill_type}': {overall_skill_score}")
    return overall_skill_score

def calculate_overall_skill_match_score(job_json, resume_json, mandatory_weight=0.5, preferred_weight=0.5):
    """
    Calculates the overall skills match score as a weighted average of mandatory and preferred skills match scores.

    A section (mandatory or preferred) that lists no hard skills in the job description is
    left out of the average instead of being counted as a score of 0. The per-section
    scorer returns 0.0 both for "nothing matched" and for "nothing was required", so the
    presence of requirements is checked here before it is called.

    Parameters:
      - job_json: JSON object representing the job description.
      - resume_json: JSON object representing the candidate's resume.
      - mandatory_weight: Weight for the mandatory skills score.
      - preferred_weight: Weight for the preferred skills score.

    Returns:
      - overall_skill_score (float): The weighted overall skills match score; the single
        available section score when only one section has requirements; 0.0 when neither does.

    Update 3/7/2025: Added weighted average calculation.
    """
    if extract_job_mandatory_skills(job_json):
        mand_score = calculate_skill_match_score(job_json, resume_json, skill_type='mandatory')
    else:
        print("=> No mandatory hard skills specified in job description.\n")
        mand_score = None

    if extract_job_preferred_skills(job_json):
        pref_score = calculate_skill_match_score(job_json, resume_json, skill_type='preferred')
    else:
        print("=> No preferred hard skills specified in job description.\n")
        pref_score = None

    if mand_score is None and pref_score is None:
        overall_skill_score = 0.0
    elif mand_score is None:
        overall_skill_score = pref_score
    elif pref_score is None:
        overall_skill_score = mand_score
    else:
        overall_skill_score = (mand_score * mandatory_weight + pref_score * preferred_weight) / (mandatory_weight + preferred_weight)

    print(f"Overall Skill Match Score (Weighted): {overall_skill_score}")
    return overall_skill_score

# ---------------------------
# SECTION 2: RESPONSIBILITIES MATCHING
# ---------------------------
def extract_job_responsibilities_hard_skills(job_json):
    """
    Returns the list stored under job_json["responsibility"]["hard_skills"].
    Expected format: { "responsibility": { "hard_skills": [ { "skill": [...] }, ... ] } }
    A missing section or key yields an empty list.
    Update 3/7/2025: Extracts the list under "responsibility" -> "hard_skills".
    """
    responsibility_section = job_json.get("responsibility", {})
    return responsibility_section.get("hard_skills", [])

def calculate_responsibilities_match_score(job_json, resume_json):
    """
    Scores how well the resume's skills cover the job's responsibility hard skills.

    Every job responsibility is compared against the full "skill" list of each resume
    skill entry via compute_required_skill_similarity; the best similarity per
    responsibility is kept and the kept values are averaged.

    Returns None when the job lists no responsibilities, otherwise the average (float).
    Both inputs and all intermediate results are printed.
    """
    required_items = extract_job_responsibilities_hard_skills(job_json)
    print("========== Job Responsibilities ==========")
    print(json.dumps(required_items, indent=4, ensure_ascii=False))

    # The candidate side is read from the resume's skills list.
    candidate_items = extract_resume_skills(resume_json)
    print("========== Candidate Responsibilities ==========")
    print(json.dumps(candidate_items, indent=4, ensure_ascii=False))

    if not required_items:
        print("=> No responsibilities specified in job description.\n")
        return None

    per_requirement = []
    for required_item in required_items:
        wanted = required_item.get("skill", [])
        print("\n--- Checking Job Responsibility ---")
        print("Required Responsibility:")
        print(json.dumps(wanted, indent=4, ensure_ascii=False))

        top_score, top_offer = 0.0, None
        for candidate_item in candidate_items:
            offered = candidate_item.get("skill", [])
            score = compute_required_skill_similarity(offered, wanted)
            print(f"Candidate Responsibility: {offered} -> Similarity: {score}")
            if score > top_score:
                top_score, top_offer = score, offered
        print(f"=> Best match for responsibility: {top_offer} with score: {top_score}\n")
        per_requirement.append(top_score)

    if per_requirement:
        overall = sum(per_requirement) / len(per_requirement)
    else:
        overall = 0.0
    print(f"\n========== Overall Responsibilities Match Score: {overall} ==========\n")
    return overall

# ---------------------------
# SECTION 3: EDUCATION MATCHING
# ---------------------------
# Ordinal rank of each recognised education-level label; a larger number is a higher level.
# The functions in this section look labels up with EDU_RANK.get(label, 0), so any label
# that is not listed here ranks 0. Lookups are exact-match (case and apostrophe style matter):
# "Bachelor's" and "Master's" are listed with both the straight (') and the typographic (’)
# apostrophe, the other labels only with the straight one.
EDU_RANK = {
    "High School Diploma": 1,
    "Vocational": 1,  # or adjust rank if desired
    "Associate's": 2,
    # In-progress and partial degrees share a rank one below the completed degree.
    "Current Bachelor's Student": 3,
    "Some Bachelor's": 3,
    "Bachelor’s": 4,
    "Bachelor's": 4,
    "Some Master's": 5,
    "Current Master's Student": 5,
    "Master’s": 6,
    "Master's": 6,
    "PhD": 7,
    "Postdoctoral": 8
}

def extract_job_education_requirements(job_json):
    """
    Returns (mandatory_education, preferred_education) from the job description.

    Each element is the "education" list of the corresponding section, or an empty list
    when the section or the key is missing.
    """
    return (
        job_json.get("mandatory", {}).get("education", []),
        job_json.get("preferred", {}).get("education", []),
    )

def extract_resume_education(resume_json):
    """Returns the resume's top-level "education" value, or an empty list when absent."""
    education_entries = resume_json.get("education", [])
    return education_entries

def extract_professional_background(resume_json):
    """Returns the resume's top-level "professional_background" value, or an empty list when absent."""
    background_entries = resume_json.get("professional_background", [])
    return background_entries

def candidate_has_education_level(resume_education, required_rank):
    """
    Reports whether any education entry ranks at or above `required_rank`.

    Each entry's "education_level" is ranked through EDU_RANK (unknown or missing labels
    rank 0). The first entry that qualifies ends the search with True; every comparison
    made up to that point is printed.
    """
    for entry in resume_education:
        attained = entry.get("education_level", "")
        attained_rank = EDU_RANK.get(attained, 0)
        print(f"[Level Check] Candidate Education: '{attained}' (Rank: {attained_rank}) vs Required Rank: {required_rank}")
        if not (attained_rank >= required_rank):
            continue
        print("=> Candidate meets the education level requirement.\n")
        return True

    print("=> Candidate does NOT meet the education level requirement.\n")
    return False

def get_required_field_score(resume_education, resume_experience, required_fields, must_have_formal, required_rank, threshold=0.6, min_years=4, ignore_threshold=False):
    """
    Averages the similarity between the candidate's majors and the required fields of study.

    Only education entries whose level ranks at least `required_rank` are considered. A
    required field spelled "related" (any case) is not compared itself; its presence
    lowers the acceptance cut-off from 0.95 to `threshold`. With ignore_threshold=True
    every similarity is kept regardless of the cut-off.

    When no similarity is kept, the score from get_equivalent_experience_score (called
    with the unfiltered required_fields, `threshold` and `min_years`) is returned instead.

    NOTE(review): `must_have_formal` is accepted but never read, so the experience
    fallback also applies when formal education is mandatory — confirm this is intended.
    """
    has_related_marker = any(name.lower() == "related" for name in required_fields)
    concrete_fields = [name for name in required_fields if name.lower() != "related"]
    # The cut-off does not depend on the entry being examined, so it is fixed up front.
    cutoff = threshold if has_related_marker else 0.95

    print("\n========== Formal Education Matching ==========")
    print(f"Required Fields: {json.dumps(required_fields, indent=4, ensure_ascii=False)}")

    kept = []
    for record in resume_education:
        level_name = record.get("education_level", "")
        rank = EDU_RANK.get(level_name, 0)
        print(f"Candidate Education: Level='{level_name}', Majors={json.dumps(record.get('major', []), indent=4, ensure_ascii=False)}")
        if not (rank >= required_rank):
            continue
        for major in record.get("major", []):
            for wanted in concrete_fields:
                similarity = nlp_similarity_cached(major, wanted)
                print(f"[Formal] Comparing Candidate Major '{major}' with Required Field '{wanted}' => {similarity.item()}")
                if ignore_threshold or similarity >= cutoff:
                    kept.append(similarity.item())

    if not kept:
        print("=> No formal education match found; using experience fallback...\n")
        return get_equivalent_experience_score(resume_experience, required_fields, threshold=threshold, min_years=min_years)

    mean_score = sum(kept) / len(kept)
    print(f"=> Average Similarity Score from Formal Education: {mean_score}\n")
    return mean_score

def get_equivalent_experience_score(resume_experience, related_fields, threshold=0.6, min_years=4):
    """
    Scores work experience as a substitute for formal education.

    For every experience entry the best similarity between any of its
    "related_fields_of_study" and any required field (entries spelled "related" are
    ignored) is taken. Entries whose best similarity reaches `threshold` contribute
    their "years" as weight. The years-weighted mean of those similarities is returned
    when the contributing years add up to at least `min_years` (and are positive);
    otherwise 0.0.
    """
    print("\n========== Experience Matching ==========")
    target_fields = [name for name in related_fields if name.lower() != "related"]

    relevant_years = 0.0
    similarity_years = 0.0
    for position in resume_experience:
        studied = position.get("related_fields_of_study", [])
        titles = position.get("background", [])
        title_label = ", ".join(titles) if titles else "Unknown"
        duration = position.get("years", 0)

        best = 0.0
        for studied_field in studied:
            for target in target_fields:
                similarity = nlp_similarity_cached(studied_field, target)
                print(f"[Experience] Job: '{title_label}' | Candidate Field: '{studied_field}' vs Required: '{target}' => {similarity.item()}")
                if similarity.item() > best:
                    best = similarity.item()

        if not (best >= threshold):
            continue
        similarity_years += best * duration
        relevant_years += duration
        print(f"=> Using max similarity {best} (Years: {duration}) | Contribution: {best * duration}")
        print(f"=> Total Relevant Years so far: {relevant_years}\n")

    if not (relevant_years >= min_years and relevant_years > 0):
        print(f"=> Total Relevant Experience: {relevant_years} (Min Required: {min_years}) -- Not enough experience.\n")
        return 0.0

    weighted_mean = similarity_years / relevant_years
    print(f"=> Total Relevant Experience: {relevant_years} (Min Required: {min_years})")
    print(f"=> Weighted Average Similarity Score from Experience: {weighted_mean}\n")
    return weighted_mean

def meets_education_requirement(requirement, resume_education, resume_experience, threshold=0.7, min_years=4, allow_fallback=False, job_json=None, resume_json=None):
    """
    Computes a composite score for a single education requirement.

    The result is the plain average of every score collected below:
      - one credentials score when any required level mentions 'credential', plus
      - one score per required level when fields of study are listed, or
      - a single level check when no field of study is listed.

    Parameters:
      - requirement: dict read for "field_of_study" and "education_level"; both are
        iterated as lists of strings.
      - resume_education: candidate education entries, passed on to the helpers.
      - resume_experience: candidate experience entries, used for experience-based scores.
      - threshold: similarity threshold forwarded to the field and experience helpers.
      - min_years: minimum relevant years forwarded to the experience helper.
      - allow_fallback: when True, experience is accepted even if no level says "or experience".
      - job_json / resume_json: only used for credential matching; if either is missing,
        a credential requirement contributes a score of 0.

    Returns:
      - float: the averaged score, or 0.0 when nothing was scored.

    Update 3/7/2025: If a required level is "Or Experience" and is not in EDU_RANK, skip it.
    """
    print("\n========== Checking Single Education Requirement ==========")
    print("Job Requirement:")
    print(json.dumps(requirement, indent=4, ensure_ascii=False))
    
    req_fields = requirement.get("field_of_study", [])
    req_levels = requirement.get("education_level", [])
    
    # Formal education is required unless some level contains "or experience"
    # (case-insensitive substring) or the caller explicitly allows the fallback.
    must_have_formal = True
    for lvl in req_levels:
        if "or experience" in lvl.lower():
            must_have_formal = False
            print("=> Job accepts equivalent experience in lieu of formal education.\n")
            break
    if allow_fallback:
        must_have_formal = False
    
    # Highest rank among the listed levels; labels missing from EDU_RANK rank 0 and never
    # raise it. This value is only used by the "no field of study" branch at the end.
    max_required_rank = 0
    for lvl in req_levels:
        if lvl.lower().strip() == "or experience" and EDU_RANK.get(lvl, 0) == 0:
            print(f"=> Skipping education level '{lvl}' because it is 'Or Experience' and not ranked.\n")
            continue
        lvl_rank = EDU_RANK.get(lvl, 0)
        if lvl_rank > max_required_rank:
            max_required_rank = lvl_rank
            print(f"=> Updated Required Education Rank to: {max_required_rank} based on level '{lvl}'\n")
    
    level_scores = []
    
    # Check for credential requirement.
    credential_req = any("credential" in lvl.lower() for lvl in req_levels)
    if credential_req:
        if job_json is not None and resume_json is not None:
            print("=> Credential requirement detected. Checking candidate credentials...\n")
            # Defined in the credentials section further down this file; resolved at call time.
            cred_score = calculate_overall_credentials_score(job_json, resume_json)
            print(f"=> Credential Score: {cred_score}\n")
            level_scores.append(cred_score)
        else:
            print("=> Credential requirement detected but job/resume JSON not provided for credentials matching.\n")
            level_scores.append(0)
    
    if req_fields:
        print(f"=> Required Field(s) of Study: {json.dumps(req_fields, indent=4, ensure_ascii=False)}\n")
        # One score per listed level (the bare "Or Experience" marker is skipped).
        # NOTE(review): a level label that is not a key of EDU_RANK (for example one that
        # mentions a credential) gets rank 0 here and is still scored as an education
        # level — confirm this is intended.
        for lvl in req_levels:
            if lvl.lower().strip() == "or experience" and EDU_RANK.get(lvl, 0) == 0:
                continue
            req_rank = EDU_RANK.get(lvl, 0)
            if must_have_formal:
                formal_score = get_required_field_score(resume_education, resume_experience, req_fields, must_have_formal, req_rank, threshold=threshold, min_years=min_years, ignore_threshold=False)
            else:
                formal_score = get_required_field_score(resume_education, resume_experience, req_fields, must_have_formal, req_rank, threshold=threshold, min_years=min_years, ignore_threshold=True)
                exp_score = get_equivalent_experience_score(resume_experience, req_fields, threshold=threshold, min_years=min_years)
                print(f"=> Formal Education Score: {formal_score}")
                print(f"=> Experience Score: {exp_score}\n")
                # Both positive: take their mean. Otherwise keep formal_score unless it is
                # falsy (0), in which case exp_score is used.
                if formal_score > 0 and exp_score > 0:
                    formal_score = (formal_score + exp_score) / 2
                else:
                    formal_score = formal_score or exp_score
            print(f"=> Score for level '{lvl}': {formal_score}\n")
            level_scores.append(formal_score)
    else:
        # No field of study listed: only the education level (or experience) is checked.
        if must_have_formal:
            level_scores.append(1.0 if candidate_has_education_level(resume_education, max_required_rank) else 0.0)
        else:
            # NOTE(review): "Any" is passed as a literal field name, so the experience helper
            # compares candidate fields against that word — confirm this is the intent.
            level_scores.append(1.0 if candidate_has_education_level(resume_education, max_required_rank) 
                                else get_equivalent_experience_score(resume_experience, ["Any"], threshold=threshold, min_years=min_years))
    overall_edu_score = sum(level_scores) / len(level_scores) if level_scores else 0.0
    print(f"=> Final Composite Score for Requirement: {overall_edu_score}\n")
    return overall_edu_score

def calculate_mandatory_education_score(job_json, resume_json, threshold=0.7, min_years=4):
    """
    Scores the resume against the job's mandatory education requirements.

    Returns None when the job lists no mandatory education requirement, 0.0 as soon as
    one requirement scores 0, and otherwise the mean of the per-requirement scores from
    meets_education_requirement.
    """
    required, _ = extract_job_education_requirements(job_json)
    candidate_edu = extract_resume_education(resume_json)
    candidate_exp = extract_professional_background(resume_json)

    print("\n========== Mandatory Education Requirements ==========")
    print(json.dumps(required, indent=4, ensure_ascii=False))
    print("\n========== Candidate Education ==========")
    print(json.dumps(candidate_edu, indent=4, ensure_ascii=False))

    if not required:
        print("=> No mandatory education requirements specified.\n")
        return None

    collected = []
    for requirement in required:
        print("\n--- Checking Mandatory Requirement ---")
        requirement_score = meets_education_requirement(
            requirement,
            candidate_edu,
            candidate_exp,
            threshold=threshold,
            min_years=min_years,
            job_json=job_json,
            resume_json=resume_json,
        )
        # A single unmet mandatory requirement fails the whole section.
        if requirement_score == 0:
            print("!!! Mandatory education requirement NOT met. Final Score: 0.0 !!!\n")
            return 0.0
        print(f"=> Mandatory Requirement Score: {requirement_score}\n")
        collected.append(requirement_score)

    return sum(collected) / len(collected)

def calculate_preferred_education_score(job_json, resume_json, threshold=0.7, min_years=4):
    """
    Scores the resume against the job's preferred education requirements.

    Returns None when the job lists no preferred education requirement, otherwise the
    mean of the per-requirement scores from meets_education_requirement, which is
    called with allow_fallback=True for every preferred requirement.
    """
    _, wished = extract_job_education_requirements(job_json)
    candidate_edu = extract_resume_education(resume_json)
    candidate_exp = extract_professional_background(resume_json)

    print("\n========== Preferred Education Requirements ==========")
    print(json.dumps(wished, indent=4, ensure_ascii=False))
    print("\n========== Candidate Education ==========")
    print(json.dumps(candidate_edu, indent=4, ensure_ascii=False))

    if not wished:
        print("=> No preferred education requirements specified.\n")
        return None

    collected = []
    for requirement in wished:
        print("\n--- Checking Preferred Requirement ---")
        print("Requirement:")
        print(json.dumps(requirement, indent=4, ensure_ascii=False))
        requirement_score = meets_education_requirement(
            requirement,
            candidate_edu,
            candidate_exp,
            threshold=threshold,
            min_years=min_years,
            allow_fallback=True,
            job_json=job_json,
            resume_json=resume_json,
        )
        print(f"=> Preferred Requirement Score: {requirement_score}\n")
        collected.append(requirement_score)

    return sum(collected) / len(collected)

def calculate_education_match_score(job_json, resume_json, threshold=0.7, min_years=4, mandatory_weight=0.5, preferred_weight=0.5):
    """
    Combines the mandatory and preferred education scores into one value.

    Parameters:
      - job_json: JSON object of the job description.
      - resume_json: JSON object of the candidate's resume.
      - threshold: Similarity threshold passed to both section scorers.
      - min_years: Minimum years passed to both section scorers.
      - mandatory_weight: Weight for the mandatory education score.
      - preferred_weight: Weight for the preferred education score.

    Returns:
      - The weighted average when both sections produced a score, the single available
        score when only one did, and 0 when neither did (a section scorer returns None
        when the job lists no requirement for it).
    """
    print("========== Starting Education Match Score Calculation ==========\n")
    mandatory_part = calculate_mandatory_education_score(job_json, resume_json, threshold, min_years)
    preferred_part = calculate_preferred_education_score(job_json, resume_json, threshold, min_years)

    if mandatory_part is None and preferred_part is None:
        combined = 0
    elif preferred_part is None:
        combined = mandatory_part
    elif mandatory_part is None:
        combined = preferred_part
    else:
        total_weight = mandatory_weight + preferred_weight
        combined = (mandatory_part * mandatory_weight + preferred_part * preferred_weight) / total_weight

    print(f"\n========== Overall Education Match Score: {combined} ==========\n")
    return combined

# ---------------------------
# SECTION 4: CREDENTIALS MATCHING
# ---------------------------
def extract_job_credentials(job_json):
    """
    Reads the credential requirements from the job description and prints them.

    Returns a tuple: (mandatory_credentials, preferred_credentials). Each element is the
    "credentials" list of the corresponding section, or an empty list when it is missing.
    """
    required = job_json.get("mandatory", {}).get("credentials", [])
    wished = job_json.get("preferred", {}).get("credentials", [])

    print("Extracted Job Credentials:")
    print(f"  Mandatory: {required}")
    print(f"  Preferred: {wished}\n")
    return required, wished

def extract_resume_credentials(resume_json):
    """
    Reads the resume's top-level "credentials" list (empty when absent), prints it and returns it.
    """
    held = resume_json.get("credentials", [])
    print("Extracted Resume Credentials:")
    print(f"  {held}\n")
    return held

def match_credentials(required_creds, resume_creds):
    """
    Scores how well the candidate's credentials cover the required ones.

    Every name in each requirement's "credential" list is compared with every name in
    each resume entry's "credential" list using NLP similarity; the best similarity per
    required name (never below 0.0) is kept, and the kept values are averaged.

    Returns None when `required_creds` is empty, otherwise the average (0.0 when the
    requirements contain no credential names).
    """
    if not required_creds:
        return None

    per_credential = []
    for requirement in required_creds:
        for wanted in requirement.get("credential", []):
            similarities = [
                nlp_similarity_cached(wanted, held).item()
                for entry in resume_creds
                for held in entry.get("credential", [])
            ]
            # The leading 0.0 is the floor used when nothing is held or nothing scores above zero.
            top = max([0.0] + similarities)
            per_credential.append(top)
            print(f"Best similarity for required credential '{wanted}': {top}")

    if per_credential:
        mean_score = sum(per_credential) / len(per_credential)
    else:
        mean_score = 0.0
    print(f"Average similarity score for these credentials: {mean_score}\n")
    return mean_score

def calculate_mandatory_credentials_score(job_json, resume_json):
    """
    Scores the resume's credentials against the job's mandatory credentials.

    Returns None when the job lists no mandatory credential, otherwise the value of
    match_credentials for the mandatory list.
    """
    required, _ = extract_job_credentials(job_json)
    held = extract_resume_credentials(resume_json)

    if required:
        result = match_credentials(required, held)
        print(f"Mandatory Credentials Score: {result}\n")
        return result

    print("No mandatory credentials specified.\n")
    return None

def calculate_preferred_credentials_score(job_json, resume_json):
    """
    Scores how well the resume covers the job's preferred credentials.

    Both the job's and the resume's credentials are extracted first (each extraction
    prints what it found). Returns None, after printing a notice, when the job lists
    no preferred credentials; otherwise returns the result of match_credentials.
    """
    _mandatory, nice_to_have = extract_job_credentials(job_json)
    candidate_creds = extract_resume_credentials(resume_json)

    nothing_preferred = not nice_to_have
    if nothing_preferred:
        print("No preferred credentials specified.\n")
        return None

    preferred_score = match_credentials(nice_to_have, candidate_creds)
    print(f"Preferred Credentials Score: {preferred_score}\n")
    return preferred_score

def calculate_overall_credentials_score(job_json, resume_json, mandatory_weight=0.5, preferred_weight=0.5):
    """
    Combines the mandatory and preferred credentials match scores into an overall score,
    applying the provided weights.

    Parameters:
      - job_json: JSON object for the job description.
      - resume_json: JSON object for the candidate's resume.
      - mandatory_weight: Weight for the mandatory credentials score.
      - preferred_weight: Weight for the preferred credentials score.

    Returns:
      - overall_cred_score (float):
          * 1.0 when both sub-scores are None (nothing to match against);
          * the single available sub-score when only one of them is not None;
          * otherwise the weighted average of both, or 0.0 when the two weights
            sum to zero.

    Update 3/7/2025: Added parameters for mandatory and preferred weights.
    """
    mand_score = calculate_mandatory_credentials_score(job_json, resume_json)
    pref_score = calculate_preferred_credentials_score(job_json, resume_json)

    if mand_score is None and pref_score is None:
        overall_cred_score = 1.0
    elif mand_score is not None and pref_score is not None:
        total_weight = mandatory_weight + preferred_weight
        if total_weight != 0:
            overall_cred_score = (mand_score * mandatory_weight + pref_score * preferred_weight) / total_weight
        else:
            # Both weights at zero would otherwise raise ZeroDivisionError; use the
            # same 0.0 fallback calculate_overall_match_score applies to a zero total.
            overall_cred_score = 0.0
    elif mand_score is not None:
        overall_cred_score = mand_score
    else:
        overall_cred_score = pref_score

    print(f"Overall Credentials Match Score: {overall_cred_score}\n")
    return overall_cred_score

# ---------------------------
# END CREDENTIALS MATCHING
# ---------------------------

# ---------------------------
# Example Overall Functions
# ---------------------------
def calculate_overall_skill_match_score(job_json, resume_json, mandatory_weight=0.5, preferred_weight=0.5):
    """
    Calculates the overall skills match score as a weighted average of mandatory and preferred skills match scores.

    Parameters:
      - job_json: JSON object representing the job description.
      - resume_json: JSON object representing the candidate's resume.
      - mandatory_weight: Weight for the mandatory skills score.
      - preferred_weight: Weight for the preferred skills score.

    Returns:
      - overall_skill_score (float):
          * 0.0 when both sub-scores are None;
          * the single available sub-score when only one of them is not None;
          * otherwise the weighted average of both, or 0.0 when the two weights
            sum to zero.
    """
    mand_score = calculate_skill_match_score(job_json, resume_json, skill_type='mandatory')
    pref_score = calculate_skill_match_score(job_json, resume_json, skill_type='preferred')

    if mand_score is None and pref_score is None:
        overall_skill_score = 0.0
    elif mand_score is None:
        overall_skill_score = pref_score
    elif pref_score is None:
        overall_skill_score = mand_score
    else:
        total_weight = mandatory_weight + preferred_weight
        if total_weight != 0:
            overall_skill_score = (mand_score * mandatory_weight + pref_score * preferred_weight) / total_weight
        else:
            # Both weights at zero would otherwise raise ZeroDivisionError; use the
            # same 0.0 fallback calculate_overall_match_score applies to a zero total.
            overall_skill_score = 0.0

    print(f"Overall Skill Match Score (Weighted): {overall_skill_score}")
    return overall_skill_score

def calculate_overall_education_match_score(job_json, resume_json, threshold=0.7, min_years=4, mandatory_weight=0.5, preferred_weight=0.5):
    """
    Calculates the overall education match score with weighted mandatory and preferred scores.

    Parameters:
      - job_json: JSON object of the job description.
      - resume_json: JSON object of the candidate's resume.
      - threshold: Similarity threshold (forwarded to both education scorers).
      - min_years: Minimum years required (forwarded to both education scorers).
      - mandatory_weight: Weight for the mandatory education score.
      - preferred_weight: Weight for the preferred education score.

    Returns:
      - overall_education_score (float):
          * the weighted average when both sub-scores are not None, or 0.0 when the
            two weights sum to zero;
          * the single available sub-score when only one of them is not None;
          * 0.0 when both sub-scores are None.
    """
    print("========== Starting Education Match Score Calculation ==========\n")
    mand_score = calculate_mandatory_education_score(job_json, resume_json, threshold, min_years)
    pref_score = calculate_preferred_education_score(job_json, resume_json, threshold, min_years)
    if mand_score is not None and pref_score is not None:
        total_weight = mandatory_weight + preferred_weight
        if total_weight != 0:
            overall_education_score = (mand_score * mandatory_weight + pref_score * preferred_weight) / total_weight
        else:
            # Both weights at zero would otherwise raise ZeroDivisionError; use the
            # same 0.0 fallback calculate_overall_match_score applies to a zero total.
            overall_education_score = 0.0
    elif mand_score is not None:
        overall_education_score = mand_score
    elif pref_score is not None:
        overall_education_score = pref_score
    else:
        # Float rather than int 0 so the return type matches the documented float.
        overall_education_score = 0.0
    print(f"\n========== Overall Education Match Score: {overall_education_score} ==========\n")
    return overall_education_score

def calculate_overall_responsibilities_match_score(job_json, resume_json):
    """
    Scores how well the candidate's extracted skills cover the job's responsibilities.

    For each job responsibility, the candidate entry with the highest similarity
    (as computed by compute_required_skill_similarity) is kept; the function returns
    the mean of those best scores. Inputs and every intermediate result are printed.

    Returns:
      - None when the job description yields no responsibilities.
      - Otherwise the average of the per-responsibility best scores.
    """
    def _dump(payload):
        # Pretty-print a JSON-like object, leaving non-ASCII characters unescaped.
        print(json.dumps(payload, indent=4, ensure_ascii=False))

    required_items = extract_job_responsibilities_hard_skills(job_json)
    print("========== Job Responsibilities ==========")
    _dump(required_items)

    candidate_items = extract_resume_skills(resume_json)
    print("========== Candidate Responsibilities ==========")
    _dump(candidate_items)

    if not required_items:
        print("=> No responsibilities specified in job description.\n")
        return None

    best_scores = []
    for required_item in required_items:
        wanted = required_item.get("skill", [])
        print("\n--- Checking Job Responsibility ---")
        print("Required Responsibility:")
        _dump(wanted)

        # Track the strongest candidate entry for this responsibility; the score
        # starts at 0.0, so only strictly positive similarities can win.
        top_score, top_match = 0.0, None
        for candidate_item in candidate_items:
            offered = candidate_item.get("skill", [])
            score = compute_required_skill_similarity(offered, wanted)
            print(f"Candidate Responsibility: {offered} -> Similarity: {score}")
            if score > top_score:
                top_score, top_match = score, offered
        print(f"=> Best match for responsibility: {top_match} with score: {top_score}\n")
        best_scores.append(top_score)

    if best_scores:
        overall_resp_score = sum(best_scores) / len(best_scores)
    else:
        overall_resp_score = 0.0
    print(f"\n========== Overall Responsibilities Match Score: {overall_resp_score} ==========\n")
    return overall_resp_score


def calculate_overall_match_score(job_json, resume_json,
                                  skill_weight=0.25,
                                  education_weight=0.25,
                                  responsibilities_weight=0.25,
                                  credentials_weight=0.25,
                                  # Optional parameters for sub-section weighting:
                                  skill_mandatory_weight=0.5,
                                  skill_preferred_weight=0.5,
                                  education_threshold=0.7,
                                  education_min_years=4,
                                  education_mandatory_weight=0.5,
                                  education_preferred_weight=0.5,
                                  credentials_mandatory_weight=0.5,
                                  credentials_preferred_weight=0.5):
    """
    Produces a single job/resume match score from four section scores:
      - Skills (mandatory and preferred)
      - Education (mandatory and preferred)
      - Responsibilities
      - Credentials (mandatory and preferred)

    Parameters:
      - job_json: JSON object representing the job description.
      - resume_json: JSON object representing the candidate's resume.
      - skill_weight, education_weight, responsibilities_weight, credentials_weight:
        Relative weight of each section in the final average.
      - All other keyword parameters are forwarded unchanged to the skills,
        education, and credentials section scorers.

    Returns:
      - overall_match_score (float): Weighted average over the sections whose score
        is not None, or 0.0 when the weights of the included sections do not sum to
        a positive number.

    Note: a section only drops out of the average when its scorer returns None; a
    numeric fallback returned by a section scorer is averaged in like any other score.
    """
    # Section scorers run in a fixed order: skills, education, responsibilities, credentials.
    skill_score = calculate_overall_skill_match_score(
        job_json, resume_json,
        mandatory_weight=skill_mandatory_weight,
        preferred_weight=skill_preferred_weight,
    )
    education_score = calculate_overall_education_match_score(
        job_json, resume_json,
        threshold=education_threshold,
        min_years=education_min_years,
        mandatory_weight=education_mandatory_weight,
        preferred_weight=education_preferred_weight,
    )
    responsibilities_score = calculate_overall_responsibilities_match_score(job_json, resume_json)
    credentials_score = calculate_overall_credentials_score(
        job_json, resume_json,
        mandatory_weight=credentials_mandatory_weight,
        preferred_weight=credentials_preferred_weight,
    )

    # Pair every section score with its weight, then accumulate only the usable ones.
    sections = (
        (skill_score, skill_weight),
        (education_score, education_weight),
        (responsibilities_score, responsibilities_weight),
        (credentials_score, credentials_weight),
    )
    weighted_sum = 0.0
    total_weight = 0.0
    for section_score, section_weight in sections:
        if section_score is None:
            continue
        weighted_sum += section_score * section_weight
        total_weight += section_weight

    if total_weight > 0:
        overall_match_score = weighted_sum / total_weight
    else:
        overall_match_score = 0.0
    print(f"\nOverall Match Score: {overall_match_score}")
    return overall_match_score


In [17]:
%%capture
# Section-by-section scores, each stored in its own notebook-level variable.

# Skills: mandatory requirements weighted above preferred ones (0.6 / 0.4).
overall_skill_score = calculate_overall_skill_match_score(
    job_desc_json[0],
    resume_json[1],
    mandatory_weight=0.6,
    preferred_weight=0.4,
)

# Education: mandatory and preferred weighted equally.
overall_education_score = calculate_overall_education_match_score(
    job_desc_json[0],
    resume_json[1],
    threshold=0.7,
    min_years=4,
    mandatory_weight=0.5,
    preferred_weight=0.5,
)

# Responsibilities.
# NOTE(review): this call scores job_desc_json[1] while the other three calls in
# this cell use job_desc_json[0] — confirm the mixed job index is intentional.
overall_resp_score = calculate_overall_responsibilities_match_score(
    job_desc_json[1],
    resume_json[1],
)

# Credentials: mandatory and preferred weighted equally.
overall_cred_score = calculate_overall_credentials_score(
    job_desc_json[0],
    resume_json[1],
    mandatory_weight=0.5,
    preferred_weight=0.5,
)

In [18]:
# Report the section scores assigned in the preceding cell. The names printed here
# previously (credentials_score, responsibilities_score, education_match_score,
# skills_score) are not assigned at notebook level by any visible cell, so the cell
# only worked with leftover kernel state and fails on Restart & Run All.
print("Overall Credentials Match Score:", overall_cred_score)
print("Overall Responsibilities Match Score:", overall_resp_score)
print("Overall Education Match Score:", overall_education_score)
print("Overall Skills Score:", overall_skill_score)

Overall Credentials Match Score: 0.6082093715667725
Overall Responsibilities Match Score: 0.5035935250432257
Overall Education Match Score: 0
Overall Skills Score: 0.4931202679872513


In [21]:
%%capture
# Combined score for one job/resume pair, using the default section weights.
overall_score = calculate_overall_match_score(job_desc_json[1], resume_json[1])

In [22]:
# Display the combined score stored in overall_score by the preceding cell.
print("Overall Match Score:", overall_score)

Overall Match Score: 0.33932111505419016
