In [None]:
%%capture
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util
import json
import spacy
from functools import lru_cache
import torch
from utils.semantic_similarity import nlp_similarity_cached
from utils.semantic_similarity import sentence_model

# Working directory of the running process; the data file paths below are built from it.
current_wd = os.getcwd()

def safe_json_loads(val):
    """
    Parse a JSON document, returning None instead of raising on bad input.

    Parameters:
      val: The raw value to decode, normally a JSON string.

    Returns:
      The decoded Python object, or None when `val` is not valid JSON or is not a
      string-like value at all (for example None, or the float NaN that an empty
      spreadsheet cell is read as).
    """
    try:
        return json.loads(val)
    except (json.JSONDecodeError, TypeError):
        # json.loads raises TypeError (not JSONDecodeError) for non-string input
        # such as None or NaN; treat it the same way as malformed JSON.
        print(f"Error parsing JSON: {val}")
        return None

# Load the job-description and resume extraction workbooks from ./data under the working directory.
job_desc_data = pd.read_excel(os.path.join(current_wd, "data", "sample_des_extractions_test_final_3.18.25.xlsx"))
resume_data = pd.read_excel(os.path.join(current_wd, "data", "sample_res_extractions_final_3.18.25.xlsx"))

# Decode the JSON text in each sheet's "extracted" column; cells that fail to parse become None.
job_desc_data["job_desc_parsed"] = job_desc_data["extracted"].apply(safe_json_loads)
resume_data["resume_parsed"] = resume_data["extracted"].apply(safe_json_loads)

# Shorthand references to the parsed columns (the same Series objects, not copies).
job_desc_json = job_desc_data['job_desc_parsed']
resume_json = resume_data['resume_parsed']

def append_job_id_to_job_desc(row):
    """
    Attach a row's job id to its parsed job-description JSON.

    Parameters:
      row: A mapping-like row (e.g. one DataFrame row) exposing 'id' and
           'job_desc_parsed'. The parsed value may be a dict, a JSON string that
           still needs decoding, or None.

    Returns:
      dict or None: The job-description dict with a 'job_id' key added, or None when
      there is no description or it does not decode to a JSON object.

    Note:
      A dict found in 'job_desc_parsed' is updated in place, so the object stored
      in the row gains the 'job_id' key as well.
    """
    job_id = row['id']
    job_desc = row['job_desc_parsed']

    # A string here is JSON text that has not been decoded yet.
    if isinstance(job_desc, str):
        job_desc = safe_json_loads(job_desc)

    # Valid JSON is not necessarily an object: a list, number or null cannot carry
    # a 'job_id' key, so handle those like a missing description instead of crashing.
    if not isinstance(job_desc, dict):
        return None

    job_desc['job_id'] = job_id
    return job_desc

# Row-wise pass: store each parsed job description, tagged with its row's id, in a new column.
job_desc_data['job_desc_json'] = job_desc_data.apply(append_job_id_to_job_desc, axis=1)

In [4]:
def extract_all_strings_from_json(j):
    """
    Recursively extracts all non-empty strings from a JSON-like object (dict or list).

    Strings are collected wherever they occur as values: directly under a dict key,
    inside a list, or at any nesting depth. Dict keys themselves are not collected.

    Parameters:
      j: A JSON-like value (dict, list, str, number, bool or None).

    Returns:
      list: The whitespace-stripped, non-empty strings in traversal order
            (duplicates are kept). Non-string scalars contribute nothing.
    """
    # Handling the string case here (rather than only inside the list branch)
    # is what lets a string stored directly as a dict value be picked up.
    if isinstance(j, str):
        s = j.strip()
        return [s] if s else []

    result = []
    if isinstance(j, dict):
        for v in j.values():
            result.extend(extract_all_strings_from_json(v))
    elif isinstance(j, list):
        for item in j:
            result.extend(extract_all_strings_from_json(item))
    return result

# Precompute Embeddings from a list of JSONS
def precompute_embeddings_for_json_list(json_list, batch_size=64):
    """
    Build a string -> embedding lookup covering a collection of JSON objects.

    Every JSON object is scanned with extract_all_strings_from_json, the strings
    are de-duplicated across the whole collection, and each distinct string is
    encoded once with the shared sentence model.

    Parameters:
      json_list (iterable): JSON objects (dicts), e.g. a list or Series. None
          entries are skipped.
      batch_size (int): Batch size passed to the encoder.

    Returns:
      dict: Maps each unique string to its embedding; empty when no strings
            were found.
    """
    unique_strings = set()
    for doc in json_list:
        if doc is not None:
            unique_strings.update(extract_all_strings_from_json(doc))

    ordered_strings = list(unique_strings)
    if not ordered_strings:
        # Nothing to encode: an empty (0, dim) array makes the pairing below yield no items.
        vectors = np.zeros((0, sentence_model.get_sentence_embedding_dimension()))
    else:
        vectors = sentence_model.encode(ordered_strings, batch_size=batch_size, convert_to_tensor=False)

    # Strings and vectors are index-aligned, so pairing them positionally is safe.
    return dict(zip(ordered_strings, vectors))

# Now call the function with your list of job description JSON objects:
# the result maps each unique extracted string to its embedding.
embeddings = precompute_embeddings_for_json_list(job_desc_json, batch_size=64)

In [14]:
import json

# ---------------------------
# Helper: Safe Average
# ---------------------------
def safe_average(values):
    """
    Arithmetic mean of the entries of `values` that are not None.

    Returns None when `values` is empty or holds nothing but None.
    """
    total = 0
    count = 0
    for value in values:
        if value is None:
            continue
        total = total + value
        count += 1
    if count == 0:
        return None
    return total / count

# ---------------------------
# SECTION 1: SKILL MATCHING
# ---------------------------
def extract_job_mandatory_skills(job_json):
    """
    Return the hard-skill requirements listed under the job's "mandatory" section.

    A missing key and an explicit JSON null are treated alike at both levels:
    either yields an empty list, so callers can test or iterate the result directly.
    """
    mandatory_section = job_json.get("mandatory") or {}
    return mandatory_section.get("hard_skills") or []

def extract_job_preferred_skills(job_json):
    """
    Return the hard-skill requirements listed under the job's "preferred" section.

    A missing key and an explicit JSON null are treated alike at both levels:
    either yields an empty list, so callers can test or iterate the result directly.
    """
    preferred_section = job_json.get("preferred") or {}
    return preferred_section.get("hard_skills") or []

def extract_resume_skills(resume_json):
    """Return the resume's "skills" entry, or an empty list when the key is absent."""
    if "skills" in resume_json:
        return resume_json["skills"]
    return []

def _as_skill_groups(skill):
    """Normalize a skill spec (one term, a list of terms, or a list of term lists) into a list of term groups."""
    if isinstance(skill, str):
        return [[skill]]
    if isinstance(skill, list):
        # A list whose first element is itself a list is taken to be grouped already.
        if skill and isinstance(skill[0], list):
            return skill
        return [skill]
    return []


def compute_required_skill_similarity(candidate_skill, job_required_skill):
    """
    Best similarity between a candidate skill and a job skill requirement.

    Both arguments may be a single term (str), a list of terms, or a list of term
    groups (list of lists). For every (requirement group, candidate group) pair,
    each candidate term is scored as the average of its similarity to all terms of
    the requirement group; a candidate group scores as its best term; the result
    is the best score over all pairs.

    Parameters:
      candidate_skill: The candidate's skill term(s).
      job_required_skill: The required skill term(s). A plain string is accepted
          and treated as a one-term requirement, mirroring the candidate side.

    Returns:
      None when either side yields no groups (e.g. an empty requirement);
      1.0 as soon as any candidate group scores exactly 1.0; otherwise the best
      score found, which is never below 0.0 because the running maximum starts there.
    """
    candidate_groups = _as_skill_groups(candidate_skill)
    # An empty requirement ([] / "" / None) has nothing to match against.
    job_required_groups = _as_skill_groups(job_required_skill) if job_required_skill else []

    if not candidate_groups or not job_required_groups:
        return None  # Missing requirements

    best_overall = 0.0
    for req_group in job_required_groups:
        print(f"\nProcessing Job Requirement Group: {req_group}")
        best_for_req = 0.0
        for cand_group in candidate_groups:
            candidate_term_avgs = []
            print(f"  Evaluating Candidate Group: {cand_group}")
            for cand_term in cand_group:
                sims = []
                for req_term in req_group:
                    sim = nlp_similarity_cached(cand_term, req_term)
                    print(f"    '{cand_term}' vs. '{req_term}': similarity = {sim}")
                    sims.append(sim)
                if sims:
                    avg_sim = sum(sims) / len(sims)
                    print(f"    => Average similarity for '{cand_term}': {avg_sim}")
                    candidate_term_avgs.append(avg_sim)
            if candidate_term_avgs:
                group_avg = max(candidate_term_avgs)
                print(f"  => Best average for group {cand_group}: {group_avg}")
                # A perfect match cannot be beaten, so stop searching.
                if group_avg == 1.0:
                    return 1.0
                if group_avg > best_for_req:
                    best_for_req = group_avg
        if best_for_req > best_overall:
            best_overall = best_for_req
    print(f"\n   --> Best overall similarity: {best_overall}")
    return best_overall

def calculate_skill_match_score(job_json, resume_json, skill_type='mandatory'):
    """
    Average, over the job's skill requirements, of the best-matching resume skill.

    Parameters:
      job_json (dict): Parsed job description.
      resume_json (dict): Parsed resume.
      skill_type (str): 'mandatory' selects the mandatory hard skills; any other
          value selects the preferred hard skills.

    Returns:
      None when the job lists no requirements of that type; otherwise the mean of
      the per-requirement best similarities. A requirement with no resume skill
      meeting its minimum years scores 0.0.
    """
    if skill_type == 'mandatory':
        job_skills = extract_job_mandatory_skills(job_json)
    else:
        job_skills = extract_job_preferred_skills(job_json)

    if not job_skills:
        print(f"=> No {skill_type} skill requirements specified.")
        return None

    resume_skills = extract_resume_skills(resume_json) or []
    requirement_scores = []
    for req in job_skills:
        job_required_skill = req.get("skill", [])
        # "minyears" is normally a one-element list, but tolerate a bare number,
        # an empty list or null rather than crashing on the [0] lookup.
        min_years_field = req.get("minyears")
        if isinstance(min_years_field, (list, tuple)):
            min_years_required = min_years_field[0] if min_years_field else 0
        else:
            min_years_required = min_years_field
        if min_years_required is None:
            min_years_required = 0
        print("----------------------------------------------------")
        print("Processing Job Skill Requirement:")
        print(f"  Requirement: {job_required_skill}")
        print(f"  Minimum Years Required: {min_years_required}")
        best_match = 0.0
        for candidate in resume_skills:
            # A null "years" counts as zero years of experience.
            candidate_years = candidate.get("years")
            if candidate_years is None:
                candidate_years = 0
            if candidate_years >= min_years_required:
                candidate_skills = candidate.get("skill") or []
                # A bare string is one skill, not a sequence of characters.
                if isinstance(candidate_skills, str):
                    candidate_skills = [candidate_skills]
                for candidate_skill in candidate_skills:
                    sim = compute_required_skill_similarity(candidate_skill, job_required_skill)
                    if sim is not None and sim > best_match:
                        best_match = sim
        print(f"  Best match for requirement: {best_match}")
        requirement_scores.append(best_match)
    overall_skill = safe_average(requirement_scores)
    print(f"Overall Skill Match Score for {skill_type}: {overall_skill}")
    return overall_skill

def calculate_overall_skill_match_score(job_json, resume_json, mandatory_weight=0.5, preferred_weight=0.5):
    """
    Combine the mandatory and preferred skill scores into one weighted score.

    Parameters:
      job_json (dict): Parsed job description.
      resume_json (dict): Parsed resume.
      mandatory_weight (float): Weight of the mandatory score.
      preferred_weight (float): Weight of the preferred score.

    Returns:
      dict with "job_id", "overall_skill_score", "mandatory_skill_score" and
      "preferred_skill_score". A part that is None (no such requirements) is left
      out of the overall score together with its weight; the overall score is
      None when both parts are None. With the default equal weights this is the
      plain average of the available parts.
    """
    mand_score = calculate_skill_match_score(job_json, resume_json, skill_type='mandatory')
    pref_score = calculate_skill_match_score(job_json, resume_json, skill_type='preferred')

    weighted_parts = [
        (score, weight)
        for score, weight in ((mand_score, mandatory_weight), (pref_score, preferred_weight))
        if score is not None
    ]
    total_weight = sum(weight for _, weight in weighted_parts)
    if not weighted_parts:
        overall_skill = None
    elif total_weight == 0:
        # Degenerate weights: fall back to a plain mean instead of dividing by zero.
        overall_skill = sum(score for score, _ in weighted_parts) / len(weighted_parts)
    else:
        overall_skill = sum(score * weight for score, weight in weighted_parts) / total_weight

    # Prefer "id"; fall back to "job_id", the key append_job_id_to_job_desc writes.
    job_id = job_json.get("id")
    if job_id is None:
        job_id = job_json.get("job_id")
    return {
        "job_id": job_id,
        "overall_skill_score": overall_skill,
        "mandatory_skill_score": mand_score,
        "preferred_skill_score": pref_score
    }

def calculate_mandatory_skill_score(job_json, resume_json):
    """
    Score the resume against the job's mandatory skills only.

    Returns:
      dict with "job_id" and "mandatory_skill_score" (None when the job lists no
      mandatory skills).
    """
    mand_score = calculate_skill_match_score(job_json, resume_json, skill_type='mandatory')
    # Prefer "id"; fall back to "job_id", the key append_job_id_to_job_desc writes.
    job_id = job_json.get("id")
    if job_id is None:
        job_id = job_json.get("job_id")
    return {"job_id": job_id, "mandatory_skill_score": mand_score}

def calculate_preferred_skill_score(job_json, resume_json):
    """
    Score the resume against the job's preferred skills only.

    Returns:
      dict with "job_id" and "preferred_skill_score" (None when the job lists no
      preferred skills).
    """
    pref_score = calculate_skill_match_score(job_json, resume_json, skill_type='preferred')
    # Prefer "id"; fall back to "job_id", the key append_job_id_to_job_desc writes.
    job_id = job_json.get("id")
    if job_id is None:
        job_id = job_json.get("job_id")
    return {"job_id": job_id, "preferred_skill_score": pref_score}

# ---------------------------
# SECTION 2: RESPONSIBILITIES MATCHING
# ---------------------------
def extract_job_responsibilities_hard_skills(job_json):
    """
    Return the hard skills listed under the job's "responsibility" section.

    A missing key and an explicit JSON null are treated alike at both levels:
    either yields an empty list.
    """
    responsibility_section = job_json.get("responsibility") or {}
    return responsibility_section.get("hard_skills") or []

def calculate_responsibilities_match_score(job_json, resume_json):
    """
    Average, over the job's responsibility skills, of the best-matching resume skill entry.

    Each resume skill entry's whole "skill" value is compared against each
    responsibility's "skill" value; years of experience are not considered.

    Returns:
      None when the job lists no responsibilities; otherwise the mean of the
      per-responsibility best similarities (0.0 where nothing matched).
    """
    required_items = extract_job_responsibilities_hard_skills(job_json)
    if not required_items:
        print("=> No responsibilities specified in job description.")
        return None

    resume_items = extract_resume_skills(resume_json)
    per_responsibility = []
    for required in required_items:
        wanted = required.get("skill", [])
        top = 0.0
        for entry in resume_items:
            score = compute_required_skill_similarity(entry.get("skill", []), wanted)
            if score is not None and score > top:
                top = score
        per_responsibility.append(top)
    return safe_average(per_responsibility)

def calculate_overall_responsibilities_match_score(job_json, resume_json):
    """
    Wrap the responsibilities match score in a result record.

    Returns:
      dict with "job_id" and "overall_responsibilities_score" (None when the job
      lists no responsibilities).
    """
    overall_resp = calculate_responsibilities_match_score(job_json, resume_json)
    # Prefer "id"; fall back to "job_id", the key append_job_id_to_job_desc writes.
    job_id = job_json.get("id")
    if job_id is None:
        job_id = job_json.get("job_id")
    return {"job_id": job_id, "overall_responsibilities_score": overall_resp}

# ---------------------------
# SECTION 3: EDUCATION MATCHING
# ---------------------------
# Ordinal rank of each recognized education level (higher = more advanced).
# Levels listed in the same tuple share a rank. "Bachelor's" and "Master's" are
# listed with both the typographic (’) and the plain (') apostrophe.
EDU_RANK = {
    level: rank
    for rank, levels in enumerate(
        (
            ("High School Diploma", "Vocational"),
            ("Associate's",),
            ("Current Bachelor's Student", "Some Bachelor's"),
            ("Bachelor’s", "Bachelor's"),
            ("Some Master's", "Current Master's Student"),
            ("Master’s", "Master's"),
            ("PhD",),
            ("Postdoctoral",),
        ),
        start=1,
    )
    for level in levels
}

def extract_job_education_requirements(job_json):
    """
    Return the job's education requirements as (mandatory, preferred).

    Each element is the "education" list of the corresponding section. A missing
    key and an explicit JSON null are treated alike at both levels and yield an
    empty list.
    """
    mandatory_edu = (job_json.get("mandatory") or {}).get("education") or []
    preferred_edu = (job_json.get("preferred") or {}).get("education") or []
    return mandatory_edu, preferred_edu

def extract_resume_education(resume_json):
    """Return the resume's "education" entry, or an empty list when the key is absent."""
    if "education" in resume_json:
        return resume_json["education"]
    return []

def candidate_has_education_level(resume_education, required_rank):
    """
    Tell whether any resume education entry reaches the required rank.

    An entry's "education_level" is looked up in EDU_RANK; a missing or
    unrecognized level counts as rank 0.

    Returns:
      bool: True if at least one entry has rank >= required_rank, else False.
    """
    return any(
        EDU_RANK.get(entry.get("education_level", ""), 0) >= required_rank
        for entry in resume_education
    )

def get_required_field_score(resume_education, resume_experience, required_fields, must_have_formal, required_rank, threshold=0.6, min_years=4, ignore_threshold=False):
    """
    Score how well the candidate's majors match the required fields of study.

    Only education entries whose level rank (via EDU_RANK) is at least
    `required_rank` are considered. Every (major, required field) similarity is
    computed; with `ignore_threshold` all of them are kept, otherwise one is kept
    only if it reaches 0.95 (or `threshold` when the required field is the
    literal "related", case-insensitive).

    Parameters:
      resume_education (list[dict]): Entries with "education_level" and "major".
      resume_experience (list[dict]): Passed on to the experience fallback.
      required_fields (list[str]): Required fields of study.
      must_have_formal: Not used by this function.
      required_rank (int): Minimum education rank for an entry to count.
      threshold (float): Cut-off for "related" and for the experience fallback.
      min_years (number): Minimum years for the experience fallback.
      ignore_threshold (bool): Keep every similarity regardless of cut-off.

    Returns:
      The mean of the kept similarities, or, when none were kept, the result of
      get_equivalent_experience_score for the same fields.
    """
    print(f"\n== Formal Education Matching for Required Fields: {required_fields} ==")
    kept = []
    for entry in resume_education:
        if EDU_RANK.get(entry.get("education_level", ""), 0) < required_rank:
            continue
        for major in entry.get("major", []):
            for wanted in required_fields:
                score = nlp_similarity_cached(major, wanted)
                print(f"Comparing '{major}' with '{wanted}': {score}")
                if ignore_threshold:
                    kept.append(score)
                    continue
                cutoff = threshold if wanted.lower() == "related" else 0.95
                if score >= cutoff:
                    kept.append(score)

    if not kept:
        print("=> No formal education match found; using experience fallback...\n")
        return get_equivalent_experience_score(resume_experience, required_fields, threshold=threshold, min_years=min_years)

    mean_score = sum(kept) / len(kept)
    print(f"=> Average Similarity Score from Formal Education: {mean_score}\n")
    return mean_score

def get_equivalent_experience_score(resume_experience, field_of_study_list, threshold=0.6, min_years=4):
    """
    Years-weighted similarity between work-experience fields and required fields of study.

    For every experience entry the best similarity between any of its
    "field_of_study" terms and any required field is taken (required fields equal
    to "related", case-insensitive, are skipped). Entries whose best similarity
    reaches `threshold` contribute their "years", weighted by that similarity.

    Parameters:
      resume_experience (list[dict]): Entries with "field_of_study", "background"
          (used only for log output) and "years".
      field_of_study_list (list[str]): Required fields of study.
      threshold (float): Minimum best similarity for an entry to count.
      min_years (number): Minimum total counted years.

    Returns:
      sum(similarity * years) / sum(years) over counted entries, or 0.0 when the
      counted years are zero or fall short of `min_years`.
    """
    print("\n== Experience Matching (Weighted Score Calculation) ==")
    counted_years = 0.0
    similarity_years = 0.0
    for position in resume_experience:
        fields = position.get("field_of_study", [])
        titles = position.get("background", [])
        title_label = ", ".join(titles) if titles else "Unknown"
        years = position.get("years", 0)

        best = 0.0
        for field in fields:
            for wanted in field_of_study_list:
                if wanted.lower() == "related":
                    continue
                score = nlp_similarity_cached(field, wanted)
                print(f"Job: '{title_label}' | '{field}' vs. '{wanted}': {score}")
                if score > best:
                    best = score

        if best >= threshold:
            contribution = best * years
            similarity_years += contribution
            counted_years += years
            print(f"=> Using max similarity {best} for '{title_label}' with {years} years (Contribution: {contribution})")

    if counted_years < min_years or counted_years <= 0:
        print(f"=> Total Relevant Experience: {counted_years} years (Required: {min_years}) -- Not enough experience.\n")
        return 0.0

    weighted_mean = similarity_years / counted_years
    print(f"=> Total Relevant Experience: {counted_years} years (Required: {min_years})")
    print(f"=> Weighted Average Experience Score: {weighted_mean}\n")
    return weighted_mean

def meets_education_requirement(requirement, resume_education, resume_experience, threshold=0.7, min_years=4, allow_fallback=False, job_json=None, resume_json=None):
    """
    Score how well a candidate satisfies one education requirement of a job.

    Parameters:
      requirement (dict): One requirement with optional "field_of_study" and
          "education_level" lists.
      resume_education (list[dict]): The candidate's education entries.
      resume_experience (list[dict]): The candidate's professional background,
          used when experience may stand in for formal education.
      threshold (float): Similarity cut-off passed to the field/experience scorers.
      min_years (number): Minimum years passed to the experience scorer.
      allow_fallback (bool): When True, experience is accepted in place of formal
          education regardless of the requirement's wording.
      job_json, resume_json: Accepted but not used in this function.

    Returns:
      The requirement's score as produced by safe_average over the single
      collected score.
    """
    print("\n========== Checking Single Education Requirement ==========")
    print("Job Requirement:")
    print(json.dumps(requirement, indent=4))
    
    req_fields = requirement.get("field_of_study", [])
    req_levels = requirement.get("education_level", [])
    
    # Formal education is required unless a level mentions "or experience"
    # (case-insensitive) or the caller explicitly allows the fallback.
    must_have_formal = True
    for lvl in req_levels:
        if "or experience" in lvl.lower():
            must_have_formal = False
            print("=> Job accepts equivalent experience in lieu of formal education.\n")
            break
    if allow_fallback:
        must_have_formal = False
    
    # The strictest (highest-ranked) listed level sets the bar. Levels that are not
    # exact EDU_RANK keys count as rank 0.
    # NOTE(review): a level string that itself contains "or experience" is unlikely to
    # be an exact EDU_RANK key and would therefore rank 0 -- confirm the data format.
    max_required_rank = 0
    for lvl in req_levels:
        lvl_rank = EDU_RANK.get(lvl, 0)
        if lvl_rank > max_required_rank:
            max_required_rank = lvl_rank
            print(f"=> Updated Required Education Rank to: {max_required_rank} based on level '{lvl}'\n")
    
    level_scores = []
    if req_fields:
        print(f"=> Required Field(s) of Study: {json.dumps(req_fields, indent=4)}\n")
        if must_have_formal:
            # Strict mode: major/field similarities must clear the scorer's cut-offs.
            formal_score = get_required_field_score(resume_education, resume_experience, req_fields, must_have_formal, max_required_rank, threshold, min_years, ignore_threshold=False)
            print(f"=> Formal Education Score: {formal_score}\n")
            level_scores.append(formal_score)
        else:
            # Lenient mode: score formal education without cut-offs and experience separately.
            formal_score = get_required_field_score(resume_education, resume_experience, req_fields, must_have_formal, max_required_rank, threshold, min_years, ignore_threshold=True)
            exp_score = get_equivalent_experience_score(resume_experience, req_fields, threshold, min_years)
            print(f"=> Formal Education Score: {formal_score}")
            print(f"=> Experience Score: {exp_score}\n")
            # Average the two when both are positive; otherwise take formal_score if it
            # is truthy, else exp_score.
            combined_score = (formal_score + exp_score) / 2 if (formal_score > 0 and exp_score > 0) else (formal_score or exp_score)
            print(f"=> Combined Score: {combined_score}\n")
            level_scores.append(combined_score)
    else:
        # No field of study requested: only the education level (or experience) matters.
        if must_have_formal:
            level_scores.append(1.0 if candidate_has_education_level(resume_education, max_required_rank) else 0.0)
        else:
            # Without the required level, fall back to experience scored against the
            # placeholder field "Any".
            level_scores.append(1.0 if candidate_has_education_level(resume_education, max_required_rank)
                                else get_equivalent_experience_score(resume_experience, ["Any"], threshold, min_years))
    
    overall_req_score = safe_average(level_scores) if level_scores else 0.0
    print(f"=> Final Composite Education Score for Requirement: {overall_req_score}\n")
    return overall_req_score

def calculate_mandatory_education_score(job_json, resume_json, threshold=0.7, min_years=4):
    """
    Score the resume against every mandatory education requirement of the job.

    Parameters:
      job_json (dict): Parsed job description.
      resume_json (dict): Parsed resume.
      threshold (float): Similarity cut-off forwarded to the requirement check.
      min_years (number): Minimum years forwarded to the requirement check.

    Returns:
      None when the job has no mandatory education requirements; 0.0 as soon as
      one requirement scores 0 (remaining requirements are not evaluated);
      otherwise the mean of the per-requirement scores.
    """
    requirements, _preferred = extract_job_education_requirements(job_json)
    education = extract_resume_education(resume_json)
    experience = extract_professional_background(resume_json)
    if not requirements:
        print("=> No mandatory education requirements specified.\n")
        return None

    scores = []
    for requirement in requirements:
        print("\n--- Checking Mandatory Education Requirement ---")
        requirement_score = meets_education_requirement(
            requirement,
            education,
            experience,
            threshold,
            min_years,
            job_json=job_json,
            resume_json=resume_json,
        )
        # A single unmet mandatory requirement fails the whole category.
        if requirement_score == 0:
            print("!!! Mandatory education requirement NOT met. Returning 0.0 !!!\n")
            return 0.0
        print(f"=> Mandatory Education Score for requirement: {requirement_score}\n")
        scores.append(requirement_score)
    return safe_average(scores)

def calculate_preferred_education_score(job_json, resume_json, threshold=0.7, min_years=4):
    """
    Score the resume against every preferred education requirement of the job.

    Unlike the mandatory variant, each requirement is checked with
    allow_fallback=True and a zero score does not end the evaluation early.

    Parameters:
      job_json (dict): Parsed job description.
      resume_json (dict): Parsed resume.
      threshold (float): Similarity cut-off forwarded to the requirement check.
      min_years (number): Minimum years forwarded to the requirement check.

    Returns:
      None when the job has no preferred education requirements; otherwise the
      mean of the per-requirement scores.
    """
    _mandatory, requirements = extract_job_education_requirements(job_json)
    education = extract_resume_education(resume_json)
    experience = extract_professional_background(resume_json)
    if not requirements:
        print("=> No preferred education requirements specified.\n")
        return None

    scores = []
    for requirement in requirements:
        print("\n--- Checking Preferred Education Requirement ---")
        print("Requirement:")
        print(json.dumps(requirement, indent=4))
        requirement_score = meets_education_requirement(
            requirement,
            education,
            experience,
            threshold,
            min_years,
            allow_fallback=True,
            job_json=job_json,
            resume_json=resume_json,
        )
        print(f"=> Preferred Education Score for requirement: {requirement_score}\n")
        scores.append(requirement_score)
    return safe_average(scores)

def calculate_overall_education_match_score(job_json, resume_json, threshold=0.7, min_years=4, mandatory_weight=0.5, preferred_weight=0.5):
    """
    Combine the mandatory and preferred education scores into one weighted score.

    Parameters:
      job_json (dict): Parsed job description.
      resume_json (dict): Parsed resume.
      threshold (float): Similarity cut-off forwarded to both sub-scores.
      min_years (number): Minimum years forwarded to both sub-scores.
      mandatory_weight (float): Weight of the mandatory score.
      preferred_weight (float): Weight of the preferred score.

    Returns:
      dict with "job_id", "overall_education_score", "mandatory_education_score"
      and "preferred_education_score". A part that is None is left out of the
      overall score together with its weight; the overall score is None when both
      parts are None. With the default equal weights this is the plain average of
      the available parts.
    """
    print("========== Starting Education Match Score Calculation ==========\n")
    mand_score = calculate_mandatory_education_score(job_json, resume_json, threshold, min_years)
    pref_score = calculate_preferred_education_score(job_json, resume_json, threshold, min_years)

    weighted_parts = [
        (score, weight)
        for score, weight in ((mand_score, mandatory_weight), (pref_score, preferred_weight))
        if score is not None
    ]
    total_weight = sum(weight for _, weight in weighted_parts)
    if not weighted_parts:
        overall_edu = None
    elif total_weight == 0:
        # Degenerate weights: fall back to a plain mean instead of dividing by zero.
        overall_edu = sum(score for score, _ in weighted_parts) / len(weighted_parts)
    else:
        overall_edu = sum(score * weight for score, weight in weighted_parts) / total_weight

    print(f"\n========== Overall Education Match Score: {overall_edu} ==========\n")
    # Prefer "id"; fall back to "job_id", the key append_job_id_to_job_desc writes.
    job_id = job_json.get("id")
    if job_id is None:
        job_id = job_json.get("job_id")
    return {
        "job_id": job_id,
        "overall_education_score": overall_edu,
        "mandatory_education_score": mand_score,
        "preferred_education_score": pref_score
    }

# ---------------------------
# SECTION 4: CREDENTIALS MATCHING
# ---------------------------
def extract_job_credentials(job_json):
    """
    Return the job's credential requirements as (mandatory, preferred) and log them.

    Each element is the "credentials" list of the corresponding section. A missing
    key and an explicit JSON null are treated alike at both levels and yield an
    empty list.
    """
    mandatory_creds = (job_json.get("mandatory") or {}).get("credentials") or []
    preferred_creds = (job_json.get("preferred") or {}).get("credentials") or []
    print("Extracted Job Credentials:")
    print(f"  Mandatory: {mandatory_creds}")
    print(f"  Preferred: {preferred_creds}\n")
    return mandatory_creds, preferred_creds

def extract_resume_credentials(resume_json):
    """Return the resume's "credentials" entry (empty list when absent) and log it."""
    found = resume_json.get("credentials", [])
    # One call, two lines of output: the heading, then the indented value and a blank line.
    print("Extracted Resume Credentials:", f"  {found}\n", sep="\n")
    return found

def match_credentials(required_creds, resume_creds):
    """
    Average best-match similarity of the required credentials against the resume.

    Parameters:
      required_creds (list[dict]): Requirement objects, each with a "credential"
          list of names.
      resume_creds (list[dict]): Resume credential objects with the same shape.
          None is treated as an empty list.

    Returns:
      None when there are no requirement objects or they name no credentials;
      otherwise the mean, over every required credential name, of its highest
      similarity to any resume credential (0.0 when nothing matches).
    """
    if not required_creds:
        return None
    resume_creds = resume_creds or []
    req_scores = []
    for req_cred_obj in required_creds:
        for req_cred in req_cred_obj.get("credential", []):
            best_sim = 0.0
            for cred_obj in resume_creds:
                candidate_creds = cred_obj.get("credential", [])
                for cand_cred in candidate_creds:
                    raw_sim = nlp_similarity_cached(req_cred, cand_cred)
                    # Tensor / NumPy results expose .item(); a plain Python number
                    # does not, so convert it directly instead of failing.
                    sim = raw_sim.item() if hasattr(raw_sim, "item") else float(raw_sim)
                    if sim > best_sim:
                        best_sim = sim
            req_scores.append(best_sim)
            print(f"Best similarity for required credential '{req_cred}': {best_sim}")
    overall_cred = sum(req_scores) / len(req_scores) if req_scores else None
    print(f"Average Credential Similarity: {overall_cred}\n")
    return overall_cred

def calculate_mandatory_credentials_score(job_json, resume_json):
    """
    Score the resume's credentials against the job's mandatory credentials.

    Returns:
      None when the job lists no mandatory credentials; otherwise the result of
      match_credentials for the mandatory list.
    """
    required, _preferred = extract_job_credentials(job_json)
    held = extract_resume_credentials(resume_json)
    if required:
        result = match_credentials(required, held)
        print(f"Mandatory Credentials Score: {result}\n")
        return result
    print("=> No mandatory credentials specified.\n")
    return None

def calculate_preferred_credentials_score(job_json, resume_json):
    """
    Score the resume's credentials against the job's preferred credentials.

    Returns:
      None when the job lists no preferred credentials; otherwise the result of
      match_credentials for the preferred list.
    """
    _mandatory, wanted = extract_job_credentials(job_json)
    held = extract_resume_credentials(resume_json)
    if wanted:
        result = match_credentials(wanted, held)
        print(f"Preferred Credentials Score: {result}\n")
        return result
    print("=> No preferred credentials specified.\n")
    return None

def calculate_overall_credentials_score(job_json, resume_json, mandatory_weight=0.5, preferred_weight=0.5):
    """
    Combine the mandatory and preferred credential scores into one weighted score.

    Parameters:
      job_json (dict): Parsed job description.
      resume_json (dict): Parsed resume.
      mandatory_weight (float): Weight of the mandatory score.
      preferred_weight (float): Weight of the preferred score.

    Returns:
      dict with "job_id", "overall_credentials_score", "mandatory_credentials_score"
      and "preferred_credentials_score". A part that is None is left out of the
      overall score together with its weight; the overall score is None when both
      parts are None. With the default equal weights this is the plain average of
      the available parts.
    """
    mand_score = calculate_mandatory_credentials_score(job_json, resume_json)
    pref_score = calculate_preferred_credentials_score(job_json, resume_json)

    weighted_parts = [
        (score, weight)
        for score, weight in ((mand_score, mandatory_weight), (pref_score, preferred_weight))
        if score is not None
    ]
    total_weight = sum(weight for _, weight in weighted_parts)
    if not weighted_parts:
        overall_cred = None
    elif total_weight == 0:
        # Degenerate weights: fall back to a plain mean instead of dividing by zero.
        overall_cred = sum(score for score, _ in weighted_parts) / len(weighted_parts)
    else:
        overall_cred = sum(score * weight for score, weight in weighted_parts) / total_weight

    print(f"Overall Credentials Match Score: {overall_cred}\n")
    # Prefer "id"; fall back to "job_id", the key append_job_id_to_job_desc writes.
    job_id = job_json.get("id")
    if job_id is None:
        job_id = job_json.get("job_id")
    return {
        "job_id": job_id,
        "overall_credentials_score": overall_cred,
        "mandatory_credentials_score": mand_score,
        "preferred_credentials_score": pref_score
    }

# ---------------------------
# SECTION 5: PROFESSIONAL BACKGROUND MATCHING
# ---------------------------
def extract_professional_background(resume_json):
    """Return the resume's "professional_background" entry, or an empty list when the key is absent."""
    if "professional_background" in resume_json:
        return resume_json["professional_background"]
    return []

# get_background_match_score and get_industry_match_score are defined below.

def get_background_match_score(job_req_background, candidate_prof_background, job_details, threshold=0.6, min_years_required=4):
    """
    Computes a weighted average similarity score for candidate background (role) matches.

    Special Case: If any job requirement group contains "Work Experience" or "Working Experience" (case-insensitive),
    then we do not compute semantic similarity. Instead, we sum the candidate's total background years.
    If the total meets or exceeds the minimum required years, returns 1.0; otherwise, 0.0.

    Otherwise, for each candidate background entry:
      - Compare each candidate term from the "background" field against each job requirement group.
      - If a group has multiple elements, compute the average similarity.
      - Take the maximum similarity across candidate terms.
      - If that maximum meets the threshold, add candidate's years weighted by that maximum.

    Parameters:
      job_req_background (list): Requirement groups, each a list of role terms. A bare
          string is treated as a one-term group, and empty groups are ignored.
      candidate_prof_background (list[dict]): Resume entries with "background" (list of
          role terms) and "years". A null value for either is treated as empty / zero.
      job_details: Accepted for interface compatibility; not used by this function.
      threshold (float): Minimum best similarity for an entry's years to count.
      min_years_required (number): Minimum total counted years.

    Returns weighted_avg = sum(max_similarity * years) / sum(years), or 0.0 if total years < min_years_required.
    """
    print("\n== Starting Background Match Score Calculation ==")
    print("Job Requirement Background Groups:")
    for idx, group in enumerate(job_req_background, start=1):
        print(f"  Group {idx}: {group}")

    # Normalize to non-empty lists of terms. Without this a bare string would be
    # compared character by character and an empty group would fail on group[0].
    req_groups = [
        [group] if isinstance(group, str) else list(group)
        for group in job_req_background
        if group
    ]

    # Special mode: "Work Experience"
    work_experience_mode = any(
        term.lower() in ("work experience", "working experience")
        for group in req_groups
        for term in group
    )

    if work_experience_mode:
        print("=> 'Work Experience' detected. Using special mode (years only).")
        total_candidate_years = sum((entry.get("years") or 0) for entry in candidate_prof_background)
        print(f"   Total Candidate Background Years: {total_candidate_years}")
        if total_candidate_years >= min_years_required:
            print(f"=> Candidate meets the work experience requirement (Required: {min_years_required} years). Returning 1.0.\n")
            return 1.0
        else:
            print(f"=> Candidate does NOT meet the work experience requirement (Required: {min_years_required} years). Returning 0.0.\n")
            return 0.0

    total_years = 0.0
    weighted_sum = 0.0

    for entry in candidate_prof_background:
        years = entry.get("years") or 0
        candidate_terms = entry.get("background") or []
        print("\n--- Processing Candidate Background Entry ---")
        print(f"Candidate Background Terms: {candidate_terms}")
        entry_max = 0.0
        for candidate_term in candidate_terms:
            group_scores = []
            print(f"\nEvaluating Candidate Term: '{candidate_term}'")
            for group in req_groups:
                if len(group) > 1:
                    sims = [nlp_similarity_cached(candidate_term, term) for term in group]
                    group_score = sum(sims) / len(sims)
                    print(f"  Against Group {group}: similarities = {sims}, average = {group_score}")
                else:
                    group_score = nlp_similarity_cached(candidate_term, group[0])
                    print(f"  Against Group {group}: similarity = {group_score}")
                group_scores.append(group_score)
            if group_scores:
                candidate_term_score = max(group_scores)
                print(f"=> Max similarity for candidate term '{candidate_term}': {candidate_term_score}")
                if candidate_term_score > entry_max:
                    entry_max = candidate_term_score
        print(f"Maximum similarity for candidate entry: {entry_max}")
        if entry_max >= threshold:
            weighted_sum += entry_max * years
            total_years += years
            print(f"=> Adding {years} years weighted by {entry_max} (Contribution: {entry_max * years}).")
            print(f"=> Cumulative Relevant Background Years: {total_years}\n")

    if total_years >= min_years_required and total_years > 0:
        avg_bg_score = weighted_sum / total_years
        print(f"=> Total Background Experience: {total_years} years (Required: {min_years_required} years)")
        print(f"=> Weighted Average Background Score: {avg_bg_score}\n")
        return avg_bg_score
    else:
        print(f"=> Total Background Experience: {total_years} years (Required: {min_years_required} years) -- Not enough experience. Returning 0.0.\n")
        return 0.0

def get_industry_match_score(req_industries, candidate_prof_background, threshold=0.6, min_years_required=4):
    """
    Years-weighted similarity between the candidate's industries and the required ones.

    Per background entry, the best similarity between any of its "industry" terms
    and any required industry is taken. Entries whose best similarity reaches
    `threshold` contribute their "years", weighted by that similarity.

    Parameters:
      req_industries (list[str]): Required industry terms.
      candidate_prof_background (list[dict]): Resume entries with "industry" and "years".
      threshold (float): Minimum best similarity for an entry to count.
      min_years_required (number): Minimum total counted years.

    Returns:
      sum(max_similarity * years) / sum(years) over counted entries, or 0.0 when
      the counted years are zero or below `min_years_required`.
    """
    print("\n== Starting Industry Match Score Calculation ==")
    print(f"Job Requirement Industries: {req_industries}")
    counted_years = 0.0
    similarity_years = 0.0
    for position in candidate_prof_background:
        years = position.get("years", 0)
        industries = position.get("industry", [])
        print("\n--- Processing Candidate Industry Entry ---")
        print(f"Candidate Industries: {industries}")

        best = 0.0
        for held in industries:
            for wanted in req_industries:
                score = nlp_similarity_cached(held, wanted)
                print(f"  '{held}' vs. '{wanted}': similarity = {score}")
                if score > best:
                    best = score
        print(f"=> Maximum industry similarity for this entry: {best}")

        if best >= threshold:
            contribution = best * years
            similarity_years += contribution
            counted_years += years
            print(f"=> Adding {years} years weighted by {best} (Contribution: {contribution}).")

    if counted_years < min_years_required or counted_years <= 0:
        print(f"=> Total Industry Experience: {counted_years} years (Required: {min_years_required} years) -- Not enough experience. Returning 0.0.\n")
        return 0.0

    weighted_mean = similarity_years / counted_years
    print(f"=> Total Industry Experience: {counted_years} years (Required: {min_years_required} years)")
    print(f"=> Weighted Average Industry Score: {weighted_mean}\n")
    return weighted_mean

def calculate_mandatory_background_score(job_json, resume_json, threshold=0.6):
    job_req = job_json.get("mandatory", {}).get("professional_background", [])
    candidate_background = extract_professional_background(resume_json)
    job_details = job_json.get("details", {})
    
    if not job_req:
        print("=> No mandatory professional background requirements specified.\n")
        return None, None
    
    bg_scores = []
    ind_scores = []
    for req in job_req:
        req_minyears = req.get("minyears", [0])[0]
        req_background = req.get("background", [])
        req_industries = req.get("industry", [])
        print(f"\n--- Processing Mandatory Background Requirement (Min Years: {req_minyears}) ---")
        bg_score = get_background_match_score(req_background, candidate_background, job_details, threshold, req_minyears)
        print(f"=> Background Score for requirement: {bg_score}")
        if req_industries:
            ind_score = get_industry_match_score(req_industries, candidate_background, threshold, req_minyears)
            print(f"=> Industry Score for requirement: {ind_score}\n")
        else:
            ind_score = None
            print("=> No industry requirements specified for this requirement.\n")
        total_candidate_years = sum(entry.get("years", 0) for entry in candidate_background)
        if total_candidate_years < req_minyears:
            print("=> Candidate does not meet the minimum background years for this requirement. Setting background score to 0.")
            bg_score = 0.0
        bg_scores.append(bg_score)
        ind_scores.append(ind_score)
    mand_bg_avg = safe_average(bg_scores)
    mand_ind_avg = safe_average(ind_scores) if ind_scores and any(ind is not None for ind in ind_scores) else None
    return mand_bg_avg, mand_ind_avg

def calculate_preferred_background_score(job_json, resume_json, threshold=0.6):
    """
    Scores the candidate against the job's preferred professional-background requirements.

    Every entry of job_json["preferred"]["professional_background"] gets a background
    (role) score from get_background_match_score and, when the entry lists industries,
    an industry score from get_industry_match_score. Unlike the mandatory variant, no
    extra total-years check is applied here; the minimum years are only forwarded to
    the helpers.

    Parameters:
      - job_json: Parsed job description (dict).
      - resume_json: Parsed resume; handed to extract_professional_background.
      - threshold: Similarity threshold forwarded to the two match-score helpers.

    Returns:
      A tuple (background_average, industry_average) computed with safe_average.
      Both are None when the job lists no preferred background requirements;
      industry_average is None when no requirement specifies an industry.
    """
    # `or` guards against the section being present but null in the extracted JSON.
    job_req = (job_json.get("preferred") or {}).get("professional_background") or []
    candidate_background = extract_professional_background(resume_json)
    job_details = job_json.get("details", {})

    if not job_req:
        print("=> No preferred professional background requirements specified.\n")
        return None, None

    bg_scores = []
    ind_scores = []
    for req in job_req:
        # "minyears" is normally a one-element list; tolerate a missing key, null,
        # an empty list or a bare number instead of raising IndexError/TypeError.
        req_minyears = req.get("minyears")
        if isinstance(req_minyears, (list, tuple)):
            req_minyears = req_minyears[0] if req_minyears else 0
        if req_minyears is None:
            req_minyears = 0
        req_background = req.get("background", [])
        req_industries = req.get("industry", [])
        print(f"\n--- Processing Preferred Background Requirement (Min Years: {req_minyears}) ---")
        bg_score = get_background_match_score(req_background, candidate_background, job_details, threshold, req_minyears)
        print(f"=> Background Score for requirement: {bg_score}")
        if req_industries:
            ind_score = get_industry_match_score(req_industries, candidate_background, threshold, req_minyears)
            print(f"=> Industry Score for requirement: {ind_score}\n")
        else:
            # None (rather than 0) marks "not applicable" so it can be left out of the average.
            ind_score = None
            print("=> No industry requirements specified for this requirement.\n")
        bg_scores.append(bg_score)
        ind_scores.append(ind_score)
    pref_bg_avg = safe_average(bg_scores)
    # Only average industry scores when at least one requirement actually produced one.
    pref_ind_avg = safe_average(ind_scores) if any(ind is not None for ind in ind_scores) else None
    return pref_bg_avg, pref_ind_avg

def calculate_overall_background_score(job_json, resume_json, threshold=0.6):
    """
    Combines the mandatory and preferred background/industry scores into overall scores.

    Parameters:
      - job_json: Parsed job description (dict).
      - resume_json: Parsed resume.
      - threshold: Similarity threshold forwarded to the mandatory/preferred scorers.

    Returns:
      A dictionary with the job id, the four mandatory/preferred background and
      industry scores, their per-dimension averages ("overall_background_score",
      "overall_industry_score") and the average of those two
      ("overall_professional_background_score"). Averages are computed with
      safe_average (defined elsewhere; presumably it skips None entries -- confirm).
    """
    mand_bg, mand_ind = calculate_mandatory_background_score(job_json, resume_json, threshold)
    pref_bg, pref_ind = calculate_preferred_background_score(job_json, resume_json, threshold)

    overall_bg = safe_average([mand_bg, pref_bg])
    overall_ind = safe_average([mand_ind, pref_ind])
    # Drop dimensions that could not be scored before taking the final average.
    overall_prof_background = safe_average([x for x in [overall_bg, overall_ind] if x is not None])

    print(f"\n========== Overall Background (Role) Score: {overall_bg} ==========")
    if overall_ind is not None:
        print(f"========== Overall Industry Score: {overall_ind} ==========")
    else:
        print("========== No Overall Industry Score computed (industry requirements missing) ==========")
    print(f"========== Overall Professional Background Score: {overall_prof_background} ==========\n")

    # The notebook's loading cell stores the id under "job_id" (append_job_id_to_job_desc),
    # so fall back to that key when the job JSON carries no "id" of its own.
    job_id = job_json.get("id")
    if job_id is None:
        job_id = job_json.get("job_id")
    return {
        "job_id": job_id,
        "mandatory_background_score": mand_bg,
        "mandatory_industry_score": mand_ind,
        "preferred_background_score": pref_bg,
        "preferred_industry_score": pref_ind,
        "overall_background_score": overall_bg,
        "overall_industry_score": overall_ind,
        "overall_professional_background_score": overall_prof_background
    }

# ---------------------------
# FINAL OVERALL MATCH SCORE
# ---------------------------
def calculate_overall_match_score(job_json, resume_json,
                                  skill_weight=0.20,
                                  education_weight=0.20,
                                  responsibilities_weight=0.20,
                                  credentials_weight=0.20,
                                  background_weight=0.20):
    """
    Calculates the final overall match score as a safe average of the available section scores.
    If a section has no requirements, its score is returned as None and is ignored in the overall average.
    Sections include:
      - Skills
      - Education
      - Responsibilities
      - Credentials
      - Professional Background (the combined background/industry score)

    Parameters:
      - job_json: Parsed job description (dict).
      - resume_json: Parsed resume.
      - skill_weight, education_weight, responsibilities_weight, credentials_weight,
        background_weight: Accepted for interface compatibility only. They are NOT
        applied: the overall score is an unweighted safe_average of the sections.

    Returns:
      A dictionary with "job_id", one entry per section (each section's own result
      dictionary with "job_id" set to this job's id) and "overall_match_score".
    """
    skill_dict = calculate_overall_skill_match_score(job_json, resume_json)
    edu_dict = calculate_overall_education_match_score(job_json, resume_json)
    resp_dict = calculate_overall_responsibilities_match_score(job_json, resume_json)
    cred_dict = calculate_overall_credentials_score(job_json, resume_json)
    bg_dict = calculate_overall_background_score(job_json, resume_json)

    # (section result, key holding that section's overall score), in reporting order.
    scored_sections = [
        (skill_dict, "overall_skill_score"),
        (edu_dict, "overall_education_score"),
        (resp_dict, "overall_responsibilities_score"),
        (cred_dict, "overall_credentials_score"),
        (bg_dict, "overall_professional_background_score"),
    ]
    # Sections without requirements report None and must not drag the average down.
    sections = [scores[key] for scores, key in scored_sections if scores[key] is not None]

    overall_match = safe_average(sections)

    # The notebook's loading cell stores the id under "job_id" (append_job_id_to_job_desc),
    # so fall back to that key when the job JSON carries no "id" of its own. Without
    # the fallback a None id would be stamped over every section dictionary below.
    job_id = job_json.get("id")
    if job_id is None:
        job_id = job_json.get("job_id")

    final_result = {
        "job_id": job_id,
        "skills": {**skill_dict, "job_id": job_id},
        "education": {**edu_dict, "job_id": job_id},
        "responsibilities": {**resp_dict, "job_id": job_id},
        "credentials": {**cred_dict, "job_id": job_id},
        "professional_background": {**bg_dict, "job_id": job_id},
        "overall_match_score": overall_match
    }

    print(f"\nFinal Overall Match Score: {overall_match}")
    return final_result

# ---------------------------
# Example usage: score a single job/resume pair and print the full result dictionary.
# job_desc_json and resume_json are the pandas Series of parsed job descriptions and
# resumes built in the notebook's first cell; [47] and [2] select entries by index label.
# NOTE(review): either entry is None if its JSON failed to parse, which would make
# the scoring call fail -- confirm both rows parsed before relying on this cell.
result = calculate_overall_match_score(job_json=job_desc_json[47], resume_json=resume_json[2])
print(result)

----------------------------------------------------
Processing Job Skill Requirement:
  Requirement: [['troubleshoot hardware issues'], ['resolve hardware issues']]
  Minimum Years Required: 0

Processing Job Requirement Group: ['troubleshoot hardware issues']
  Evaluating Candidate Group: ['Gathered requirements', 'Tableau Dashboards']
    'Gathered requirements' vs. 'troubleshoot hardware issues': similarity = 0.40550625324249284
    => Average similarity for 'Gathered requirements': 0.40550625324249284
    'Tableau Dashboards' vs. 'troubleshoot hardware issues': similarity = 0.34488356113433855
    => Average similarity for 'Tableau Dashboards': 0.34488356113433855
  => Best average for group ['Gathered requirements', 'Tableau Dashboards']: 0.40550625324249284

Processing Job Requirement Group: ['resolve hardware issues']
  Evaluating Candidate Group: ['Gathered requirements', 'Tableau Dashboards']
    'Gathered requirements' vs. 'resolve hardware issues': similarity = 0.4436820348

In [35]:
def calculate_scores_for_all_jobs(job_desc_list,
                                  resume_json,
                                  skill_weight=0.25,
                                  education_weight=0.25,
                                  responsibilities_weight=0.25,
                                  credentials_weight=0.25,
                                  skill_mandatory_weight=0.5,
                                  skill_preferred_weight=0.5,
                                  education_threshold=0.7,
                                  education_min_years=4,
                                  education_mandatory_weight=0.5,
                                  education_preferred_weight=0.5,
                                  credentials_mandatory_weight=0.5,
                                  credentials_preferred_weight=0.5):
    """
    Iterates over each job description in `job_desc_list`. For each entry, calls
    `calculate_overall_match_score` and returns the per-job results keyed by the
    entry's position.

    Parameters:
      - job_desc_list: A list (or pandas Series) of job description JSON objects.
        Entries that are None (e.g. JSON that failed to parse) are not scored.
      - resume_json: A single JSON object for the candidate's resume
        (or adapt to pick different resumes if needed).
      - The remaining parameters are weights and thresholds offered to
        `calculate_overall_match_score`. Only those that the function actually
        accepts are forwarded; the others are ignored, so this wrapper keeps
        working when the scorer's signature changes.

    Returns:
      A dictionary where each key is the 0-based position of the job in
      `job_desc_list` and each value is whatever `calculate_overall_match_score`
      returned for it, or None for entries that were skipped.
    """
    import inspect  # function-local: only needed for the signature check below

    requested_options = {
        "skill_weight": skill_weight,
        "education_weight": education_weight,
        "responsibilities_weight": responsibilities_weight,
        "credentials_weight": credentials_weight,
        "skill_mandatory_weight": skill_mandatory_weight,
        "skill_preferred_weight": skill_preferred_weight,
        "education_threshold": education_threshold,
        "education_min_years": education_min_years,
        "education_mandatory_weight": education_mandatory_weight,
        "education_preferred_weight": education_preferred_weight,
        "credentials_mandatory_weight": credentials_mandatory_weight,
        "credentials_preferred_weight": credentials_preferred_weight,
    }

    # Passing a keyword the scorer does not declare raises TypeError, so keep only
    # the options it declares (or all of them when it takes **kwargs).
    scorer_params = inspect.signature(calculate_overall_match_score).parameters
    takes_any_keyword = any(
        param.kind is inspect.Parameter.VAR_KEYWORD for param in scorer_params.values()
    )
    forwarded_options = {
        name: value
        for name, value in requested_options.items()
        if takes_any_keyword or name in scorer_params
    }

    results_dict = {}

    # enumerate() walks the values positionally, so this also works for a pandas
    # Series whose index labels are not 0..n-1.
    for i, job_json in enumerate(job_desc_list):
        if job_json is None:
            # One unparseable job description should not abort the whole batch.
            print(f"Skipping job at position {i}: no parsed job description.")
            results_dict[i] = None
            continue

        # Store the entire dictionary of sub-scores/overall score in results_dict[i]
        results_dict[i] = calculate_overall_match_score(
            job_json=job_json,
            resume_json=resume_json,
            **forwarded_options
        )

    return results_dict

In [36]:
%%capture
# Score every entry of job_desc_json against the resume at index label 3 of resume_json.
# The %%capture magic at the top of this cell silences the scoring functions' printed trace.
results = calculate_scores_for_all_jobs(job_desc_list=job_desc_json, resume_json=resume_json[3])

In [38]:
# Display the position -> score mapping returned by calculate_scores_for_all_jobs.
# NOTE(review): the saved output shows one bare float per job, whereas
# calculate_overall_match_score as defined in this notebook returns a dictionary of
# sub-scores -- the output looks stale; restart the kernel and re-run to confirm.
results

{0: 0.6945423344593673, 1: 0.46425202923516434, 2: 0.7222696653670735, 3: 0.5058364693512858, 4: 0.7248497107806496, 5: 0.49327802761561346, 6: 0.7350529442644782, 7: 0.5274603684388456, 8: 0.534323041078945, 9: 0.7034989522415257, 10: 0.4342846415523026, 11: 0.49271499050692436, 12: 0.6996513994721074, 13: 0.4415457101228336, 14: 0.4913026793641567, 15: 0.709271973531161, 16: 0.6574579275213182, 17: 0.6805086937709114, 18: 0.7086047556251287, 19: 0.25419226121157407, 20: 0.324887256351886, 21: 0.6370420522677402, 22: 0.6819675898031582, 23: 0.8141586688524556, 24: 0.5150652794664106, 25: 0.46755913784727454, 26: 0.5701971420397361, 27: 0.47820643663761164, 28: 0.44899077898977946, 29: 0.4499668437987566, 30: 0.2432162081822753, 31: 0.8411677721887827, 32: 0.5111429532213758, 33: 0.5258161975218664, 34: 0.753465311601758, 35: 0.7579539145919539, 36: 0.46919018459921896, 37: 0.48758172389201737, 38: 0.7549114672777554, 39: 0.6466276091523468, 40: 0.7351141523707796, 41: 0.68302356440219

## Evaluation

In [172]:
# NOTE(review): json is already imported in the notebook's first cell; this re-import is redundant.
import json 

# Pretty-print (4-space indent) the parsed job description at index label 38 for manual inspection.
job_desc_json_pprint = json.dumps(job_desc_json[38], indent=4)
print(job_desc_json_pprint)

{
    "details": {
        "wage": [],
        "benefits": {
            "fsa": false,
            "hsa": false,
            "bonus": false,
            "other": [],
            "dental": false,
            "equity": false,
            "vision": false,
            "medical": false,
            "401k_match": false,
            "mental_health": false,
            "unlimited_pto": false,
            "tuition_reimbursement": false
        },
        "location": [
            {
                "city": "Ewing",
                "state": "NJ",
                "country": "US"
            }
        ],
        "job_title": [
            "C# .NET Azure Lead Position"
        ],
        "tax_terms": [
            "Contract",
            "Contract W2",
            "Contract Corp to Corp",
            "Contract Independent"
        ],
        "wfh_policy": [
            "Hybrid"
        ],
        "company_name": [],
        "company_stage": [],
        "work_schedule": [
            "Hybrid"
       

In [72]:
# Pretty-print (4-space indent) the parsed resume at index label 3 -- the same entry
# that is passed to calculate_scores_for_all_jobs earlier in this notebook.
resume_pprint = json.dumps(resume_json[3], indent=4)
print(resume_pprint)

{
    "skills": [
        {
            "skill": [
                [
                    "Software Development Lifecycle Management"
                ]
            ],
            "years": 2.5
        },
        {
            "skill": [
                [
                    "Advanced Java Programming"
                ]
            ],
            "years": 2.5
        },
        {
            "skill": [
                [
                    "Frontend Development with React.js"
                ]
            ],
            "years": 2.5
        },
        {
            "skill": [
                [
                    "Spring Framework Expertise"
                ]
            ],
            "years": 2.5
        },
        {
            "skill": [
                [
                    "Container Orchestration with Kubernetes"
                ]
            ],
            "years": 2.5
        },
        {
            "skill": [
                [
                    "Microservices Architecture Imp