In [1]:
%%capture
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util
import json
import spacy
from functools import lru_cache
import torch
from utils.semantic_similarity import nlp_similarity_cached # import first
from utils.semantic_similarity import sentence_model # import second

# Current working directory of the kernel process; the data file paths used
# when loading the spreadsheets are built relative to it.
current_wd = os.getcwd()

def safe_json_loads(val):
    """
    Parse a JSON document, returning None instead of raising on bad input.

    Parameters:
      val: The value to decode; normally a JSON string. Non-string values
           (e.g. None, or a float NaN read from an empty spreadsheet cell)
           are treated as unparseable rather than raising.

    Returns:
      The decoded Python object, or None when `val` cannot be parsed.
    """
    try:
        return json.loads(val)
    except (json.JSONDecodeError, TypeError):
        # json.loads raises TypeError (not JSONDecodeError) when given anything
        # other than str/bytes/bytearray, so both must be caught to stay "safe".
        print(f"Error parsing JSON: {val}")
        return None

# Load the job-description and resume extraction spreadsheets from the
# "data" directory under current_wd.
job_desc_data = pd.read_excel(os.path.join(current_wd, "data", "sample_des_extractions_test_final_3.25.25.xlsx"))
resume_data = pd.read_excel(os.path.join(current_wd, "data", "sample_res_extractions_final_3.26.25.xlsx"))

# Decode each row's "extracted" cell; safe_json_loads returns None for cells
# whose JSON is malformed, so the parsed columns may contain None.
job_desc_data["job_desc_parsed"] = job_desc_data["extracted"].apply(safe_json_loads)
resume_data["resume_parsed"] = resume_data["extracted"].apply(safe_json_loads)

# Convenience handles on the parsed columns (one entry per spreadsheet row).
job_desc_json = job_desc_data['job_desc_parsed']
resume_json = resume_data['resume_parsed']

def append_job_id_to_job_desc(row):
    """
    Attach a row's `id` to its parsed job description under the key 'job_id'.

    Parameters:
      row: A mapping-like row exposing 'id' and 'job_desc_parsed'.

    Returns:
      The parsed job description (the same object, modified in place) with
      'job_id' set, or None when there is no usable parsed description.
    """
    identifier = row['id']
    parsed = row['job_desc_parsed']

    # A string here means the payload is still JSON-encoded; decode it once more.
    if isinstance(parsed, str):
        parsed = safe_json_loads(parsed)

    # Covers both an originally-missing description and a failed second decode.
    if parsed is None:
        return None

    parsed['job_id'] = identifier
    return parsed

# Row-wise apply (axis=1): tag every parsed job description with its row id.
# append_job_id_to_job_desc mutates the parsed dict in place, so the objects
# held in the 'job_desc_parsed' column gain the 'job_id' key as well.
job_desc_data['job_desc_json'] = job_desc_data.apply(append_job_id_to_job_desc, axis=1)

def extract_all_strings_from_json(j):
    """
    Recursively extracts all non-empty strings from a JSON-like object.

    Strings are collected wherever they appear: as list items, as dict values,
    or as the object itself. Each string is stripped of surrounding whitespace
    and dropped if nothing remains. Dict keys and non-string scalars
    (numbers, booleans, None) are ignored.

    Parameters:
      j: A JSON-like value (dict, list, str, or any other scalar).

    Returns:
      list: The stripped, non-empty strings in traversal order.
    """
    if isinstance(j, str):
        # Handling strings here (rather than only inside the list branch)
        # ensures string-valued dict entries are not silently skipped.
        stripped = j.strip()
        return [stripped] if stripped else []

    if isinstance(j, dict):
        children = j.values()
    elif isinstance(j, list):
        children = j
    else:
        return []

    result = []
    for child in children:
        result.extend(extract_all_strings_from_json(child))
    return result

# Precompute Embeddings from a list of JSONS
def precompute_embeddings_for_json_list(json_list, batch_size=64):
    """
    Build a lookup table from every distinct string found in a collection of
    JSON objects to that string's embedding.

    None entries in `json_list` are skipped. Strings are gathered with
    extract_all_strings_from_json, de-duplicated, and encoded in one call to
    sentence_model.encode (with convert_to_tensor=False).

    Parameters:
      json_list (iterable): A list (or Series) of JSON objects (dicts).
      batch_size (int): Batch size forwarded to the encoder.

    Returns:
      dict: A mapping from string to its embedding; empty when no strings
            were found.
    """
    unique_strings = list({
        text
        for doc in json_list
        if doc is not None
        for text in extract_all_strings_from_json(doc)
    })

    if not unique_strings:
        # Nothing to encode: keep a correctly-shaped empty array so the
        # pairing below yields an empty mapping.
        vectors = np.zeros((0, sentence_model.get_sentence_embedding_dimension()))
    else:
        vectors = sentence_model.encode(unique_strings, batch_size=batch_size, convert_to_tensor=False)

    return dict(zip(unique_strings, vectors))

# Now call the function with your list of job description JSON objects:
# `embeddings` maps each distinct string found in the parsed job descriptions
# to its embedding (rows that failed to parse are None and are skipped).
embeddings = precompute_embeddings_for_json_list(job_desc_json, batch_size=64)

In [3]:
from utils.mandatory_skill_score import extract_job_mandatory_skills
from utils.mandatory_skill_score import extract_resume_skills

# Spot-check the extractors on one job description and one resume.
# NOTE(review): `[1]` is a label lookup on the Series; this presumably relies on
# the default integer index from read_excel — confirm. A row that failed to
# parse would be None here and make the extractor call fail.
job_skills = extract_job_mandatory_skills(job_desc_json[1])
resume_skills = extract_resume_skills(resume_json[1])

print(job_skills)
print(resume_skills)

[{'skill': [['Salesforce development', 'Sales cloud'], ['Salesforce development', 'Services cloud'], ['Salesforce development', 'Financial Services cloud']], 'minyears': [0]}, {'skill': [['Salesforce.com development', 'SOQL'], ['Salesforce.com development', 'Apex']], 'minyears': [4]}, {'skill': [['SOQL']], 'minyears': [0]}, {'skill': [['Apex']], 'minyears': [0]}, {'skill': [['Lightning']], 'minyears': [0]}, {'skill': [['LWC']], 'minyears': [0]}, {'skill': [['User Management', 'User Profiles']], 'minyears': [0]}, {'skill': [['User Management', 'User Permission Sets']], 'minyears': [0]}, {'skill': [['User Management', 'Sharing Rules']], 'minyears': [0]}, {'skill': [['User Management', 'Role Hierarchy']], 'minyears': [0]}, {'skill': [['User Management', 'User Setup']], 'minyears': [0]}, {'skill': [['Standard Object testing', 'Workflows'], ['Standard Object modifications', 'Workflows']], 'minyears': [0]}, {'skill': [['Standard Object testing', 'Approval Processes'], ['Standard Object modif

In [13]:
import math
from utils.semantic_similarity import nlp_similarity_cached


def safe_average(values):
    """
    Mean of the entries in `values` that are not None.

    Returns None when `values` is empty or contains only None.
    """
    total = 0
    count = 0
    for value in values:
        if value is None:
            continue
        total += value
        count += 1
    return total / count if count else None


def extract_job_mandatory_skills(job_json):
    """
    Return the list stored at job_json["mandatory"]["hard_skills"].

    Missing keys and explicit nulls (a "mandatory" or "hard_skills" value of
    None) both yield an empty list instead of raising.

    Parameters:
      job_json (dict): A parsed job description.

    Returns:
      list: The mandatory hard-skill requirements, or [] when there are none.
    """
    # `.get(key, default)` only applies the default when the key is absent;
    # `or` also covers a key that is present with a null value.
    mandatory = job_json.get("mandatory") or {}
    return mandatory.get("hard_skills") or []


def extract_resume_skills(resume_json):
    """
    Return the value stored under "skills" in a parsed resume, or an empty
    list when the key is absent.
    """
    if "skills" in resume_json:
        return resume_json["skills"]
    return []


def compute_group_similarity(candidate_group, required_group):
    """
    Similarity between a candidate's skill terms and one required group.

    Every required term (e.g. each of ["Salesforce dev", "Apex"]) is paired
    with its best-scoring candidate term via nlp_similarity_cached; a term's
    best score never drops below 0.0. The result is the mean of those
    per-term bests, or 0.0 when either group is empty.
    """
    if not (candidate_group and required_group):
        return 0.0

    # Seeding each max with 0.0 floors the per-term best at zero.
    per_term_best = [
        max([0.0] + [nlp_similarity_cached(cand, req) for cand in candidate_group])
        for req in required_group
    ]
    return sum(per_term_best) / len(per_term_best)


def compute_required_skill_similarity(candidate_skill_item, job_required_skill):
    """
    Best similarity between one resume skill entry and a job requirement.

    `job_required_skill` is either a single group of terms (a list of
    strings) or a list of alternative groups, e.g.
    [ ["Salesforce dev","Apex"], ["Salesforce.com development","Apex"] ].
    The candidate's "skill" terms are scored against every alternative with
    compute_group_similarity and the highest score is returned; 0.0 is
    returned when the requirement or the candidate's terms are empty.
    """
    if not job_required_skill:
        return 0.0

    # A bare list of strings is one group; wrap it so both shapes iterate alike.
    if isinstance(job_required_skill[0], str):
        alternatives = [job_required_skill]
    else:
        alternatives = job_required_skill

    candidate_terms = candidate_skill_item.get("skill", [])
    if not candidate_terms:
        return 0.0

    # Seeding with 0.0 keeps the result non-negative.
    return max([0.0] + [compute_group_similarity(candidate_terms, group) for group in alternatives])


def aggregate_best_entries(resume_skills, job_required_skill):
    """
    Collapse resume skill entries to one summary per job_id.

    Returns a list of dicts, each dict having:
        {
          "job_id": ...,
          "sim": <best similarity for that job_id>,
          "years": <max years for that job_id>
        }

    We do NOT sum multiple lines from the same job_id; we take whichever line
    has the highest similarity, and whichever has the maximum years, for that
    job_id. Job ids whose best similarity is not above 0 are dropped.

    A missing or null "job_id" is treated as "" and a missing or null "years"
    as 0.0, so entries with explicit nulls cannot break the comparisons here
    or the string/number formatting done by callers.

    Parameters:
      resume_skills (iterable): Resume skill entries (dicts with optional
          "skill", "years" and "job_id" keys).
      job_required_skill: The requirement passed through to
          compute_required_skill_similarity.

    Returns:
      list: One summary dict per job_id, in first-seen order.
    """
    # job_id -> {"sim": best similarity, "years": max years}
    by_job_id = {}

    for cand_skill_item in resume_skills:
        # `or` (instead of a .get default) also normalises explicit nulls.
        jbid = cand_skill_item.get("job_id") or ""
        cand_years = cand_skill_item.get("years") or 0.0
        sim = compute_required_skill_similarity(cand_skill_item, job_required_skill)

        best = by_job_id.get(jbid)
        if best is None:
            by_job_id[jbid] = {"sim": sim, "years": cand_years}
            continue
        if sim > best["sim"]:
            best["sim"] = sim
        if cand_years > best["years"]:
            best["years"] = cand_years

    # Only keep job_ids with a positive similarity.
    return [
        {"job_id": jbid, "sim": vals["sim"], "years": vals["years"]}
        for jbid, vals in by_job_id.items()
        if vals["sim"] > 0.0
    ]


def compute_single_requirement_score(resume_skills, job_required_skill, min_years_required):
    """
    Score how well a resume satisfies one mandatory skill requirement.

    1) Aggregates the best similarity & max years per job_id
       (aggregate_best_entries) and sorts the entries by similarity, descending.
    2) If no minimum experience is required (min_years_required is 0, negative
       or None), the best similarity is returned as-is: there are no years to
       cover, so the strongest match alone satisfies the requirement.
    3) Otherwise iterates in descending similarity order:
       a) An entry with an empty job_id that is reached before any real job_id
          entry is used only if it meets min_years_required by itself; it then
          contributes its full similarity and iteration stops. Otherwise it is
          skipped.
       b) Entries with a real job_id accumulate years until min_years_required
          is met or the entries are exhausted; each contributes
          sim * (years used / min_years_required).
       c) Empty and real job_id entries are never mixed: once real coverage
          has started, empty job_id entries are skipped.

    Parameters:
      resume_skills (list): Resume skill entries.
      job_required_skill (list): The requirement (a group or list of groups).
      min_years_required (number or None): Minimum years of experience.

    Returns:
      float: The weighted similarity for this requirement; 0.0 when nothing
      matches or no years could be applied. Partial coverage yields a
      proportionally smaller score.
    """
    print("\n=== Processing Single Skill Requirement ===")
    print(f"Required skill: {job_required_skill}")
    print(f"Min years required: {min_years_required}")

    if not resume_skills or not job_required_skill:
        return 0.0

    # Gather best skill lines (one per job_id)
    best_entries = aggregate_best_entries(resume_skills, job_required_skill)

    if not best_entries:
        print("No nonzero similarity entries. Returning 0.0")
        return 0.0

    # Sort by similarity desc
    best_entries.sort(key=lambda x: x["sim"], reverse=True)

    print("\n-- Sorted Skill Entries (desc by sim) --")
    for i, e in enumerate(best_entries, 1):
        print(f" {i}) job_id='{e['job_id'] or '[EMPTY]'}', sim={e['sim']:.3f}, yrs={e['years']:.2f}")

    # Treat a null minimum like "no minimum".
    required_years = float(min_years_required or 0)
    if required_years <= 0:
        # Without this guard the coverage loop would stop before using any
        # entry (0 years covered already "meets" a 0-year minimum) and every
        # requirement without a years threshold would score 0.0.
        top_sim = best_entries[0]["sim"]
        print(f"No minimum years required => using best similarity {top_sim:.3f}")
        return top_sim

    coverage_used = 0.0
    weighted_sum = 0.0
    started_real = False  # True once a real (non-empty) job_id entry was used
    needed = required_years

    for item in best_entries:
        jbid = item["job_id"]
        sim = item["sim"]
        yrs = item["years"]

        if coverage_used >= required_years:
            print("Already met coverage. Break.")
            break

        if jbid.strip() == "":
            if started_real:
                print(f"Skipping empty job_id skill because we already started real coverage: sim={sim:.3f}")
            elif yrs >= required_years:
                # Covers the whole requirement alone, so its weight is 1.
                weighted_sum += sim
                coverage_used += required_years
                print(f"Chose empty job_id skill alone: yrs={yrs}, sim={sim:.3f}, coverage_used={coverage_used:.2f}")
                break
            else:
                print(f"Skipping empty job_id skill with yrs={yrs}, sim={sim:.3f}, doesn't meet min_years={min_years_required}.")
            continue

        # Real job_id: apply as many of its years as are still needed.
        use_years = min(yrs, needed)
        weighted_sum += sim * (use_years / required_years)
        coverage_used += use_years
        needed -= use_years
        phase = "Continuing" if started_real else "Starting"
        started_real = True
        print(f"{phase} real coverage with job_id={jbid}, sim={sim:.3f}, used_yrs={use_years:.2f}, coverage_used={coverage_used:.2f}")

    if coverage_used <= 0:
        print("No coverage used => 0.0 final.")
        return 0.0

    if coverage_used < required_years:
        # weighted_sum is already scaled by the fraction of years covered.
        print(f"=> Partial coverage: used {coverage_used} out of {min_years_required}")
    else:
        print(f"=> Full coverage. Weighted sum = {weighted_sum:.3f}")
    return weighted_sum


def calculate_skill_match_score(job_json, resume_json):
    """
    Average the per-requirement scores over a job's mandatory hard skills.

    Iterates over each mandatory skill requirement in the job JSON, calls
    compute_single_requirement_score, and returns the average of the scores.

    A requirement's "minyears" is normally a one-element list such as [4].
    A missing key, null, an empty list or a [null] list are all treated as
    0 years, and a bare number is accepted as-is.

    Parameters:
      job_json (dict): A parsed job description.
      resume_json (dict): A parsed resume.

    Returns:
      float or None: The mean requirement score, or None when the job has no
      mandatory skill requirements.
    """
    print("\n======================")
    print("BEGIN: calculate_skill_match_score")
    print("======================")

    job_skills = extract_job_mandatory_skills(job_json)
    if not job_skills:
        print("No mandatory skill requirements found. Returning None.")
        return None

    resume_skills = extract_resume_skills(resume_json)
    requirement_scores = []
    for idx, req in enumerate(job_skills, start=1):
        print(f"\n********** REQUIREMENT #{idx} **********")
        job_required_skill = req.get("skill", [])

        # Indexing [0] directly would raise on [] or null, so unwrap carefully.
        raw_min_years = req.get("minyears")
        if isinstance(raw_min_years, (list, tuple)):
            raw_min_years = raw_min_years[0] if raw_min_years else None
        min_years_required = 0 if raw_min_years is None else raw_min_years

        score_for_this_req = compute_single_requirement_score(
            resume_skills,
            job_required_skill,
            min_years_required
        )
        print(f"[REQUIREMENT #{idx}] Weighted Similarity Score = {score_for_this_req:.3f}")
        requirement_scores.append(score_for_this_req)

    overall_skill = safe_average(requirement_scores)
    if overall_skill is None:
        print("No valid scores found at all. Returning None.")
        return None

    print(f"\n=> Overall Mandatory Skill Match Score = {overall_skill:.3f}")
    return overall_skill


def calculate_mandatory_skill_score(job_json, resume_json):
    """
    Wrap the result of calculate_skill_match_score in a dict under the key
    "mandatory_skill_score".
    """
    return {"mandatory_skill_score": calculate_skill_match_score(job_json, resume_json)}


def calculate_mandatory_skill_scores(job_json_list, resume_json):
    """
    Score one resume against several jobs.

    Each job JSON is scored with calculate_mandatory_skill_score and the
    result is stored under the job's "job_id"; a job without that key is
    stored under "job_<n>", where n is its 1-based position in the list.
    Progress banners and the final mapping are printed along the way.

    Returns:
      dict: job_id -> the dict returned by calculate_mandatory_skill_score.
    """
    outer_rule = "##################################"
    inner_rule = "======================"

    print("\n" + outer_rule)
    print("BEGIN: calculate_mandatory_skill_scores")
    print(outer_rule)

    results = {}
    position = 0
    for job in job_json_list:
        position += 1
        job_id = job.get("job_id", f"job_{position}")
        print("\n" + inner_rule)
        print(f"PROCESSING JOB_ID: {job_id}")
        print(inner_rule)
        results[job_id] = calculate_mandatory_skill_score(job, resume_json)

    print("\n########## FINAL RESULTS ##########")
    for key in results:
        print(f"  job_id = {key} => {results[key]}")
    print("###################################")

    return results


# -------------------- Example Usage --------------------
# Demo: score one sample resume against one sample job and print the trace.
if __name__ == "__main__":
    # One job with a single mandatory requirement: either "Apex" alone or
    # "Salesforce development" + "Apex", with a 4-year minimum.
    job_json_list = [
        {
            "job_id": "Salesforce_Dev_Job",
            "mandatory": {
                "hard_skills": [
                    {
                        "skill": [
                            ["Apex"],
                            ["Salesforce development", "Apex"]
                        ],
                        "minyears": [4]
                    }
                ]
            },
        }
    ]

    # Resume: the empty job_id line has 4 years, which meets min=4 on its own.
    # If it sorts ahead of the real job_id lines by similarity, it is used alone
    # and no coverage is accumulated from the real job_ids below.
    resume_json = {
        "skills": [
            {
                "skill": ["Salesforce.com development", "Apex"],
                "years": 4,
                "job_id": ""  # empty job_id; 4 yrs is enough for min=4 by itself
            },
            {
                "skill": ["Apex development", "Salesforce integrations"],
                "years": 3,
                "job_id": "MyPastJob|Company|2.0"
            },
            {
                "skill": ["Apex triggers", "Salesforce deployments"],
                "years": 3,
                "job_id": "MyPastJob|Company|3.0"
            }
        ]
    }

    results = calculate_mandatory_skill_scores(job_json_list, resume_json)
    print("\n=== DONE ===")


##################################
BEGIN: calculate_mandatory_skill_scores
##################################

PROCESSING JOB_ID: Salesforce_Dev_Job

BEGIN: calculate_skill_match_score

********** REQUIREMENT #1 **********

=== Processing Single Skill Requirement ===
Required skill: [['Apex'], ['Salesforce development', 'Apex']]
Min years required: 4

-- Sorted Skill Entries (desc by sim) --
 1) job_id='[EMPTY]', sim=1.000, yrs=4.00
 2) job_id='MyPastJob|Company|2.0', sim=0.861, yrs=3.00
 3) job_id='MyPastJob|Company|3.0', sim=0.826, yrs=3.00
Chose empty job_id skill alone: yrs=4, sim=1.000, coverage_used=4.00
=> Full coverage. Weighted sum = 1.000
[REQUIREMENT #1] Weighted Similarity Score = 1.000

=> Overall Mandatory Skill Match Score = 1.000

########## FINAL RESULTS ##########
  job_id = Salesforce_Dev_Job => {'mandatory_skill_score': 1.0}
###################################

=== DONE ===
