In [1]:
%%capture
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util
import json
import spacy
from functools import lru_cache
import torch
from utils.semantic_similarity import nlp_similarity_cached # import first
from utils.semantic_similarity import sentence_model # import second

# Current working directory; the data file paths below are built relative to it.
current_wd = os.getcwd()

def safe_json_loads(val):
    """
    Parses a JSON document, returning None instead of raising on bad input.

    Parameters:
      val: The raw value to decode. Anything `json.loads` accepts (str, bytes,
           bytearray) is parsed; any other type (for example the float NaN that
           pandas uses for an empty spreadsheet cell, or None) is reported and
           treated as unparseable.

    Returns:
      The decoded Python object, or None when `val` could not be decoded.
    """
    try:
        return json.loads(val)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError subclass (malformed text);
        # TypeError is what json.loads raises for non-string input such as NaN.
        print(f"Error parsing JSON: {val}")
        return None

# Load the extracted job-description and resume spreadsheets from <cwd>/data.
data_dir = os.path.join(current_wd, "data")
job_desc_data = pd.read_excel(
    os.path.join(data_dir, "sample_des_extractions_test_final_3.25.25.xlsx")
)
resume_data = pd.read_excel(
    os.path.join(data_dir, "sample_res_extractions_final_3.18.25.xlsx")
)

# Decode the raw "extracted" JSON text of every row; rows that fail to parse become None.
# (This cell runs under %%capture, so the error lines printed by safe_json_loads are hidden.)
job_desc_data["job_desc_parsed"] = job_desc_data["extracted"].apply(safe_json_loads)
resume_data["resume_parsed"] = resume_data["extracted"].apply(safe_json_loads)

# Series of parsed objects used by the rest of the notebook.
job_desc_json = job_desc_data["job_desc_parsed"]
resume_json = resume_data["resume_parsed"]

def append_job_id_to_job_desc(row):
    """
    Tags a row's parsed job description with the row's job id.

    Parameters:
      row: A mapping-like row (e.g. a DataFrame row) providing 'id' and
           'job_desc_parsed'.

    Returns:
      dict or None: The parsed job description with a 'job_id' key added, or
      None when the description is missing, cannot be parsed, or did not
      decode to a JSON object.

    Note:
      The dict is updated in place rather than copied, so every other
      reference to the same parsed object sees the new 'job_id' key. Later
      cells read 'job_id' from job_desc_json and depend on this.
    """
    job_id = row['id']
    job_desc = row['job_desc_parsed']

    # A still-serialized description gets one more decoding attempt.
    if isinstance(job_desc, str):
        job_desc = safe_json_loads(job_desc)

    # Covers a missing/unparseable description (None, NaN) as well as JSON that
    # decoded to a list or scalar, which cannot carry a 'job_id' key.
    if not isinstance(job_desc, dict):
        return None

    job_desc['job_id'] = job_id
    return job_desc

# Attach each row's 'id' to its parsed job description (None where parsing failed).
# The function updates the parsed dicts in place; later cells read 'job_id' from job_desc_json.
job_desc_data['job_desc_json'] = job_desc_data.apply(append_job_id_to_job_desc, axis=1)

In [2]:
def extract_all_strings_from_json(j):
    """
    Recursively extracts all non-empty strings from a JSON-like object (dict or list).

    Strings are stripped of surrounding whitespace, and strings that are empty
    after stripping are skipped. Only dict values are visited, never dict keys.
    Non-string scalars (numbers, booleans, None) contribute nothing.

    Parameters:
      j: A dict, list, string, or any other JSON value.

    Returns:
      list: The stripped strings in traversal order (duplicates are kept).
    """
    if isinstance(j, str):
        # Handling strings here (not only when they sit inside a list) means a
        # string stored directly as a dict value is collected too.
        s = j.strip()
        return [s] if s else []

    if isinstance(j, dict):
        children = j.values()
    elif isinstance(j, list):
        children = j
    else:
        return []

    result = []
    for child in children:
        result.extend(extract_all_strings_from_json(child))
    return result

# Precompute embeddings for every unique string found in a list of JSON objects
def precompute_embeddings_for_json_list(json_list, batch_size=64):
    """
    Builds a string -> embedding lookup for a collection of JSON objects.

    Every string returned by extract_all_strings_from_json for the non-None
    entries of `json_list` is gathered, de-duplicated, and encoded with
    `sentence_model` in a single `encode` call.

    Parameters:
      json_list (iterable): A list (or Series) of JSON objects (dicts); None entries are skipped.
      batch_size (int): Batch size passed to `sentence_model.encode`.

    Returns:
      dict: A mapping from each unique string to its embedding (empty when no strings were found).
    """
    unique_strings = set()
    for doc in json_list:
        if doc is not None:
            unique_strings.update(extract_all_strings_from_json(doc))

    ordered_strings = list(unique_strings)
    if not ordered_strings:
        # Nothing to encode: an empty (0, dim) array keeps the pairing below valid.
        vectors = np.zeros((0, sentence_model.get_sentence_embedding_dimension()))
    else:
        vectors = sentence_model.encode(ordered_strings, batch_size=batch_size, convert_to_tensor=False)

    return dict(zip(ordered_strings, vectors))

# Build the string -> embedding lookup for every parsed job description.
embeddings = precompute_embeddings_for_json_list(job_desc_json, batch_size=64)

In [3]:
import json
from utils.safe_averages import safe_average

# ---------------------------
# FINAL OVERALL MATCH SCORE
# ---------------------------
def calculate_overall_match_score(job_json, resume_json,
                                  skill_weight=0.20,
                                  education_weight=0.20,
                                  responsibilities_weight=0.20,
                                  credentials_weight=0.20,
                                  background_weight=0.20):
    """
    Calculates the final overall match score as a safe average of the available section scores.

    The five section scorers are called in order (skills, education, responsibilities,
    credentials, professional background). A section whose overall score is None is
    ignored in the overall average; per the original notes this is how a section with
    no requirements is reported.

    Parameters:
      job_json (dict): Parsed job description. Its identifier is read from "id",
        falling back to "job_id" (the key append_job_id_to_job_desc writes).
      resume_json: Parsed resume, passed through unchanged to the section scorers.
      skill_weight, education_weight, responsibilities_weight, credentials_weight,
      background_weight (float): NOTE(review): accepted but not used by this
        implementation -- every available section counts equally. Kept so existing
        callers that pass them keep working.

    Returns:
      dict: "job_id", one entry per section ("skills", "education", "responsibilities",
      "credentials", "professional_background") holding that section's result dict
      plus "job_id", and "overall_match_score".
    """
    # (key in the returned dict, section scorer, key holding that section's overall score)
    # NOTE(review): the scorers are not defined in this part of the notebook -- they must
    # already exist in the kernel namespace when this function is called.
    section_specs = (
        ("skills", calculate_overall_skill_match_score, "overall_skill_score"),
        ("education", calculate_overall_education_match_score, "overall_education_score"),
        ("responsibilities", calculate_overall_responsibilities_match_score, "overall_responsibilities_score"),
        ("credentials", calculate_overall_credentials_score, "overall_credentials_score"),
        ("professional_background", calculate_overall_background_score, "overall_professional_background_score"),
    )

    # Run every scorer first, then collect the scores that are actually available.
    section_results = {
        name: scorer(job_json, resume_json)
        for name, scorer, _ in section_specs
    }
    sections = [
        section_results[name][score_key]
        for name, _, score_key in section_specs
        if section_results[name][score_key] is not None
    ]

    overall_match = safe_average(sections)

    # The notebook stores the identifier under "job_id"; "id" is still honored first
    # so job dicts that do carry an "id" behave exactly as before.
    job_id = job_json.get("id")
    if job_id is None:
        job_id = job_json.get("job_id")

    final_result = {"job_id": job_id}
    for name, _, _ in section_specs:
        final_result[name] = {**section_results[name], "job_id": job_id}
    final_result["overall_match_score"] = overall_match

    print(f"\nFinal Overall Match Score: {overall_match}")
    return final_result

# # ---------------------------
# # Example Usage:
# # (Assuming job_desc_json and resume_json are your lists/Series of job and resume JSON objects)
# result = calculate_overall_match_score(job_json=job_desc_json[47], resume_json=resume_json[2])
# print(result)

In [None]:
# Sanity-check the cached similarity helper: two differently worded phrases
# are each compared against the same reference phrase.
resultsA = nlp_similarity_cached(
    "create customized salesforce applications",
    "developed custom applications",
)
resultsB = nlp_similarity_cached(
    "programmed personalized applications",
    "developed custom applications",
)

print(resultsA, resultsB, sep="\n")

0.8130134741465253
0.7504750887552899


#### Skill Matching

In [3]:
# SECTION 1: SKILLS MATCHING
from utils.mandatory_skill_score import calculate_mandatory_skill_scores
from utils.preferred_skill_score import calculate_preferred_skill_scores

# Score the resume at index 7 against every job description,
# once for mandatory skills and once for preferred skills.
mandatory_skill_scores = calculate_mandatory_skill_scores(
    job_json_list=job_desc_json,
    resume_json=resume_json[7],
)
preferred_skill_scores = calculate_preferred_skill_scores(
    job_json_list=job_desc_json,
    resume_json=resume_json[7],
)

print(mandatory_skill_scores, preferred_skill_scores, sep="\n")

{'84b4a4d4-4fdb-45e8-bd40-c5679947f8b0': {'mandatory_skill_score': 0.5953992251996643}, 'be46fbc9-8e00-4381-a545-9ed71b079a3a': {'mandatory_skill_score': 0.6396935156413488}, '77773fdc-9c4e-4bdf-8e4a-2b82b08061b0': {'mandatory_skill_score': 0.45130528722490587}, '1154d66d-c922-48ea-bb60-59c719b3c77d': {'mandatory_skill_score': 0.6697338918844862}, 'cbbc9bbc-2f21-451f-ac20-01e4f2c611e1': {'mandatory_skill_score': 0.5970853567123414}, '60111acc-73d9-4fe8-8a84-c826d9eeeca4': {'mandatory_skill_score': 0.6772453884283703}, 'f0b34589-98f6-4b6a-a379-394e4e7f088f': {'mandatory_skill_score': 0.0}, 'e301b10c-7e2b-485e-8978-4ee2c9669953': {'mandatory_skill_score': 0.5811061342557273}, 'f12ad9e6-0c44-4c19-8f5d-37c2f3f14c18': {'mandatory_skill_score': 0.5575330257415774}, '9ff8b57c-9f01-448b-9ba9-d7d03b8db9dd': {'mandatory_skill_score': 0.7499877620827068}, '75a94499-c585-4181-b4de-09f1355a18a2': {'mandatory_skill_score': 0.7783831980493335}, 'd54b9f7b-d3ea-49b2-995c-c47d1a78576e': {'mandatory_skil

---- 
#### Responsibilities Matching

In [4]:
from utils.responsibilities_match_score import calculate_responsibilities_scores

# Score the resume at index 7 against the responsibilities of every job description.
responsibilities_match_scores = calculate_responsibilities_scores(
    job_json_list=job_desc_json,
    resume_json=resume_json[7],
)
print(responsibilities_match_scores)

{'84b4a4d4-4fdb-45e8-bd40-c5679947f8b0': {'responsibilities_score': 0.6304489033562799}, 'be46fbc9-8e00-4381-a545-9ed71b079a3a': {'responsibilities_score': 0.5818122070696621}, '77773fdc-9c4e-4bdf-8e4a-2b82b08061b0': {'responsibilities_score': 0.8091779373310232}, '1154d66d-c922-48ea-bb60-59c719b3c77d': {'responsibilities_score': 0.6812220322497098}, 'cbbc9bbc-2f21-451f-ac20-01e4f2c611e1': {'responsibilities_score': 0.7094292442003889}, '60111acc-73d9-4fe8-8a84-c826d9eeeca4': {'responsibilities_score': 0.6397129013950443}, 'f0b34589-98f6-4b6a-a379-394e4e7f088f': {'responsibilities_score': 0.6924689035306031}, 'e301b10c-7e2b-485e-8978-4ee2c9669953': {'responsibilities_score': 0.6674410634570653}, 'f12ad9e6-0c44-4c19-8f5d-37c2f3f14c18': {'responsibilities_score': 0.67803696791331}, '9ff8b57c-9f01-448b-9ba9-d7d03b8db9dd': {'responsibilities_score': 0.6810111679412701}, '75a94499-c585-4181-b4de-09f1355a18a2': {'responsibilities_score': 0.6699562479579262}, 'd54b9f7b-d3ea-49b2-995c-c47d1a78

----- 
#### Education Matching

In [5]:
from utils.mandatory_education_score import calculate_mandatory_education_scores
from utils.preferred_education_score import calculate_preferred_education_scores

# Score the resume at index 7 against every job description,
# once for mandatory education and once for preferred education.
# NOTE(review): in the recorded output some job ids map to a bare 0.0 instead of a
# {'mandatory_education_score': ...} dict -- confirm the return shape in
# utils.mandatory_education_score.
mandatory_education_scores = calculate_mandatory_education_scores(
    job_json_list=job_desc_json,
    resume_json=resume_json[7],
)
preferred_education_scores = calculate_preferred_education_scores(
    job_json_list=job_desc_json,
    resume_json=resume_json[7],
)

print(mandatory_education_scores, preferred_education_scores, sep="\n")

{'84b4a4d4-4fdb-45e8-bd40-c5679947f8b0': 0.0, 'be46fbc9-8e00-4381-a545-9ed71b079a3a': {'mandatory_education_score': None}, '77773fdc-9c4e-4bdf-8e4a-2b82b08061b0': {'mandatory_education_score': 1.0}, '1154d66d-c922-48ea-bb60-59c719b3c77d': {'mandatory_education_score': None}, 'cbbc9bbc-2f21-451f-ac20-01e4f2c611e1': {'mandatory_education_score': None}, '60111acc-73d9-4fe8-8a84-c826d9eeeca4': 0.0, 'f0b34589-98f6-4b6a-a379-394e4e7f088f': {'mandatory_education_score': None}, 'e301b10c-7e2b-485e-8978-4ee2c9669953': 0.0, 'f12ad9e6-0c44-4c19-8f5d-37c2f3f14c18': 0.0, '9ff8b57c-9f01-448b-9ba9-d7d03b8db9dd': {'mandatory_education_score': None}, '75a94499-c585-4181-b4de-09f1355a18a2': 0.0, 'd54b9f7b-d3ea-49b2-995c-c47d1a78576e': 0.0, 'edcd8df0-7dc7-4896-8d28-c435462059b6': {'mandatory_education_score': None}, '2fd8af11-4583-4d06-822b-fad7c0c3bc9a': 0.0, '8bd1bc07-2777-4c8e-89f8-7895e81b0abe': {'mandatory_education_score': 1.0}, '8aa2b635-edbf-4901-bb3f-4d4f2c69e7fe': {'mandatory_education_score': 

----- 
#### Credentials Matching

In [6]:
from utils.mandatory_credentials_score import calculate_mandatory_credentials_scores
from utils.preferred_credentials_score import calculate_preferred_credentials_scores

# Score the resume at index 7 against every job description,
# once for mandatory credentials and once for preferred credentials.
mandatory_credentials_scores = calculate_mandatory_credentials_scores(
    job_json_list=job_desc_json,
    resume_json=resume_json[7],
)
preferred_credentials_scores = calculate_preferred_credentials_scores(
    job_json_list=job_desc_json,
    resume_json=resume_json[7],
)

print(mandatory_credentials_scores, preferred_credentials_scores, sep="\n")

{'84b4a4d4-4fdb-45e8-bd40-c5679947f8b0': {'mandatory_credentials_score': None}, 'be46fbc9-8e00-4381-a545-9ed71b079a3a': {'mandatory_credentials_score': None}, '77773fdc-9c4e-4bdf-8e4a-2b82b08061b0': {'mandatory_credentials_score': 0.0}, '1154d66d-c922-48ea-bb60-59c719b3c77d': {'mandatory_credentials_score': None}, 'cbbc9bbc-2f21-451f-ac20-01e4f2c611e1': {'mandatory_credentials_score': None}, '60111acc-73d9-4fe8-8a84-c826d9eeeca4': {'mandatory_credentials_score': None}, 'f0b34589-98f6-4b6a-a379-394e4e7f088f': {'mandatory_credentials_score': None}, 'e301b10c-7e2b-485e-8978-4ee2c9669953': {'mandatory_credentials_score': None}, 'f12ad9e6-0c44-4c19-8f5d-37c2f3f14c18': {'mandatory_credentials_score': None}, '9ff8b57c-9f01-448b-9ba9-d7d03b8db9dd': {'mandatory_credentials_score': 0.0}, '75a94499-c585-4181-b4de-09f1355a18a2': {'mandatory_credentials_score': None}, 'd54b9f7b-d3ea-49b2-995c-c47d1a78576e': {'mandatory_credentials_score': None}, 'edcd8df0-7dc7-4896-8d28-c435462059b6': {'mandatory_c

----- 
#### Background Matching

In [7]:
from utils.mandatory_background_score import calculate_mandatory_background_scores
from utils.preferred_background_score import calculate_preferred_background_scores

# Score the resume at index 7 against every job description,
# once for mandatory background and once for preferred background.
mandatory_background_scores = calculate_mandatory_background_scores(
    job_json_list=job_desc_json,
    resume_json=resume_json[7],
)
preferred_background_scores = calculate_preferred_background_scores(
    job_json_list=job_desc_json,
    resume_json=resume_json[7],
)

print(mandatory_background_scores, preferred_background_scores, sep="\n")


{'84b4a4d4-4fdb-45e8-bd40-c5679947f8b0': {'mandatory_background_score': 0.0, 'mandatory_industry_score': 0.774607022603353}, 'be46fbc9-8e00-4381-a545-9ed71b079a3a': {'mandatory_background_score': None, 'mandatory_industry_score': None}, '77773fdc-9c4e-4bdf-8e4a-2b82b08061b0': {'mandatory_background_score': 0.6403924739360811, 'mandatory_industry_score': None}, '1154d66d-c922-48ea-bb60-59c719b3c77d': {'mandatory_background_score': 0.0, 'mandatory_industry_score': None}, 'cbbc9bbc-2f21-451f-ac20-01e4f2c611e1': {'mandatory_background_score': 0.6283577080567679, 'mandatory_industry_score': 0.0}, '60111acc-73d9-4fe8-8a84-c826d9eeeca4': {'mandatory_background_score': None, 'mandatory_industry_score': None}, 'f0b34589-98f6-4b6a-a379-394e4e7f088f': {'mandatory_background_score': 0.34746133198340745, 'mandatory_industry_score': None}, 'e301b10c-7e2b-485e-8978-4ee2c9669953': {'mandatory_background_score': 1.0, 'mandatory_industry_score': None}, 'f12ad9e6-0c44-4c19-8f5d-37c2f3f14c18': {'mandatory

---- 

#### Merge Scores into One Object

In [11]:
from utils.merge_scores import merge_scores_by_job_id

# Combine the per-section score dicts into one dict keyed by job id.
# filter/threshold are forwarded to merge_scores_by_job_id -- presumably they drop
# jobs scoring below 0.5; confirm in utils.merge_scores.
# NOTE(review): in the recorded output some entries have no 'mandatory_education_score'
# key at all -- verify how non-dict section values are merged.
final_scores = merge_scores_by_job_id(
    mandatory_background_scores,
    preferred_background_scores,
    mandatory_education_scores,
    preferred_education_scores,
    mandatory_skill_scores,
    preferred_skill_scores,
    mandatory_credentials_scores,
    preferred_credentials_scores,
    responsibilities_match_scores,
    filter=True,
    threshold=0.5,
)

print(final_scores)

{'edcd8df0-7dc7-4896-8d28-c435462059b6': {'mandatory_background_score': None, 'mandatory_industry_score': None, 'preferred_background_score': None, 'preferred_industry_score': None, 'mandatory_education_score': None, 'preferred_education_score': None, 'mandatory_skill_score': 0.7860985994338991, 'preferred_skill_score': None, 'mandatory_credentials_score': None, 'preferred_credentials_score': None, 'responsibilities_score': 0.6937121603460542}, '2fd8af11-4583-4d06-822b-fad7c0c3bc9a': {'mandatory_background_score': None, 'mandatory_industry_score': None, 'preferred_background_score': None, 'preferred_industry_score': None, 'preferred_education_score': None, 'mandatory_skill_score': 0.5150486760669285, 'preferred_skill_score': None, 'mandatory_credentials_score': None, 'preferred_credentials_score': None, 'responsibilities_score': 0.5939985346738942}, '8bd1bc07-2777-4c8e-89f8-7895e81b0abe': {'mandatory_background_score': None, 'mandatory_industry_score': None, 'preferred_background_score

----

#### Overall Scores

In [12]:
from utils.overall_scores import make_overall_scores

# Compute the overall scores from the merged per-job scores.
results = make_overall_scores(final_scores)

In [13]:
# Display the scored jobs; each entry is a (job_id, score dict) pair.
results

[('44221648-dbaf-4b42-af68-0703ebb16835',
  {'mandatory_background_score': None,
   'mandatory_industry_score': None,
   'preferred_background_score': 1.0,
   'preferred_industry_score': 0.7384345531463624,
   'mandatory_education_score': None,
   'preferred_education_score': None,
   'mandatory_skill_score': 0.8346125748422413,
   'preferred_skill_score': 0.9009393200729835,
   'mandatory_credentials_score': None,
   'preferred_credentials_score': None,
   'responsibilities_score': 0.740603051402352,
   'overall_mandatory': 0.7876078131222966,
   'overall_preferred': 0.9504696600364917,
   'overall_score': 0.8690387365793941,
   'overall_skills': 0.8677759474576123,
   'overall_education': None,
   'overall_background': 1.0,
   'overall_credentials': None}),
 ('08d80a05-d86e-4fbf-9203-502dfad0f0f2',
  {'mandatory_background_score': None,
   'mandatory_industry_score': None,
   'preferred_background_score': 1.0,
   'preferred_industry_score': 0.9642459551493329,
   'mandatory_education

In [14]:
# Number of job ids present in the merged score dict.
len(final_scores)

32

#### Eval

In [32]:
# Job ids that made it into the scored results, in result order.
matched_jobids = [job_id for job_id, _ in results]

# Membership is tested against a set so each lookup is O(1) instead of a list scan.
_matched_jobid_lookup = set(matched_jobids)

# Keep the job descriptions whose job_id was scored. Entries that failed JSON
# parsing are None (not dicts) and carry no job_id, so they are skipped.
filtered_jobs = [
    j for j in job_desc_json
    if isinstance(j, dict) and j.get("job_id") in _matched_jobid_lookup
]

In [30]:
# Print the mandatory and preferred professional-background requirements of every job.
for job in job_desc_json:
    # Entries that failed JSON parsing are None (not dicts); there is nothing to print for them.
    if not isinstance(job, dict):
        continue

    job_id = job.get("job_id")
    print(f"Job ID: {job_id}")

    # Same listing for both requirement tiers, mandatory first.
    for label, section in (("Mandatory", "mandatory"), ("Preferred", "preferred")):
        # `or` guards against a tier (or its background list) that is present but None.
        backgrounds = (job.get(section) or {}).get("professional_background") or []
        for i, bg_req in enumerate(backgrounds, start=1):
            print(f"  {label} Background #{i}: {bg_req}")

Job ID: cc807c45-4a2f-4335-90d5-b50669459cc5
  Mandatory Background #1: {'industry': [], 'minyears': [8], 'background': [['Oracle Cloud ERP']]}
Job ID: 1400a854-77d4-4aab-a84c-9a643f100657
  Mandatory Background #1: {'industry': [], 'minyears': [8], 'background': [['relevant engineering hands-on work']]}
Job ID: 72f7a93e-824e-485f-b356-58c523fe65d0
  Mandatory Background #1: {'industry': ['financial services'], 'minyears': [5], 'background': [['Marketing Analytics'], ['similar practical experience']]}
  Preferred Background #1: {'industry': ['financial services', 'mortgage', 'credit card', 'personal loans', 'business loans'], 'minyears': [0], 'background': [['Marketing Analytics'], ['similar practical experience']]}
Job ID: ba72461b-56f5-4c1b-90b0-3c3b9ffa8a1c
  Mandatory Background #1: {'industry': [], 'minyears': [0], 'background': [["Bachelor's degree", 'computer science'], ['Software Engineering']]}
  Mandatory Background #2: {'industry': [], 'minyears': [0], 'background': [['3rd p