In [1]:
%%capture
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util
import json
import spacy
from functools import lru_cache
import torch

# Matching Algorithm Required Functions
from utils.semantic_similarity import nlp_similarity_cached 
from utils.semantic_similarity import sentence_model 
from utils.mandatory_skill_score import calculate_mandatory_skill_scores
from utils.preferred_skill_score import calculate_preferred_skill_scores
from utils.responsibilities_match_score import calculate_responsibilities_scores
from utils.mandatory_education_score import calculate_mandatory_education_scores
from utils.preferred_education_score import calculate_preferred_education_scores
from utils.mandatory_credentials_score import calculate_mandatory_credentials_scores
from utils.preferred_credentials_score import calculate_preferred_credentials_scores
from utils.mandatory_background_score import calculate_mandatory_background_scores
from utils.preferred_background_score import calculate_preferred_background_scores
from utils.merge_scores import merge_scores_by_job_id

### Data Ingestion (To be Removed in Production)
current_wd = os.getcwd()

def safe_json_loads(val):
    try:
        return json.loads(val)
    except json.JSONDecodeError:
        print(f"Error parsing JSON: {val}")
        return None

job_desc_data = pd.read_excel(os.path.join(current_wd, "data", "sample_des_extractions_test_final_3.25.25.xlsx"))
resume_data = pd.read_excel(os.path.join(current_wd, "data", "sample_res_extractions_final_3.26.25.xlsx"))

job_desc_data["job_desc_parsed"] = job_desc_data["extracted"].apply(safe_json_loads)
resume_data["resume_parsed"] = resume_data["extracted"].apply(safe_json_loads)

job_desc_json = job_desc_data['job_desc_parsed']
resume_json = resume_data['resume_parsed']

def append_job_id_to_job_desc(row):
    job_id = row['id']
    job_desc = row['job_desc_parsed']

    if job_desc is None:
        return None
    
    if isinstance(job_desc, str):
        job_desc = safe_json_loads(job_desc)
        if job_desc is None:
            return None
    
    job_desc['job_id'] = job_id
    return job_desc

job_desc_data['job_desc_json'] = job_desc_data.apply(append_job_id_to_job_desc, axis=1)

def extract_all_strings_from_json(j):
    """
    Recursively extracts all non-empty strings from a JSON-like object (dict or list).
    """
    result = []
    if isinstance(j, dict):
        for v in j.values():
            result.extend(extract_all_strings_from_json(v))
    elif isinstance(j, list):
        for item in j:
            if isinstance(item, str):
                s = item.strip()
                if s:
                    result.append(s)
            else:
                result.extend(extract_all_strings_from_json(item))
    return result

def precompute_embeddings_for_json_list(json_list, batch_size=64):
    """
    Iterates over a list (or Series) of JSON objects, extracts all non-empty strings
    from each JSON, computes embeddings for all unique strings in batches, and returns
    a dictionary mapping each unique string to its embedding.
    
    Parameters:
      json_list (iterable): A list (or Series) of JSON objects (dicts).
      batch_size (int): Batch size for encoding.
      
    Returns:
      dict: A mapping from string to its embedding.
    """
    all_strings = set()
    for j in json_list:
        if j is None:
            continue
        strings = extract_all_strings_from_json(j)
        all_strings.update(strings)
    
    all_strings = list(all_strings)
    if all_strings:
        embeddings = sentence_model.encode(all_strings, batch_size=batch_size, convert_to_tensor=False)
    else:
        embeddings = np.zeros((0, sentence_model.get_sentence_embedding_dimension()))
    
    string_to_emb = {s: emb for s, emb in zip(all_strings, embeddings)}
    return string_to_emb

embeddings = precompute_embeddings_for_json_list(job_desc_json, batch_size=64)

#### End of Data Ingestion (To be Removed in Production)


In [None]:
canidate_resume_JSON  = resume_json[7]
job_desc_json_lst = job_desc_json[0:100]

print(f"Total Length of Sample: {len(job_desc_json_lst)}")

# Run Match Score 
# Stage 1: 
# 1.1. Calculate credentials scores
mandatory_credentials_scores = calculate_mandatory_credentials_scores(job_json_list=job_desc_json_lst, resume_json=canidate_resume_JSON)
preferred_credentials_scores = calculate_preferred_credentials_scores(job_json_list=job_desc_json_lst, resume_json=canidate_resume_JSON)

# 1.2. Calculate education scores
mandatory_education_scores = calculate_mandatory_education_scores(job_json_list=job_desc_json_lst, resume_json=canidate_resume_JSON)
preferred_education_scores = calculate_preferred_education_scores(job_json_list=job_desc_json_lst, resume_json=canidate_resume_JSON)

# 1.3. Calculate background scores
mandatory_background_scores = calculate_mandatory_background_scores(job_json_list=job_desc_json_lst, resume_json=canidate_resume_JSON)
preferred_background_scores = calculate_preferred_background_scores(job_json_list=job_desc_json_lst, resume_json=canidate_resume_JSON)

# Stage 1.5: Filter after the first stage
stage1_scores = merge_scores_by_job_id(mandatory_background_scores, 
                                      preferred_background_scores, 
                                      mandatory_education_scores,
                                      preferred_education_scores,
                                      mandatory_credentials_scores,
                                      preferred_credentials_scores,
                                      filter = True,
                                      threshold= 0.5
                                      ) 

stage_job_desc_json_lst = [job for job in job_desc_json_lst if job.get('job_id') in set(stage_scores.keys())]

print(f"Total Length of Sample after Stage 1: {len(stage_job_desc_json_lst)}")

# Stage 2:
# 2.1. Calculate responsibilities scores
mandatory_responsibilities_scores = calculate_responsibilities_scores(job_json_list=stage_job_desc_json_lst, resume_json=canidate_resume_JSON)

# 2.2. Calculate skills scores
mandatory_skills_scores = calculate_mandatory_skill_scores(job_json_list=stage_job_desc_json_lst, resume_json=canidate_resume_JSON)
preferred_skills_scores = calculate_preferred_skill_scores(job_json_list=stage_job_desc_json_lst, resume_json=canidate_resume_JSON)

# Stage 2.5: Filter after the second stage
stage2_scores = merge_scores_by_job_id(stage1_scores,
                                    mandatory_responsibilities_scores, 
                                      mandatory_skills_scores, 
                                      preferred_skills_scores,
                                      filter = True,
                                      threshold= 0.5
                                      )



Total Length of Sample: 100
Total Length of Sample after Stage 1: 34


In [3]:
final_scores

{'cbbc9bbc-2f21-451f-ac20-01e4f2c611e1': {'mandatory_background_score': 0.7051542997360233,
  'mandatory_industry_score': 0.6039740244547528,
  'preferred_background_score': None,
  'preferred_industry_score': None,
  'mandatory_education_score': None,
  'preferred_education_score': None,
  'mandatory_credentials_score': None,
  'preferred_credentials_score': None},
 'edcd8df0-7dc7-4896-8d28-c435462059b6': {'mandatory_background_score': None,
  'mandatory_industry_score': None,
  'preferred_background_score': None,
  'preferred_industry_score': None,
  'mandatory_education_score': None,
  'preferred_education_score': None,
  'mandatory_credentials_score': None,
  'preferred_credentials_score': None},
 '2fd8af11-4583-4d06-822b-fad7c0c3bc9a': {'mandatory_background_score': None,
  'mandatory_industry_score': None,
  'preferred_background_score': None,
  'preferred_industry_score': None,
  'mandatory_education_score': 0.7307955225308739,
  'preferred_education_score': None,
  'mandatory_c

In [4]:
job_desc_json[0]

{'details': {'wage': [{'max': 120000,
    'min': 100000,
    'pay_type': 'Salary',
    'wage_city': [],
    'wage_state': []}],
  'benefits': {'fsa': False,
   'hsa': False,
   'bonus': False,
   'other': [],
   'dental': False,
   'equity': False,
   'vision': False,
   'medical': False,
   '401k_match': False,
   'mental_health': False,
   'unlimited_pto': False,
   'tuition_reimbursement': False},
  'location': [{'city': '', 'state': '', 'country': 'Remote'}],
  'job_title': ['Sr. Ab Initio Developer'],
  'tax_terms': ['Direct-hire'],
  'wfh_policy': ['Remote'],
  'company_name': [],
  'company_stage': [],
  'work_schedule': [],
  'job_title_base': ['Senior Ab Initio Developer'],
  'employment_type': ['Full-time'],
  'travel_required': {'required': False, 'hours_weekly': 0},
  'company_industry': [],
  'experience_level': ['Senior'],
  'work_authorization': []},
 'mandatory': {'education': [{'field_of_study': ['Computer Science',
     'Related'],
    'education_level': ["Bachelor's"

In [6]:
filtered_jobs = [job for job in job_desc_json if job.get('job_id') in set(final_scores.keys())]

filtered_jobs

[{'details': {'wage': [],
   'benefits': {'fsa': False,
    'hsa': False,
    'bonus': False,
    'other': [],
    'dental': False,
    'equity': False,
    'vision': False,
    'medical': False,
    '401k_match': False,
    'mental_health': False,
    'unlimited_pto': False,
    'tuition_reimbursement': False},
   'location': [{'city': 'Windsor', 'state': 'CT', 'country': 'US'}],
   'job_title': ['Salesforce Business Analyst with SCM/ logistics domain Experience'],
   'tax_terms': ['Direct-hire contract',
    'Contract Corp-to-Corp',
    'Contract W2',
    'Full-time'],
   'wfh_policy': ['Onsite'],
   'company_name': ['Techgene'],
   'company_stage': [],
   'work_schedule': [],
   'job_title_base': ['Salesforce Business Analyst'],
   'employment_type': [],
   'travel_required': {'required': False, 'hours_weekly': 0},
   'company_industry': [],
   'experience_level': [],
   'work_authorization': []},
  'mandatory': {'education': [],
   'credentials': [],
   'hard_skills': [{'skill': [[

In [7]:
len(filtered_jobs)

34