In [None]:
# %%capture
import os
import pandas as pd
import numpy as np
import json

# # Matching Algorithm Required Functions
from utils.semantic_similarity import sentence_model 

### Data Ingestion (To be Removed in Production)
# Base directory for the input files: the process's current working directory.
# The Excel files read further down are looked up under <current_wd>/data.
current_wd = os.getcwd()

def append_job_id_to_job_desc(row):
    """
    Attach a row's job id to its parsed job description.

    Parameters:
      row (mapping): A row (e.g. a DataFrame row) exposing 'id' and
        'job_desc_parsed'. 'job_desc_parsed' may be a dict, a JSON string,
        or a missing value (None / NaN).

    Returns:
      dict or None: The job description dict with a 'job_id' key added, or
      None when the description is missing, cannot be parsed, or is not a
      JSON object. The dict is updated in place, so the object stored in the
      row is the same object that is returned.
    """
    job_id = row['id']
    job_desc = row['job_desc_parsed']

    if isinstance(job_desc, str):
        # Forward the id so a parse failure is reported with the offending job.
        job_desc = safe_json_loads(job_desc, job_id=job_id)

    # Covers None, pandas' NaN placeholder for empty cells, failed parses and
    # JSON values that are not objects (lists, numbers, ...): none of these can
    # carry a 'job_id' key.
    if not isinstance(job_desc, dict):
        return None

    job_desc['job_id'] = job_id
    return job_desc

def safe_json_loads(val, job_id=None):
    """
    Parse a JSON document, returning None instead of raising on bad input.

    Parameters:
      val: The value to parse. Normally a JSON string; anything json.loads
        rejects (malformed text, None, a float NaN from an empty spreadsheet
        cell, ...) is reported and mapped to None.
      job_id: Optional identifier included in the error message.

    Returns:
      The decoded Python object, or None when parsing fails.
    """
    try:
        return json.loads(val)
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError (malformed text); TypeError is
        # what json.loads raises for non-string input such as None or NaN.
        # Compare against None so that a legitimate id of 0 is still reported.
        if job_id is not None:
            print(f"Error parsing JSON for job_id {job_id}: {val}")
        else:
            print(f"Error parsing JSON: {val}")
        return None


# Load the labeled job descriptions and the extracted resumes from <current_wd>/data.
job_desc_data = pd.read_excel(os.path.join(current_wd, "data", "10k_job_descriptions_labeled.xlsx"))
resume_data = pd.read_excel(os.path.join(current_wd, "data", "sample_res_extractions_final_3.26.25.xlsx"))

# Decode the raw JSON text of each row; rows that fail to parse become None.
job_desc_data["job_desc_parsed"] = job_desc_data["extracted"].apply(safe_json_loads)
resume_data["resume_parsed"] = resume_data["extracted"].apply(safe_json_loads)

# Keep only rows that parsed. The explicit .copy() makes each result an
# independent frame, so adding a column below does not write into a slice of
# the unfiltered frame (which pandas flags with SettingWithCopyWarning).
job_desc_data = job_desc_data[job_desc_data["job_desc_parsed"].apply(lambda x: x is not None)].copy()
resume_data = resume_data[resume_data["resume_parsed"].apply(lambda x: x is not None)].copy()

job_desc_json = job_desc_data['job_desc_parsed']
resume_json = resume_data['resume_parsed']

# Second, independent decode of the raw text, this time tagged with the job id
# for error reporting. Built as a plain list rather than a row-wise
# DataFrame.apply so that it also works when no rows survived the filter.
job_desc_data["job_desc_json"] = [
    safe_json_loads(extracted, job_id=job_id)
    for extracted, job_id in zip(job_desc_data["extracted"], job_desc_data["id"])
]

def extract_all_strings_from_json(j):
    """
    Recursively extracts all non-empty strings from a JSON-like object.

    Strings are collected wherever they occur: as the object itself, as dict
    values, or as list items, at any nesting depth. Each string is stripped of
    surrounding whitespace and dropped if nothing remains. Dict keys and
    non-string scalars (numbers, booleans, None) are ignored.

    Parameters:
      j: A decoded JSON value (dict, list, str, number, bool or None).

    Returns:
      list[str]: The stripped strings in traversal order (duplicates kept).
    """
    result = []
    if isinstance(j, str):
        # Handling strings here (rather than only inside the list branch) is
        # what makes string-valued dict entries such as {"pay_type": "Salary"}
        # visible to the traversal.
        s = j.strip()
        if s:
            result.append(s)
    elif isinstance(j, dict):
        for v in j.values():
            result.extend(extract_all_strings_from_json(v))
    elif isinstance(j, list):
        for item in j:
            result.extend(extract_all_strings_from_json(item))
    return result

def precompute_embeddings_for_json_list(json_list, batch_size=64):
    """
    Build a string -> embedding lookup for a collection of JSON objects.

    Every entry that is not None is scanned with extract_all_strings_from_json,
    the strings found are de-duplicated, and the distinct strings are encoded
    with `sentence_model` in one call.

    Parameters:
      json_list (iterable): A list (or Series) of JSON objects (dicts).
      batch_size (int): Batch size handed to the encoder.

    Returns:
      dict: A mapping from string to its embedding. Empty when no strings
      were found.
    """
    unique_strings = set()
    for document in json_list:
        if document is not None:
            unique_strings.update(extract_all_strings_from_json(document))

    ordered_strings = list(unique_strings)

    if not ordered_strings:
        # Nothing to encode: use a zero-row array so the zip below yields no pairs.
        vectors = np.zeros((0, sentence_model.get_sentence_embedding_dimension()))
    else:
        vectors = sentence_model.encode(ordered_strings, batch_size=batch_size, convert_to_tensor=False)

    return dict(zip(ordered_strings, vectors))

# Embed, once and up front, the strings found in the parsed job descriptions;
# `embeddings` maps each unique string to its vector.
embeddings = precompute_embeddings_for_json_list(job_desc_json, batch_size=64)

#### End of Data Ingestion (To be Removed in Production)


Error parsing JSON: {"details":{"
Error parsing JSON: {"details":{"wage":[{"max":146875,"min":81250,"pay_type":"Salary","wage_city":[],"wage_state":[]}],"benefits":{"fsa":false,"hsa":false,"bonus":false,"other":[],"dental":false,"equity":false,"vision":false,"medical":false,"401k_match":false,"mental_health":false,"unlimited_pto":false,"tuition_reimbursement":false},"location":[],"job_title":["HRSTS Oracle Software Engineer"],"tax_terms":["Direct-hire"],"wfh_policy":["Remote"],"company_name":["Leidos"],"company_stage":[],"work_schedule":[],"job_title_base":["Oracle Software Engineer"],"employment_type":["Full-time"],"travel_required":{"required":false,"hours_weekly":0},"company_industry":[],"experience_level":["Senior"],"work_authorization":["Does Not Offer Sponsorship"]},"mandatory":{"education":[{"field_of_study":[],"education_level":["Bachelor's","Master's"]}],"credentials":[{"credential":["Security+"]}],"hard_skills":[{"skill":["IT working with application systems","a medium to lar

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Collect one base title per job description.
# Assume your JSON data is in a variable named 'job_desc_json', as a list of dicts
job_title_bases = []

for job in job_desc_json:
    if job is None:
        continue
    base_titles = job.get('details', {}).get('job_title_base', [])

    # 'job_title_base' is usually a flat list of strings, but some records nest
    # each title in its own list (e.g. [['Java Developer'], ['Software Engineer']]).
    # Unwrap until a scalar is reached so that the column only holds hashable
    # strings; value_counts() cannot count list values.
    first_title = base_titles
    while isinstance(first_title, list):
        first_title = first_title[0] if first_title else None

    if isinstance(first_title, str) and first_title:
        job_title_bases.append(first_title)

# Convert to a DataFrame for convenience
df_titles = pd.DataFrame(job_title_bases, columns=['job_title_base'])

title_counts = df_titles['job_title_base'].value_counts().reset_index()
title_counts.columns = ['job_title_base', 'count']

# Limit to top 20 titles for clarity
top_titles = title_counts.head(20)

plt.figure(figsize=(12, 8))
sns.barplot(data=top_titles, y='job_title_base', x='count', palette='viridis')

plt.title('Distribution of Top 20 Job Titles')
plt.xlabel('Count')
plt.ylabel('Job Title Base')

plt.tight_layout()
plt.show()

['Senior Software Engineer']

In [29]:
# Inspect the parsed job descriptions (pandas abbreviates the Series repr).
job_desc_json

0       {'details': {'wage': [{'max': 0, 'min': 0, 'pa...
1       {'details': {'wage': [], 'benefits': {'fsa': F...
2       {'details': {'wage': [], 'benefits': {'fsa': F...
3       {'details': {'wage': [], 'benefits': {'fsa': F...
4       {'details': {'wage': [], 'benefits': {'fsa': F...
                              ...                        
9995    {'details': {'wage': [{'max': 0, 'min': 0, 'pa...
9996    {'details': {'wage': [{'max': 222000, 'min': 8...
9997    {'details': {'wage': [{'max': 155000, 'min': 1...
9998    {'details': {'wage': [{'max': 95000, 'min': 62...
9999    {'details': {'wage': [], 'benefits': {'fsa': F...
Name: job_desc_parsed, Length: 10000, dtype: object

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch 

def flatten_list(nested_list):
    """
    Return the non-list elements of `nested_list` as one flat list.

    Lists nested at any depth are expanded in place, preserving left-to-right
    order; every other element (including tuples) is kept as is.
    """
    def _walk(items):
        for entry in items:
            if isinstance(entry, list):
                yield from _walk(entry)
            else:
                yield entry

    return list(_walk(nested_list))

def find_similar_jobs(job_desc_json, input_job_title, similarity_threshold=0.9):
    """
    Return the jobs whose base title is semantically close to `input_job_title`.

    Each distinct base title is embedded once with `sentence_model` and compared
    to the query by cosine similarity; a job's score is the highest similarity
    among its own titles.

    Parameters:
      job_desc_json (iterable): Parsed job descriptions (dicts). Entries that
        are not dicts, or that have no usable details -> job_title_base value,
        are skipped.
      input_job_title (str): Title to search for.
      similarity_threshold (float): Minimum score (inclusive) for a job to be kept.

    Returns:
      list[dict]: {"job": <job dict>, "similarity": <float>} records sorted by
      descending similarity. Empty when nothing qualifies.
    """
    # Pass 1: collect the usable titles of every job without touching the model.
    jobs_with_titles = []
    for job in job_desc_json:
        if not isinstance(job, dict):
            continue
        details = job.get('details')
        if not isinstance(details, dict):
            continue

        raw_titles = details.get('job_title_base') or []
        if isinstance(raw_titles, str):
            # A bare string would otherwise be flattened character by character.
            raw_titles = [raw_titles]
        elif not isinstance(raw_titles, list):
            continue

        # Titles may be nested ([['A'], ['B']]); keep only non-blank strings so
        # the encoder never receives None or numbers.
        titles = [t for t in flatten_list(raw_titles) if isinstance(t, str) and t.strip()]
        if titles:
            jobs_with_titles.append((job, titles))

    if not jobs_with_titles:
        return []

    # Pass 2: embed each distinct title once (one batched encode call instead of
    # one call per job) and score it against the query.
    unique_titles = list(dict.fromkeys(t for _, titles in jobs_with_titles for t in titles))
    input_embedding = sentence_model.encode(input_job_title, convert_to_tensor=True)
    title_embeddings = sentence_model.encode(unique_titles, convert_to_tensor=True)
    # cos_sim returns a (1, n_titles) matrix for a single query vector.
    scores = util.cos_sim(input_embedding, title_embeddings)[0].tolist()
    score_by_title = dict(zip(unique_titles, scores))

    # Pass 3: a job matches when its best-scoring title reaches the threshold.
    matching_jobs = []
    for job, titles in jobs_with_titles:
        max_similarity = max(score_by_title[t] for t in titles)
        if max_similarity >= similarity_threshold:
            matching_jobs.append({
                "job": job,
                "similarity": max_similarity
            })

    matching_jobs.sort(key=lambda match: match['similarity'], reverse=True)
    return matching_jobs

# Usage example: search the parsed job descriptions for titles close to the query.
input_job_title = "Software Engineer"
results = find_similar_jobs(job_desc_json, input_job_title)

# Extract only the JSON job descriptions (drop the similarity scores):
matched_jobs_json_only = [match['job'] for match in results]

# Sanity check: report how many jobs matched and show the base titles of the first few.
print(f"Number of matched jobs: {len(matched_jobs_json_only)}")
for job in matched_jobs_json_only[:5]:  # First 5 jobs as example
    print(job['details']['job_title_base'])

In [64]:
# Convert directly to Pandas Series (one element per match record in `results`)
# NOTE(review): the recorded output of this cell is
# "TypeError: tuple indices must be integers or slices, not str", which suggests
# `results` held tuples when the cell last ran - presumably left over from a
# different version of find_similar_jobs. Re-run the defining cell first and confirm.
results_series = pd.Series(results)

# If you only want the JSON job descriptions:
# each element is expected to be a dict with a 'job' key.
matched_jobs_json_only = results_series.apply(lambda x: x['job'])

# Display the Series:
print(matched_jobs_json_only)

TypeError: tuple indices must be integers or slices, not str

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch 

def flatten_list(nested_list):
    """
    Return the non-list elements of `nested_list` as one flat list.

    Lists nested at any depth are expanded in place, preserving left-to-right
    order; every other element (including tuples) is kept as is.
    """
    def _walk(items):
        for entry in items:
            if isinstance(entry, list):
                yield from _walk(entry)
            else:
                yield entry

    return list(_walk(nested_list))

def find_similar_jobs(job_desc_json, input_job_title, similarity_threshold=0.8):
    """
    Return the jobs whose base title is semantically close to `input_job_title`.

    Each distinct base title is embedded once with `sentence_model` and compared
    to the query by cosine similarity; a job's score is the highest similarity
    among its own titles.

    Parameters:
      job_desc_json (iterable): Parsed job descriptions (dicts). Entries that
        are not dicts, or that have no usable details -> job_title_base value,
        are skipped.
      input_job_title (str): Title to search for.
      similarity_threshold (float): Minimum score (inclusive) for a job to be kept.

    Returns:
      list[dict]: {"job": <job dict>, "similarity": <float>} records sorted by
      descending similarity. Empty when nothing qualifies.
    """
    # Pass 1: collect the usable titles of every job without touching the model.
    jobs_with_titles = []
    for job in job_desc_json:
        if not isinstance(job, dict):
            continue
        details = job.get('details')
        if not isinstance(details, dict):
            continue

        raw_titles = details.get('job_title_base') or []
        if isinstance(raw_titles, str):
            # A bare string would otherwise be flattened character by character.
            raw_titles = [raw_titles]
        elif not isinstance(raw_titles, list):
            continue

        # Titles may be nested ([['A'], ['B']]); keep only non-blank strings so
        # the encoder never receives None or numbers.
        titles = [t for t in flatten_list(raw_titles) if isinstance(t, str) and t.strip()]
        if titles:
            jobs_with_titles.append((job, titles))

    if not jobs_with_titles:
        return []

    # Pass 2: embed each distinct title once (one batched encode call instead of
    # one call per job) and score it against the query.
    unique_titles = list(dict.fromkeys(t for _, titles in jobs_with_titles for t in titles))
    input_embedding = sentence_model.encode(input_job_title, convert_to_tensor=True)
    title_embeddings = sentence_model.encode(unique_titles, convert_to_tensor=True)
    # cos_sim returns a (1, n_titles) matrix for a single query vector.
    scores = util.cos_sim(input_embedding, title_embeddings)[0].tolist()
    score_by_title = dict(zip(unique_titles, scores))

    # Pass 3: a job matches when its best-scoring title reaches the threshold.
    matching_jobs = []
    for job, titles in jobs_with_titles:
        max_similarity = max(score_by_title[t] for t in titles)
        if max_similarity >= similarity_threshold:
            matching_jobs.append({
                "job": job,
                "similarity": max_similarity
            })

    matching_jobs.sort(key=lambda match: match['similarity'], reverse=True)
    return matching_jobs

# Example usage: search the parsed job descriptions for titles close to the query.
input_job_title = "Software Engineer"
results = find_similar_jobs(job_desc_json, input_job_title)

# Print results: one line per match with its base titles and similarity score,
# in the order returned by find_similar_jobs.
for match in results:
    print(match['job']['details']['job_title_base'], match['similarity'])


[['Java Developer'], ['Software Engineer']] 1.000000238418579
[['Bigdata Engineer'], ['Software Engineer'], ['System Analyst']] 1.0000001192092896
['Software Engineer', 'Senior Data Analysis Engineer', 'Senior Architect Engineer'] 1.0000001192092896
['Software Engineer', 'Senior Software Engineer'] 1.0000001192092896
['Software Engineer', 'Informatica Developer'] 1.0000001192092896
['Software Engineer', 'Senior Software Engineer'] 1.0000001192092896
['Software Engineer', 'Full Stack Developer'] 1.0000001192092896
['Software Engineer', 'Senior Software Engineer'] 1.0000001192092896
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch 

def flatten_list(nested_list):
    """
    Return the non-list elements of `nested_list` as one flat list.

    Lists nested at any depth are expanded in place, preserving left-to-right
    order; every other element (including tuples) is kept as is.
    """
    def _walk(items):
        for entry in items:
            if isinstance(entry, list):
                yield from _walk(entry)
            else:
                yield entry

    return list(_walk(nested_list))

def find_similar_jobs(job_desc_json, input_job_title, similarity_threshold=0.8):
    """
    Return the jobs whose base title is semantically close to `input_job_title`.

    Each distinct base title is embedded once with `sentence_model` and compared
    to the query by cosine similarity; a job's score is the highest similarity
    among its own titles.

    Parameters:
      job_desc_json (iterable): Parsed job descriptions (dicts). Entries that
        are not dicts, or that have no usable details -> job_title_base value,
        are skipped.
      input_job_title (str): Title to search for.
      similarity_threshold (float): Minimum score (inclusive) for a job to be kept.

    Returns:
      list[dict]: {"job": <job dict>, "similarity": <float>} records sorted by
      descending similarity. Empty when nothing qualifies.
    """
    # Pass 1: collect the usable titles of every job without touching the model.
    jobs_with_titles = []
    for job in job_desc_json:
        if not isinstance(job, dict):
            continue
        details = job.get('details')
        if not isinstance(details, dict):
            continue

        raw_titles = details.get('job_title_base') or []
        if isinstance(raw_titles, str):
            # A bare string would otherwise be flattened character by character.
            raw_titles = [raw_titles]
        elif not isinstance(raw_titles, list):
            continue

        # Titles may be nested ([['A'], ['B']]); keep only non-blank strings so
        # the encoder never receives None or numbers.
        titles = [t for t in flatten_list(raw_titles) if isinstance(t, str) and t.strip()]
        if titles:
            jobs_with_titles.append((job, titles))

    if not jobs_with_titles:
        return []

    # Pass 2: embed each distinct title once (one batched encode call instead of
    # one call per job) and score it against the query.
    unique_titles = list(dict.fromkeys(t for _, titles in jobs_with_titles for t in titles))
    input_embedding = sentence_model.encode(input_job_title, convert_to_tensor=True)
    title_embeddings = sentence_model.encode(unique_titles, convert_to_tensor=True)
    # cos_sim returns a (1, n_titles) matrix for a single query vector.
    scores = util.cos_sim(input_embedding, title_embeddings)[0].tolist()
    score_by_title = dict(zip(unique_titles, scores))

    # Pass 3: a job matches when its best-scoring title reaches the threshold.
    matching_jobs = []
    for job, titles in jobs_with_titles:
        max_similarity = max(score_by_title[t] for t in titles)
        if max_similarity >= similarity_threshold:
            matching_jobs.append({
                "job": job,
                "similarity": max_similarity
            })

    matching_jobs.sort(key=lambda match: match['similarity'], reverse=True)
    return matching_jobs

# Example usage: search the parsed job descriptions for titles close to the query.
input_job_title = "Software Engineer"
results = find_similar_jobs(job_desc_json, input_job_title)

# Print results: one line per match with its base titles and similarity score,
# in the order returned by find_similar_jobs.
for match in results:
    print(match['job']['details']['job_title_base'], match['similarity'])


[['Java Developer'], ['Software Engineer']] 1.000000238418579
[['Bigdata Engineer'], ['Software Engineer'], ['System Analyst']] 1.0000001192092896
['Software Engineer', 'Senior Data Analysis Engineer', 'Senior Architect Engineer'] 1.0000001192092896
['Software Engineer', 'Senior Software Engineer'] 1.0000001192092896
['Software Engineer', 'Informatica Developer'] 1.0000001192092896
['Software Engineer', 'Senior Software Engineer'] 1.0000001192092896
['Software Engineer', 'Full Stack Developer'] 1.0000001192092896
['Software Engineer', 'Senior Software Engineer'] 1.0000001192092896
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch 

def flatten_list(nested_list):
    """
    Return the non-list elements of `nested_list` as one flat list.

    Lists nested at any depth are expanded in place, preserving left-to-right
    order; every other element (including tuples) is kept as is.
    """
    def _walk(items):
        for entry in items:
            if isinstance(entry, list):
                yield from _walk(entry)
            else:
                yield entry

    return list(_walk(nested_list))

def find_similar_jobs(job_desc_json, input_job_title, similarity_threshold=0.8):
    """
    Return the jobs whose base title is semantically close to `input_job_title`.

    Each distinct base title is embedded once with `sentence_model` and compared
    to the query by cosine similarity; a job's score is the highest similarity
    among its own titles.

    Parameters:
      job_desc_json (iterable): Parsed job descriptions (dicts). Entries that
        are not dicts, or that have no usable details -> job_title_base value,
        are skipped.
      input_job_title (str): Title to search for.
      similarity_threshold (float): Minimum score (inclusive) for a job to be kept.

    Returns:
      list[dict]: {"job": <job dict>, "similarity": <float>} records sorted by
      descending similarity. Empty when nothing qualifies.
    """
    # Pass 1: collect the usable titles of every job without touching the model.
    jobs_with_titles = []
    for job in job_desc_json:
        if not isinstance(job, dict):
            continue
        details = job.get('details')
        if not isinstance(details, dict):
            continue

        raw_titles = details.get('job_title_base') or []
        if isinstance(raw_titles, str):
            # A bare string would otherwise be flattened character by character.
            raw_titles = [raw_titles]
        elif not isinstance(raw_titles, list):
            continue

        # Titles may be nested ([['A'], ['B']]); keep only non-blank strings so
        # the encoder never receives None or numbers.
        titles = [t for t in flatten_list(raw_titles) if isinstance(t, str) and t.strip()]
        if titles:
            jobs_with_titles.append((job, titles))

    if not jobs_with_titles:
        return []

    # Pass 2: embed each distinct title once (one batched encode call instead of
    # one call per job) and score it against the query.
    unique_titles = list(dict.fromkeys(t for _, titles in jobs_with_titles for t in titles))
    input_embedding = sentence_model.encode(input_job_title, convert_to_tensor=True)
    title_embeddings = sentence_model.encode(unique_titles, convert_to_tensor=True)
    # cos_sim returns a (1, n_titles) matrix for a single query vector.
    scores = util.cos_sim(input_embedding, title_embeddings)[0].tolist()
    score_by_title = dict(zip(unique_titles, scores))

    # Pass 3: a job matches when its best-scoring title reaches the threshold.
    matching_jobs = []
    for job, titles in jobs_with_titles:
        max_similarity = max(score_by_title[t] for t in titles)
        if max_similarity >= similarity_threshold:
            matching_jobs.append({
                "job": job,
                "similarity": max_similarity
            })

    matching_jobs.sort(key=lambda match: match['similarity'], reverse=True)
    return matching_jobs

# Example usage: search the parsed job descriptions for titles close to the query.
input_job_title = "Software Engineer"
results = find_similar_jobs(job_desc_json, input_job_title)

# Print results: one line per match with its base titles and similarity score,
# in the order returned by find_similar_jobs.
for match in results:
    print(match['job']['details']['job_title_base'], match['similarity'])


[['Java Developer'], ['Software Engineer']] 1.000000238418579
[['Bigdata Engineer'], ['Software Engineer'], ['System Analyst']] 1.0000001192092896
['Software Engineer', 'Senior Data Analysis Engineer', 'Senior Architect Engineer'] 1.0000001192092896
['Software Engineer', 'Senior Software Engineer'] 1.0000001192092896
['Software Engineer', 'Informatica Developer'] 1.0000001192092896
['Software Engineer', 'Senior Software Engineer'] 1.0000001192092896
['Software Engineer', 'Full Stack Developer'] 1.0000001192092896
['Software Engineer', 'Senior Software Engineer'] 1.0000001192092896
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch 

def flatten_list(nested_list):
    """
    Return the non-list elements of `nested_list` as one flat list.

    Lists nested at any depth are expanded in place, preserving left-to-right
    order; every other element (including tuples) is kept as is.
    """
    def _walk(items):
        for entry in items:
            if isinstance(entry, list):
                yield from _walk(entry)
            else:
                yield entry

    return list(_walk(nested_list))

def find_similar_jobs(job_desc_json, input_job_title, similarity_threshold=0.8):
    """
    Return the jobs whose base title is semantically close to `input_job_title`.

    Each distinct base title is embedded once with `sentence_model` and compared
    to the query by cosine similarity; a job's score is the highest similarity
    among its own titles.

    Parameters:
      job_desc_json (iterable): Parsed job descriptions (dicts). Entries that
        are not dicts, or that have no usable details -> job_title_base value,
        are skipped.
      input_job_title (str): Title to search for.
      similarity_threshold (float): Minimum score (inclusive) for a job to be kept.

    Returns:
      list[dict]: {"job": <job dict>, "similarity": <float>} records sorted by
      descending similarity. Empty when nothing qualifies.
    """
    # Pass 1: collect the usable titles of every job without touching the model.
    jobs_with_titles = []
    for job in job_desc_json:
        if not isinstance(job, dict):
            continue
        details = job.get('details')
        if not isinstance(details, dict):
            continue

        raw_titles = details.get('job_title_base') or []
        if isinstance(raw_titles, str):
            # A bare string would otherwise be flattened character by character.
            raw_titles = [raw_titles]
        elif not isinstance(raw_titles, list):
            continue

        # Titles may be nested ([['A'], ['B']]); keep only non-blank strings so
        # the encoder never receives None or numbers.
        titles = [t for t in flatten_list(raw_titles) if isinstance(t, str) and t.strip()]
        if titles:
            jobs_with_titles.append((job, titles))

    if not jobs_with_titles:
        return []

    # Pass 2: embed each distinct title once (one batched encode call instead of
    # one call per job) and score it against the query.
    unique_titles = list(dict.fromkeys(t for _, titles in jobs_with_titles for t in titles))
    input_embedding = sentence_model.encode(input_job_title, convert_to_tensor=True)
    title_embeddings = sentence_model.encode(unique_titles, convert_to_tensor=True)
    # cos_sim returns a (1, n_titles) matrix for a single query vector.
    scores = util.cos_sim(input_embedding, title_embeddings)[0].tolist()
    score_by_title = dict(zip(unique_titles, scores))

    # Pass 3: a job matches when its best-scoring title reaches the threshold.
    matching_jobs = []
    for job, titles in jobs_with_titles:
        max_similarity = max(score_by_title[t] for t in titles)
        if max_similarity >= similarity_threshold:
            matching_jobs.append({
                "job": job,
                "similarity": max_similarity
            })

    matching_jobs.sort(key=lambda match: match['similarity'], reverse=True)
    return matching_jobs

# Example usage: search the parsed job descriptions for titles close to the query.
input_job_title = "Software Engineer"
results = find_similar_jobs(job_desc_json, input_job_title)

# Print results: one line per match with its base titles and similarity score,
# in the order returned by find_similar_jobs.
for match in results:
    print(match['job']['details']['job_title_base'], match['similarity'])


[['Java Developer'], ['Software Engineer']] 1.000000238418579
[['Bigdata Engineer'], ['Software Engineer'], ['System Analyst']] 1.0000001192092896
['Software Engineer', 'Senior Data Analysis Engineer', 'Senior Architect Engineer'] 1.0000001192092896
['Software Engineer', 'Senior Software Engineer'] 1.0000001192092896
['Software Engineer', 'Informatica Developer'] 1.0000001192092896
['Software Engineer', 'Senior Software Engineer'] 1.0000001192092896
['Software Engineer', 'Full Stack Developer'] 1.0000001192092896
['Software Engineer', 'Senior Software Engineer'] 1.0000001192092896
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 1.0
['Software Engineer'] 

In [28]:
# Inspect one parsed resume.
# NOTE(review): resume_json is taken from a filtered frame, so [4] is presumably
# a label lookup (original row 4) rather than "the fifth remaining row" - confirm.
resume_json[4]

{'skills': [{'skill': ['Used Azure Logic Apps',
    'implement automated workflows',
    'integration of data and services'],
   'years': 2.5,
   'job_id': 'AzureCloudEngineer|CapitalOneBank|2.5'},
  {'skill': ['Worked with Azure Event Driven Patterns',
    'Azure Service Bus',
    'pub/sub model'],
   'years': 2.5,
   'job_id': 'AzureCloudEngineer|CapitalOneBank|2.5'},
  {'skill': ['Worked on Azure AD',
    'PowerBI',
    'embedding PowerBI dashboards',
    'webpage'],
   'years': 2.5,
   'job_id': 'AzureCloudEngineer|CapitalOneBank|2.5'},
  {'skill': ['Setting up Azure VM server',
    'migrating the AWS application',
    'Azure',
    'writing CI/CD pipelines',
    'Azure DevOps'],
   'years': 2.5,
   'job_id': 'AzureCloudEngineer|CapitalOneBank|2.5'},
  {'skill': ['Written Python Flask APIs', 'Fast APIs', 'CRUD operations'],
   'years': 2.5,
   'job_id': 'AzureCloudEngineer|CapitalOneBank|2.5'},
  {'skill': ['Used APIM',
    'securing',
    'managing the usage',
    'error handling o