In [None]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
from dotenv import load_dotenv
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm
from src.train import get_device, train_job_classifier, clear_cache
from src.data import clean_text
from sklearn.metrics import classification_report
from typing import Tuple, Any
import numpy as np
import re
import json 
import os


load_dotenv(dotenv_path='.env', override=True)

  from .autonotebook import tqdm as notebook_tqdm


True

# Load or get cached dataset

In [None]:
dataset: pd.DataFrame = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    'jatinchawda/job-titles-and-description',
    'clean_data.parquet',
    # According to the task, dataset consists of a single column: job_title
    pandas_kwargs={'columns': ['job_title']} 
)

# I will take 35% sample of data to work with 
dataset = dataset.sample(frac=0.35, random_state=30)

dataset.head()

Unnamed: 0,job_title
1412234,Senior Research Ontologist
102919,Business Development Representative
651406,Senior UI Developer
1019552,Patient Services Coordinator- Mercy Health St....
1083675,Warehouse / Storeperson


In [33]:
print('Dataset size:', len(dataset), 'Unique titles:', dataset['job_title'].nunique())

Dataset size: 495437 Unique titles: 373932


In [34]:
print('Number of NULL titles:', dataset.isna().sum().item())

Number of NULL titles: 1


In [35]:
print('Number of empty titles:', (dataset['job_title'].str.len() == 0).sum())


Number of empty titles: 0


-	Some job titles are duplicated, so we removed the duplicates to avoid redundancy.
- There are null values in the dataset, which were also dropped to ensure data quality.

In [36]:
dataset = dataset.drop_duplicates().dropna()

# Basic EDA

In [37]:
# Calculate job title lengths
dataset['title_length'] = dataset['job_title'].apply(lambda x: len(x.split()))
dataset['title_length'].describe()

count    373932.00000
mean          5.50763
std           9.66862
min           1.00000
25%           3.00000
50%           5.00000
75%           7.00000
max        1057.00000
Name: title_length, dtype: float64

In [40]:
dataset[dataset['title_length'] > 150]

Unnamed: 0,job_title,title_length
56026,SNI Companies is looking for a Financial Analy...,207
56137,DescriptionThe Senior Cyber Strategic Intellig...,861
56093,"Location: Jacksonville, FL\nLocation: Remote\n...",518
55875,Role:\n\nThis position is responsible for mana...,246
56080,It's an exciting time at The Main Street Ameri...,395
...,...,...
55858,Job Profile Summary\n\nCAI is hiring!... We ar...,333
56179,"""This role is currently work-from-home and wil...",633
55851,Apply\n\nDescription\n\nWe are the leading Fin...,704
55969,The IT Business Lead Analyst is a senior-level...,457


- On average, job titles contain around 5-7 words, providing sufficient information for classification.
- However, there are some outliers in the dataset:
    - Some job titles consist of only one word, which may lack context and lead to misclassification.
    - Some job titles contain an excessive number of words (>150), which may introduce noise and unnecessary information.
- These variations highlight the importance of data cleaning and preprocessing before generating embeddings to ensure better model performance.

In [41]:
def preprocess_text_temp(text: str) -> str:
    # Remove special characters and numbers and remove extra space
    return re.sub(r'[^A-Za-z\s]+|\s+', ' ', text).strip()


dataset['cleaned_job_title'] = dataset['job_title'].apply(preprocess_text_temp)

In [45]:
pd.options.display.max_rows = 150
dataset[dataset['cleaned_job_title'].str.len() == 0]

Unnamed: 0,job_title,title_length,cleaned_job_title,is_numeric_only
1310666,电测试工程师,1,,False
1004562,业务建模顾问/经理-银行方向,1,,False
1081715,三四线城市零售销售 温州,2,,False
1195206,ΕΡΓΑΤΗΣ,1,,False
1173005,货物转员工(收货组/月薪制),1,,False
763638,基层医疗事业部-市场-上海,1,,False
1361769,Ведущий системный администратор,3,,False
138646,物安全评估员,1,,False
1304539,2021-30665,1,,True
138518,业务分员,1,,False


- During analysis, it was discovered that the dataset is multilingual, containing job titles in Chinese, Ukrainian, Russian, and other languages, which could pose a challenge for classification.
- Non-English job titles (e.g., Chinese, Russian) may require translation to a common language or filtering if they are not relevant.
- Non-job titles appear in the dataset and need to be removed as they do not provide meaningful information.
- Special characters & mixed alphabets require cleaning to ensure consistent text representation before embedding generation.
- Addressing these issues through language detection, text normalization, and filtering is essential for improving the quality of embeddings and overall model accuracy.

To improve the quality of job title classification, invalid job titles need to be identified and removed. 

In [46]:
def is_only_numbers_or_symbols(text: str) -> bool:
    """Returns True if the text contains only numbers or special symbols, but keeps non-English words."""
    return bool(re.fullmatch(r'[\d\W]+', text.strip()))


dataset['is_numeric_only'] = dataset['job_title'].apply(is_only_numbers_or_symbols)

In [47]:
dataset[dataset['is_numeric_only'] == True]

Unnamed: 0,job_title,title_length,cleaned_job_title,is_numeric_only
1304539,2021-30665,1,,True
383088,.,1,,True
1266298,2021-4291,1,,True
559850,11599,1,,True
550266,2021-28268,1,,True
488994,2021-27545,1,,True
1304589,2021-30903,1,,True
309753,1332600-4202,1,,True
985615,2021-1323,1,,True
1304545,2021-31353,1,,True


We can remove these job titles because they do not contain meaningful information and cannot be reliably classified.

In [48]:
dataset = dataset[dataset['is_numeric_only'] == False]

In [49]:
def has_non_english_words(text: str) -> bool:
    """
    Detects if a string contains words that are not English.
    - Ignores symbols and numbers.
    - Returns True if at least one non-English word is found.
    """
    # Remove all symbols and numbers
    text = re.sub(r'[^A-Za-z\s]', '', text).strip()
    
    # If the cleaned text is empty, assume it's non-English (e.g., "数据科学家" becomes empty)
    if not text:
        return True
    
    return False


dataset['non_english'] = dataset['job_title'].apply(has_non_english_words)

In [50]:
dataset[dataset['non_english'] == True]

Unnamed: 0,job_title,title_length,cleaned_job_title,is_numeric_only,non_english
1310666,电测试工程师,1,,False,True
1004562,业务建模顾问/经理-银行方向,1,,False,True
1081715,三四线城市零售销售 温州,2,,False,True
1195206,ΕΡΓΑΤΗΣ,1,,False,True
1173005,货物转员工(收货组/月薪制),1,,False,True
763638,基层医疗事业部-市场-上海,1,,False,True
1361769,Ведущий системный администратор,3,,False,True
138646,物安全评估员,1,,False,True
138518,业务分员,1,,False,True
1273365,基层医疗事业部-地区-山东临沂市,1,,False,True


- We can process non-English job titles in two ways:
- First approach: Translate them into English and use the translated version as the job_title. This ensures consistency in language and may improve classification performance.
- Second approach: Use a multilingual embedding model, such as paraphrase-multilingual-mpnet-base-v2, which has been trained on multiple languages. This allows the model to directly understand and classify job titles in their original language without translation.

- I will stick with the second approach, using a multilingual embedding model, as it is easier to implement and less error-prone than translation.
- Before generating embeddings, the text needs preprocessing to remove HTML tags, escaped characters, and unnecessary formatting while keeping the structure intact.
- Punctuation will not be removed, as it may contain important contextual information that affects embeddings and classification accuracy.

In [51]:
dataset[["cleaned_job_title", "changes_applied"]] = dataset["job_title"].apply(lambda x: pd.Series(clean_text(x)))

In [52]:
dataset[dataset['changes_applied'] == True]

Unnamed: 0,job_title,title_length,cleaned_job_title,is_numeric_only,non_english,changes_applied
855158,"VP, Sales &amp; Strategic Accounts, Revenue Cy...",9,"VP, Sales & Strategic Accounts, Revenue Cycle ...",False,False,True
1130116,"Senior Officer, SWIFT operations (Night shift)...",12,"Senior Officer, SWIFT operations (Night shift)...",False,False,True
736493,Press Manager,2,Press Manager,False,False,True
255423,CIB-Securities Services Operations Data Manag...,6,CIB-Securities Services Operations Data Manage...,False,False,True
887863,Management &amp; Sales Training Program,5,Management & Sales Training Program,False,False,True
...,...,...,...,...,...,...
1220799,FOOD SERVICE UTILITY CGJ (FULL &amp; PART-TIME),7,FOOD SERVICE UTILITY CGJ (FULL & PART-TIME),False,False,True
347465,"Director, Display &amp; Social",4,"Director, Display & Social",False,False,True
724642,"Director, Brand Strategy &amp; Marketing",5,"Director, Brand Strategy & Marketing",False,False,True
652166,Design Engineers – All Levels (Electrical / EL...,14,Design Engineers – All Levels (Electrical / EL...,False,False,True



| Model | Pros | How to Use |
|---|---|---|
| [Sentence-BERT](https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2) | ✅ High-quality sentence embeddings ✅ Supports 50+ languages ✅ Efficient |	sentence-transformers |
| [Language-Agnostic BERT](https://huggingface.co/sentence-transformers/LaBSE) | ✅ Supports 109 languages ✅ Best for semantic search & clustering | sentence-transformers
| [Facebook Multilingual Word Embeddings](https://fasttext.cc/) |	✅ Fastest ✅ Works well for word-level embeddings (but not full sentences) |	fasttext
| [OpenAI Embeddings](https://platform.openai.com/docs/guides/embeddings) |	✅ Extremely high quality ✅ Handles multiple languages well ✅ API-based (paid) | OpenAI API |


In [78]:
device = get_device()
print('Used device:', device)

Used device: mps


In [64]:
# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
model = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-mpnet-base-v2').to(device)

In [None]:
def mean_pooling(model_output: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    token_embeddings = model_output[0] # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


job_titles = dataset['cleaned_job_title'].tolist()

batch_size = 512 
embeddings = []

# Process in batches to avoid memory issues
for i in tqdm(range(0, len(job_titles), batch_size), desc='Encoding Batches'):
    batch = job_titles[i : i + batch_size]
    encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=128)
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    with torch.no_grad():
        model_output = model(**encoded_input)
    
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    sentence_embeddings = sentence_embeddings.cpu().numpy()
    embeddings.append(sentence_embeddings)


embeddings = np.vstack(embeddings)


# Delete the model and free memory
clear_cache(tokenizer, model)

Encoding Batches: 100%|██████████| 731/731 [2:29:06<00:00, 12.24s/it]  


0

In [68]:
embeddings.shape

(373912, 768)

In [72]:
# Save embeddings
dataset['embedding'] = list(embeddings)

os.makedirs('./cache', exist_ok=True)
dataset[['job_title', 'cleaned_job_title', 'embedding']].to_parquet(
    './cache/data_with_embeddings.parquet'
)

dataset.head()

Unnamed: 0,job_title,title_length,cleaned_job_title,is_numeric_only,non_english,changes_applied,embedding
1412234,Senior Research Ontologist,3,Senior Research Ontologist,False,False,False,"[-0.00582021, 0.27743036, -0.011652084, 0.0017..."
102919,Business Development Representative,3,Business Development Representative,False,False,False,"[-0.090618886, 0.14480697, -0.0056854617, 0.05..."
651406,Senior UI Developer,3,Senior UI Developer,False,False,False,"[-0.0669652, 0.21352673, -0.0032302842, 0.0349..."
1019552,Patient Services Coordinator- Mercy Health St....,12,Patient Services Coordinator- Mercy Health St....,False,False,False,"[-0.043357324, 0.23925032, -0.012365834, -0.00..."
1083675,Warehouse / Storeperson,3,Warehouse / Storeperson,False,False,False,"[-0.01344679, 0.109441295, -0.009284674, 0.077..."


# Task 2

In [2]:
data_with_embeddings = pd.read_parquet('./cache/data_with_embeddings.parquet')
data_with_embeddings.head()

Unnamed: 0,job_title,cleaned_job_title,embedding
1412234,Senior Research Ontologist,Senior Research Ontologist,"[-0.00582021, 0.27743036, -0.011652084, 0.0017..."
102919,Business Development Representative,Business Development Representative,"[-0.090618886, 0.14480697, -0.0056854617, 0.05..."
651406,Senior UI Developer,Senior UI Developer,"[-0.0669652, 0.21352673, -0.0032302842, 0.0349..."
1019552,Patient Services Coordinator- Mercy Health St....,Patient Services Coordinator- Mercy Health St....,"[-0.043357324, 0.23925032, -0.012365834, -0.00..."
1083675,Warehouse / Storeperson,Warehouse / Storeperson,"[-0.01344679, 0.109441295, -0.009284674, 0.077..."


In [3]:
job_levels = json.load(open('job-levels.json'))
job_areas = json.load(open('job-areas.json'))

In [4]:
list(job_levels.keys())

['Student',
 'Junior Specialist',
 'Assistant',
 'Specialist',
 'Sr Specialist',
 'Manager',
 'Director',
 'VP',
 'C-Level',
 'Founder/Owner']

In [5]:
list(job_areas.keys())

['Finance',
 'Engineering',
 'IT',
 'Healthcare',
 'Marketing',
 'Education',
 'Human Resources',
 'Operations',
 'Legal',
 'Customer Support',
 'Administration',
 'Design',
 'Construction',
 'Real Estate',
 'Manufacturing',
 'Media',
 'Science & Research',
 'Consulting',
 'Retail',
 'Transportation',
 'Hospitality & Tourism',
 'Energy & Utilities',
 'Agriculture & Forestry',
 'Arts & Entertainment',
 'Security',
 'Public Relations',
 'Government',
 'Nonprofit']

In [None]:
# I am going to use rule-based matching to pre-label the data

def keyword_extractor(text: str, mapping: dict[str, Any]) -> str:
    text = text.lower()
    for category, keywords in mapping.items():
        if any(keyword in text for keyword in keywords):
            return category
    return 'Unknown'


data_with_embeddings["job_level"] = data_with_embeddings["cleaned_job_title"].apply(lambda x: keyword_extractor(x, job_levels))
data_with_embeddings["job_area"] = data_with_embeddings["cleaned_job_title"].apply(lambda x: keyword_extractor(x, job_areas))

data_with_embeddings[["cleaned_job_title", "job_level", "job_area"]].head()

Unnamed: 0,cleaned_job_title,job_level,job_area
1412234,Senior Research Ontologist,Sr Specialist,Unknown
102919,Business Development Representative,Unknown,Public Relations
651406,Senior UI Developer,Specialist,IT
1019552,Patient Services Coordinator- Mercy Health St....,Assistant,Education
1083675,Warehouse / Storeperson,Unknown,Operations


In [7]:
labeled_data = data_with_embeddings[
    (data_with_embeddings['job_level'] != 'Unknown') 
    & (data_with_embeddings['job_area'] != 'Unknown')
]
labeled_data.head()

Unnamed: 0,job_title,cleaned_job_title,embedding,job_level,job_area
651406,Senior UI Developer,Senior UI Developer,"[-0.0669652, 0.21352673, -0.0032302842, 0.0349...",Specialist,IT
1019552,Patient Services Coordinator- Mercy Health St....,Patient Services Coordinator- Mercy Health St....,"[-0.043357324, 0.23925032, -0.012365834, -0.00...",Assistant,Education
164668,Lab Assistant PRN – Laboratory,Lab Assistant PRN – Laboratory,"[-0.0152007695, 0.15998304, -0.0038322255, -0....",Assistant,Science & Research
151095,Senior PHP Developer,Senior PHP Developer,"[-0.10911566, 0.3904516, -0.009450451, 0.12776...",Specialist,IT
360019,Head of Engineering Pumps UK,Head of Engineering Pumps UK,"[-0.12724742, -0.2857952, -0.0044779824, 0.056...",Specialist,Engineering


In [8]:
labeled_data['job_level'].value_counts()

job_level
Specialist           90301
Manager              27249
Assistant            19444
Sr Specialist        18516
Junior Specialist    12565
Student              10239
C-Level               5328
Director              4152
VP                    1795
Founder/Owner          437
Name: count, dtype: int64

In [9]:
labeled_data['job_area'].value_counts()

job_area
IT                        36901
Engineering               32993
Marketing                 15841
Public Relations          14938
Consulting                14401
Finance                   10279
Operations                 8285
Healthcare                 7471
Energy & Utilities         5868
Human Resources            5783
Legal                      5759
Design                     5153
Customer Support           5136
Administration             3770
Education                  3738
Science & Research         2799
Construction               2154
Hospitality & Tourism      2104
Security                   1644
Media                      1133
Retail                     1000
Real Estate                 755
Manufacturing               720
Arts & Entertainment        516
Government                  377
Transportation              310
Nonprofit                   146
Agriculture & Forestry       52
Name: count, dtype: int64

In [49]:
labeled_data.to_parquet('./cache/labeled_data.parquet')

In [10]:
not_labeled_data = data_with_embeddings[~data_with_embeddings.index.isin(labeled_data.index)]
not_labeled_data.head()

Unnamed: 0,job_title,cleaned_job_title,embedding,job_level,job_area
1412234,Senior Research Ontologist,Senior Research Ontologist,"[-0.00582021, 0.27743036, -0.011652084, 0.0017...",Sr Specialist,Unknown
102919,Business Development Representative,Business Development Representative,"[-0.090618886, 0.14480697, -0.0056854617, 0.05...",Unknown,Public Relations
1083675,Warehouse / Storeperson,Warehouse / Storeperson,"[-0.01344679, 0.109441295, -0.009284674, 0.077...",Unknown,Operations
202706,Operator/Labourer - Paving Crew,Operator/Labourer - Paving Crew,"[-0.21255884, 0.098604776, -0.013494413, 0.035...",Unknown,Science & Research
1339774,Seasonal Early Morning Replenishment PT,Seasonal Early Morning Replenishment PT,"[-0.14529106, -0.072939366, -0.010204792, 0.02...",Unknown,Unknown


We can use the rule-based extracted labels as a training dataset to train a classifier. This classifier will leverage embeddings from job titles and predict both job level and job area based on learned patterns, improving generalization beyond strict keyword matching.

I will utilize an architecture with Residual Blocks and Attention mechanisms to train a custom classifier model. This model will leverage job title embeddings to predict job levels and job areas, allowing us to automatically annotate all previously unknown data with high accuracy.

In [11]:
checkpoint_path = './cache/best_model.pth'

model, encoders = train_job_classifier(
    data=labeled_data,
    num_epochs=40,
    model_checkpoint_path=checkpoint_path,
    validation_size=0.2,
    batch_size=128,
    lr=1e-4,
    early_stopping_patience=5,
    random_state=42
)

Starting training on a device: mps.


100%|██████████| 1188/1188 [00:15<00:00, 78.48it/s]


Epoch 1/40 - Train Loss: 1.9603 | Val Loss: 0.9981
Val Job Level F1: 0.8885265088503786 | Val Job Area F1: 0.734313311267747
Model Saved (New Best Validation Loss)


100%|██████████| 1188/1188 [00:13<00:00, 87.01it/s]


Epoch 2/40 - Train Loss: 1.0307 | Val Loss: 0.8109
Val Job Level F1: 0.917191185044963 | Val Job Area F1: 0.7801393703572985
Model Saved (New Best Validation Loss)


100%|██████████| 1188/1188 [00:13<00:00, 88.02it/s]


Epoch 3/40 - Train Loss: 0.8390 | Val Loss: 0.7176
Val Job Level F1: 0.9271184080991732 | Val Job Area F1: 0.7975762747438238
Model Saved (New Best Validation Loss)


100%|██████████| 1188/1188 [00:13<00:00, 87.64it/s]


Epoch 4/40 - Train Loss: 0.7252 | Val Loss: 0.6584
Val Job Level F1: 0.9284693002654629 | Val Job Area F1: 0.8141960262884151
Model Saved (New Best Validation Loss)


100%|██████████| 1188/1188 [00:13<00:00, 88.16it/s]


Epoch 5/40 - Train Loss: 0.6494 | Val Loss: 0.6124
Val Job Level F1: 0.938268434129078 | Val Job Area F1: 0.8330752369150383
Model Saved (New Best Validation Loss)


100%|██████████| 1188/1188 [00:13<00:00, 89.14it/s]


Epoch 6/40 - Train Loss: 0.5919 | Val Loss: 0.5859
Val Job Level F1: 0.9394501812655375 | Val Job Area F1: 0.8392990687103132
Model Saved (New Best Validation Loss)


100%|██████████| 1188/1188 [00:13<00:00, 90.18it/s]


Epoch 7/40 - Train Loss: 0.5358 | Val Loss: 0.5766
Val Job Level F1: 0.9409510960593611 | Val Job Area F1: 0.8491902101713726
Model Saved (New Best Validation Loss)


100%|██████████| 1188/1188 [00:13<00:00, 90.92it/s]


Epoch 8/40 - Train Loss: 0.5000 | Val Loss: 0.5591
Val Job Level F1: 0.9463814670796964 | Val Job Area F1: 0.8591302815909985
Model Saved (New Best Validation Loss)


100%|██████████| 1188/1188 [00:13<00:00, 89.96it/s]


Epoch 9/40 - Train Loss: 0.4704 | Val Loss: 0.5387
Val Job Level F1: 0.9492004445042301 | Val Job Area F1: 0.8558207946021792
Model Saved (New Best Validation Loss)


100%|██████████| 1188/1188 [00:13<00:00, 88.45it/s]


Epoch 10/40 - Train Loss: 0.4452 | Val Loss: 0.5196
Val Job Level F1: 0.9511369507110368 | Val Job Area F1: 0.8669273445487112
Model Saved (New Best Validation Loss)


100%|██████████| 1188/1188 [00:13<00:00, 88.36it/s]


Epoch 11/40 - Train Loss: 0.4127 | Val Loss: 0.5135
Val Job Level F1: 0.9502223919475681 | Val Job Area F1: 0.87001200155831
Model Saved (New Best Validation Loss)


100%|██████████| 1188/1188 [00:13<00:00, 88.86it/s]


Epoch 12/40 - Train Loss: 0.3947 | Val Loss: 0.5136
Val Job Level F1: 0.9542743756094656 | Val Job Area F1: 0.8678967589135785
No Improvement, Early Stop Counter: 1/5


100%|██████████| 1188/1188 [00:13<00:00, 88.10it/s]


Epoch 13/40 - Train Loss: 0.3769 | Val Loss: 0.5118
Val Job Level F1: 0.9566293236812131 | Val Job Area F1: 0.8749920426255379
Model Saved (New Best Validation Loss)


100%|██████████| 1188/1188 [00:13<00:00, 88.73it/s]


Epoch 14/40 - Train Loss: 0.3645 | Val Loss: 0.4914
Val Job Level F1: 0.9593622764606468 | Val Job Area F1: 0.8804860620596233
Model Saved (New Best Validation Loss)


100%|██████████| 1188/1188 [00:13<00:00, 89.12it/s]


Epoch 15/40 - Train Loss: 0.3536 | Val Loss: 0.4980
Val Job Level F1: 0.9583553780879034 | Val Job Area F1: 0.8826593035776734
No Improvement, Early Stop Counter: 1/5


100%|██████████| 1188/1188 [00:13<00:00, 90.28it/s]


Epoch 16/40 - Train Loss: 0.3274 | Val Loss: 0.4893
Val Job Level F1: 0.9593360175417073 | Val Job Area F1: 0.8852246799748845
Model Saved (New Best Validation Loss)


100%|██████████| 1188/1188 [00:13<00:00, 90.07it/s]


Epoch 17/40 - Train Loss: 0.3150 | Val Loss: 0.4963
Val Job Level F1: 0.9549369503896001 | Val Job Area F1: 0.886660343939842
No Improvement, Early Stop Counter: 1/5


100%|██████████| 1188/1188 [00:13<00:00, 88.73it/s]


Epoch 18/40 - Train Loss: 0.3105 | Val Loss: 0.4949
Val Job Level F1: 0.9561100456778013 | Val Job Area F1: 0.8885861457034482
No Improvement, Early Stop Counter: 2/5


100%|██████████| 1188/1188 [00:13<00:00, 87.34it/s]


Epoch 19/40 - Train Loss: 0.3028 | Val Loss: 0.5109
Val Job Level F1: 0.9599308529001025 | Val Job Area F1: 0.8878087053828192
No Improvement, Early Stop Counter: 3/5


100%|██████████| 1188/1188 [00:13<00:00, 90.87it/s]


Epoch 20/40 - Train Loss: 0.2891 | Val Loss: 0.4679
Val Job Level F1: 0.9595121099987467 | Val Job Area F1: 0.8902628448835596
Model Saved (New Best Validation Loss)


100%|██████████| 1188/1188 [00:13<00:00, 91.05it/s]


Epoch 21/40 - Train Loss: 0.2756 | Val Loss: 0.4854
Val Job Level F1: 0.9607833040282934 | Val Job Area F1: 0.8930301713594792
No Improvement, Early Stop Counter: 1/5


100%|██████████| 1188/1188 [00:13<00:00, 89.16it/s]


Epoch 22/40 - Train Loss: 0.2733 | Val Loss: 0.5040
Val Job Level F1: 0.9606437062601514 | Val Job Area F1: 0.8921699297422474
No Improvement, Early Stop Counter: 2/5


100%|██████████| 1188/1188 [00:13<00:00, 90.59it/s]


Epoch 23/40 - Train Loss: 0.2645 | Val Loss: 0.5059
Val Job Level F1: 0.9616836497713317 | Val Job Area F1: 0.8940463347331778
No Improvement, Early Stop Counter: 3/5


100%|██████████| 1188/1188 [00:13<00:00, 91.29it/s]


Epoch 24/40 - Train Loss: 0.2534 | Val Loss: 0.4829
Val Job Level F1: 0.9613798104822451 | Val Job Area F1: 0.891961334569053
No Improvement, Early Stop Counter: 4/5


100%|██████████| 1188/1188 [00:13<00:00, 90.83it/s]


Epoch 25/40 - Train Loss: 0.2446 | Val Loss: 0.4948
Val Job Level F1: 0.9630287372020908 | Val Job Area F1: 0.8947792028851447
No Improvement, Early Stop Counter: 5/5
Early stopping triggered!


In [12]:
checkpoint_path = './cache/best_model.pth'
model.load_state_dict(torch.load(checkpoint_path, map_location=get_device()))

<All keys matched successfully>

In [27]:
def predict_job_title(job_embedding: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    model.eval()
    with torch.no_grad():
        level_pred, area_pred = model(job_embedding)

    predicted_level = torch.argmax(level_pred, dim=1)
    predicted_area = torch.argmax(area_pred, dim=1)

    return predicted_level, predicted_area


to_pred = np.vstack(not_labeled_data['embedding'].values)
to_pred = torch.tensor(to_pred, dtype=torch.float32).to(get_device())

predicted_level, predicted_area = predict_job_title(to_pred)
predicted_level_encoded = predicted_level.cpu().numpy()
predicted_area_encoded = predicted_area.cpu().numpy()

In [28]:
not_labeled_data['predicted_level'] = encoders['level'].inverse_transform(predicted_level_encoded)
not_labeled_data['predicted_area'] = encoders['area'].inverse_transform(predicted_area_encoded)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  not_labeled_data['predicted_level'] = encoders['level'].inverse_transform(predicted_level_encoded)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  not_labeled_data['predicted_area'] = encoders['area'].inverse_transform(predicted_area_encoded)


In [None]:
test_sample = not_labeled_data[not_labeled_data['job_level'] != 'Unknown']

annotated = encoders['level'].transform(test_sample['job_level'])
predicted = encoders['level'].transform(test_sample['predicted_level'])

print(classification_report(annotated, predicted, target_names=encoders['level'].classes_))

                   precision    recall  f1-score   support

        Assistant       0.98      0.98      0.98     11223
          C-Level       0.73      0.88      0.80       956
         Director       0.92      0.99      0.95      2418
    Founder/Owner       0.87      0.92      0.90        65
Junior Specialist       0.96      0.96      0.96      3619
          Manager       0.98      0.97      0.97     14787
       Specialist       0.96      0.93      0.95      6933
    Sr Specialist       0.96      0.95      0.95      7727
          Student       0.92      0.94      0.93      2103
               VP       0.90      0.98      0.94       365

         accuracy                           0.96     50196
        macro avg       0.92      0.95      0.93     50196
     weighted avg       0.96      0.96      0.96     50196



In [45]:
test_sample = test_sample[test_sample['job_level'] != test_sample['predicted_level']]
test_sample[['cleaned_job_title', 'job_level', 'predicted_level']].sample(20)

Unnamed: 0,cleaned_job_title,job_level,predicted_level
1021985,Peer Specialist ACT/ACT TAY,Specialist,Junior Specialist
1285737,Offshore Relationship Manager (Malaysia),Manager,Student
517187,"Executive Director, Account Lead, Clinique Nor...",Sr Specialist,Director
152499,Technical Director/Head of Development,Manager,Director
468446,Raleigh-Durham GRE Subject Test in Biology Exp...,Specialist,C-Level
794061,Senior Agile Learning Specialist,Specialist,Sr Specialist
1160227,A Wider Circle hiring Senior Family Engagement...,Specialist,Sr Specialist
689010,"Invasive Weeds Research, Development & Engagem...",Manager,Specialist
423598,(Sr. Director) Disease Area Head - Rare Diseas...,Sr Specialist,Manager
872247,"Regional Account Leader, Andexxa - Chicago N",Sr Specialist,Junior Specialist


In [46]:
test_sample = not_labeled_data[not_labeled_data['job_area'] != 'Unknown']

annotated = encoders['area'].transform(test_sample['job_area'])
predicted = encoders['area'].transform(test_sample['predicted_area'])

print(classification_report(annotated, predicted, target_names=encoders['area'].classes_))

                        precision    recall  f1-score   support

        Administration       0.81      0.86      0.84      4504
Agriculture & Forestry       0.45      0.73      0.56        48
  Arts & Entertainment       0.25      0.70      0.37       304
          Construction       0.77      0.92      0.83      1831
            Consulting       0.08      0.64      0.14        58
      Customer Support       0.84      0.95      0.89      5914
                Design       0.72      0.69      0.71      3856
             Education       0.80      0.96      0.87      3909
    Energy & Utilities       0.35      0.73      0.47       710
           Engineering       0.63      0.57      0.60      1626
               Finance       0.95      0.84      0.89      5570
            Government       0.37      0.82      0.51        91
            Healthcare       0.86      0.92      0.89      9820
 Hospitality & Tourism       0.65      0.93      0.76      2005
       Human Resources       0.76      

In [47]:
test_sample = test_sample[test_sample['job_area'] != test_sample['predicted_area']]
test_sample[['cleaned_job_title', 'job_area', 'predicted_area']].sample(20)

Unnamed: 0,cleaned_job_title,job_area,predicted_area
214952,Brake Press Setter/Operator,Public Relations,Engineering
1076495,Automotive Mechanic- CHRISTUS EMS,Human Resources,Energy & Utilities
1405782,SA - Content and Data Management,IT,Marketing
1246041,Paid Media Account Executive,IT,Media
504298,Chronic Dialysis Travel Nursing Opportunity Av...,IT,Healthcare
782895,Vancouver Port - Security Guard - 18-24/hr,Human Resources,Security
105732,"Labor and Delivery - Travel Nurse RN - $1,623 ...",Healthcare,Science & Research
1209173,Captain,IT,Security
1291335,Bookkeeper/Accountants Assisant,Finance,Education
441546,Dispatcher Cdl Required First Transit,Design,Transportation


The results of the model’s performance are documented in the README file, including evaluation metrics and key insights. Additionally, potential improvements and next steps for enhancing accuracy, handling multilingual data, and optimizing model efficiency are also outlined there.