### TSDAE: Fine-tune sentence transformers using unsupervised learning with PyTorch
https://www.sbert.net/examples/unsupervised_learning/TSDAE/README.html

In [2]:
# !pip install sentence_transformers==2.2.2

In [None]:
import pandas as pd
import numpy as np
import string
from tqdm import tqdm
from numpy.linalg import norm
from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers import models, util, datasets, evaluation, losses
from torch.utils.data import DataLoader

In [5]:
def get_data(path: str, n_samples: int=30000, random_state=42):
    '''
    Return preprocessed data of job descriptions.

    Reads only the `description` column of the CSV at `path`, draws a random
    sample without replacement, then removes duplicate and missing descriptions.

    Parameters
    ----------
    path : str
        Location of the CSV file; it must contain a `description` column.
    n_samples : int
        Maximum number of rows to sample. If the file holds fewer rows, all of
        them are used instead of raising.
    random_state : int
        Seed passed to `DataFrame.sample` so the draw is reproducible.

    Returns
    -------
    pd.DataFrame
        Single-column frame (`description`) with a fresh 0..n-1 index. It can
        hold fewer than `n_samples` rows because cleaning happens after sampling.
    '''
    data = pd.read_csv(path, usecols=['description'])
    # sample() raises ValueError when n exceeds the population and replace=False,
    # so cap the request at the number of rows actually available.
    n_rows = min(n_samples, len(data))
    sampled = data.sample(n=n_rows, replace=False, random_state=random_state)
    # Missing descriptions cannot be tokenised or sliced as text downstream.
    cleaned = sampled.drop_duplicates().dropna(subset=['description'])
    return cleaned.reset_index(drop=True)

In [6]:
# Training sample: up to 30,000 rows drawn with get_data()'s default random_state=42.
data = get_data('jobscrap.csv')

In [7]:
# Rows remaining after get_data() removed duplicate descriptions from the sample.
len(data)

23418

In [8]:
# Peek at one raw description (index label 1000; labels are 0..n-1 after the index reset).
data['description'][1000]

'Thank you for your interest in the New Jersey Turnpike Authority (NJTA). Serving the Garden State Parkway and New Jersey Turnpike.\n\nWe are looking to hire seasonal maintenance workers this season! Are you ready to join the organization that operates two of the busiest toll roads in North America? If so, apply now!\n\nApplying for seasonal employment? All seasonal applications must be completed in our online employment system. The process is easy!'

In [9]:
def finetune_model(data: pd.DataFrame, col_to_use: str='description',
                   model_id: str="bert-base-uncased",
                   batch_size: int=8, epochs: int=1):
    '''
    Run TSDAE (unsupervised) fine-tuning of a sentence transformer on job descriptions.

    The texts in `data[col_to_use]` are the only training signal. The encoder is
    built from `model_id` with CLS pooling, trained for `epochs` epochs with
    batches of `batch_size`, saved to `<model_id>_finetuned`, and that save
    path is returned.
    '''
    # Recipe follows https://www.sbert.net/examples/unsupervised_learning/TSDAE/README.html

    # Encoder: transformer backbone followed by 'cls' pooling.
    backbone = models.Transformer(model_id)
    cls_pooling = models.Pooling(backbone.get_word_embedding_dimension(), 'cls')
    model = SentenceTransformer(modules=[backbone, cls_pooling])

    # Training data: the raw sentences wrapped in the denoising auto-encoder dataset.
    sentences = data[col_to_use].tolist()
    loader = DataLoader(
        datasets.DenoisingAutoEncoderDataset(sentences),
        batch_size=batch_size,
        shuffle=True,
    )
    # The decoder is initialised from the same checkpoint and tied to the encoder.
    tsdae_loss = losses.DenoisingAutoEncoderLoss(
        model,
        decoder_name_or_path=model_id,
        tie_encoder_decoder=True,
    )

    fit_settings = {
        'epochs': epochs,
        'weight_decay': 0,
        'scheduler': 'constantlr',
        'optimizer_params': {'lr': 3e-5},
        'show_progress_bar': True,
    }
    model.fit(train_objectives=[(loader, tsdae_loss)], **fit_settings)

    model_save_path = model_id + '_finetuned'
    model.save(model_save_path)
    return model_save_path

In [10]:
# Fine-tune the sentence transformer. finetune_model() saves the trained model to disk
# and returns the save path, which is then used to reload the model for inference.
finetuned_model_id = finetune_model(data=data)
finetuned_model = SentenceTransformer(finetuned_model_id)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2928 [00:00<?, ?it/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Downloading .gitattributes:   0%|          | 0.00/491 [00:00<?, ?B/s]

Downloading LICENSE:   0%|          | 0.00/11.4k [00:00<?, ?B/s]

Downloading README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)CoreML/model.mlmodel:   0%|          | 0.00/165k [00:00<?, ?B/s]

Downloading weight.bin:   0%|          | 0.00/532M [00:00<?, ?B/s]

Downloading (…)ackage/Manifest.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

Downloading model.onnx:   0%|          | 0.00/532M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [11]:
# Baseline: the same "bert-base-uncased" checkpoint without TSDAE fine-tuning.
# NOTE(review): this model is loaded by name, not assembled with an explicit 'cls'
# Pooling module as in finetune_model(), so it presumably uses the library's default
# pooling. Confirm both models pool the same way before comparing their scores.
non_finetuned_model_id = "bert-base-uncased"
non_finetuned_model = SentenceTransformer(non_finetuned_model_id)

### INFERENCE

In [12]:
def get_sent_transformer_embeddings(sent_transformer, txt):
    '''
    Encode `txt` with the supplied SentenceTransformer and return the embedding.

    The encoder's own progress bar is switched off.
    '''
    return sent_transformer.encode(txt, show_progress_bar=False)

In [18]:
# Draw a second sample with a different seed (11) to serve as the job database for retrieval.
# This rebinds the global `data`, which inference() reads, so this cell must run after
# fine-tuning and before the embedding cells that follow.
data = get_data('jobscrap.csv', n_samples=30000, random_state=11)

In [19]:
def get_job_embeddings(data, sent_transformer):
    '''
    Get sentence embeddings of job descriptions using a specified transformer.

    Parameters
    ----------
    data : pd.DataFrame
        Frame with a `description` column; rows are embedded in order.
    sent_transformer
        Model passed through to `get_sent_transformer_embeddings`.

    Returns
    -------
    np.ndarray or None
        One embedding per row, stacked along axis 0 in the same order as
        `data`. Returns None when `data` has no rows.
    '''
    # Collect embeddings in a list and stack once. Growing the array with
    # np.concatenate inside the loop recopies every earlier row on each
    # iteration, which is quadratic in the number of descriptions.
    # NOTE(review): encoding still happens one description at a time; passing the
    # whole list to the encoder in a single call may be much faster, but confirm
    # the batched output matches before switching.
    embeddings = [
        get_sent_transformer_embeddings(sent_transformer, job)
        for job in tqdm(data['description'])
    ]
    if not embeddings:
        return None
    return np.stack(embeddings, axis=0)

In [20]:
# Get embeddings of job descriptions in the database using fine-tuned model.
# Row i of the result corresponds to row i of `data`; inference() relies on that
# positional alignment. The recorded run of this cell took about 15 minutes.
finetuned_job_emb = get_job_embeddings(data, finetuned_model)

100%|██████████| 23609/23609 [15:43<00:00, 25.03it/s]


In [21]:
# Get embeddings of job descriptions in the database using non fine-tuned model.
# Row i of the result corresponds to row i of `data`; inference() relies on that
# positional alignment. The recorded run of this cell took about 15 minutes.
non_finetuned_job_emb = get_job_embeddings(data, non_finetuned_model)

100%|██████████| 23609/23609 [15:29<00:00, 25.40it/s]


In [22]:
def get_similarity_score(emb1, emb2):
    '''
    Cosine similarity of two embedding vectors: their dot product divided by
    the product of their norms.
    '''
    dot_product = np.dot(emb1, emb2)
    magnitude = norm(emb1) * norm(emb2)
    return dot_product / magnitude

In [31]:
def _rank_similar_jobs(similarities, top_n):
    '''
    Return the `top_n` rows of the global `data` with the highest similarity,
    best match first, with their scores in a `similarity_score` column.
    '''
    scores = np.array(similarities)
    # argsort is ascending, so reverse it before truncating to keep the best matches.
    top_idx = np.argsort(scores)[::-1][:top_n]
    similar_jobs = data.iloc[top_idx, :].copy()
    # Scores are taken with the same positions that selected the rows, so each
    # row is always paired with its own score.
    similar_jobs['similarity_score'] = scores[top_idx]
    return similar_jobs


def _format_similar_jobs(similar_jobs):
    '''
    Render ranked jobs as text: the score, then the first 500 characters of the description.
    '''
    score_col = similar_jobs.columns.get_loc('similarity_score')
    desc_col = similar_jobs.columns.get_loc('description')
    txt = ""
    for i in range(len(similar_jobs)):
        txt += f"\n\nSimilarity score: {similar_jobs.iloc[i, score_col]}\n{similar_jobs.iloc[i, desc_col][:500]}"
    return txt


def inference(job_desc: str, top_n=5):
    '''
    Print the jobs most similar to a resume summary, as ranked by the fine-tuned
    and by the non fine-tuned model.

    Relies on notebook globals: `finetuned_model`, `non_finetuned_model`,
    `finetuned_job_emb`, `non_finetuned_job_emb` and `data`. The embedding
    matrices must have been computed from the current `data`, in the same row
    order, because matches are looked up by position.

    Parameters
    ----------
    job_desc : str
        Free text to match against the job descriptions.
    top_n : int
        Number of best matches to show per model.

    Returns
    -------
    None
        The comparison is printed, not returned.
    '''
    finetuned_inf_emb = get_sent_transformer_embeddings(finetuned_model, job_desc)
    non_finetuned_inf_emb = get_sent_transformer_embeddings(non_finetuned_model, job_desc)

    # Score every job against the query, once per model.
    finetuned_similarities = []
    non_finetuned_similarities = []
    for i in tqdm(range(len(finetuned_job_emb))):
        finetuned_similarities.append(get_similarity_score(finetuned_job_emb[i], finetuned_inf_emb))
        non_finetuned_similarities.append(get_similarity_score(non_finetuned_job_emb[i], non_finetuned_inf_emb))

    # Each model is ranked independently. Previously the baseline rows were paired
    # with scores looked up via the fine-tuned model's top indices, so the baseline
    # scores shown did not belong to the baseline rows shown.
    finetuned_similar_jobs_txt = _format_similar_jobs(
        _rank_similar_jobs(finetuned_similarities, top_n))
    non_finetuned_similar_jobs_txt = _format_similar_jobs(
        _rank_similar_jobs(non_finetuned_similarities, top_n))

    print(f"Input Job:\n{job_desc}\n\nSimilar Jobs Returned by Fine-tuned Model:\n{finetuned_similar_jobs_txt}" +
          f"\n\nSimilar Jobs Returned by Non Fine-tuned Model:\n{non_finetuned_similar_jobs_txt}")

In [34]:
# Query 1: a software / mobile engineering profile. Prints the top 5 matches from each model.
resume_summary = '''
8+ years experience in software development and design skills
Experience building user-facing features, APIs and framework on the Mobile side.
Strong object oriented programming skills using C++/Swift/Java/Objective-C
Experience conducting exploratory or generative research
Machine Learning Modelling from experimentation and prototyping to deployment into production pipelines is preferred.
Automation and Scripting skills in Python
Excellent problem solving, communication and documentation skills
Knowledge of networking/wireless protocols will be useful.
Able to work independently and should be able to drive requirements and design across the teams.
'''
inference(job_desc=resume_summary, top_n=5)

100%|██████████| 23609/23609 [00:00<00:00, 25524.43it/s]

Input Job:

8+ years experience in software development and design skills
Experience building user-facing features, APIs and framework on the Mobile side.
Strong object oriented programming skills using C++/Swift/Java/Objective-C
Experience conducting exploratory or generative research
Machine Learning Modelling from experimentation and prototyping to deployment into production pipelines is preferred.
Automation and Scripting skills in Python
Excellent problem solving, communication and documentation skills
Knowledge of networking/wireless protocols will be useful.
Able to work independently and should be able to drive requirements and design across the teams.


Similar Jobs Returned by Fine-tuned Model:


Similarity score: 0.9381912350654602
Main Responsibilities:
As software engineer intern, you will build robust and scalable software, participate in brainstorming sessions and supply ideas to our technology, algorithms, and products, and work with the product and design teams to unde




In [35]:
# Query 2: a data engineering / ETL profile. Prints the top 5 matches from each model.
resume_summary = '''
Identify, design, and implement internal process improvements Automating manual processes, optimizing data delivery, re-designing infrastructure for greater scalability
Build the infrastructure required for optimal data extraction, transformation and loading of data from variety of data sources using SQL/R/Phyton and AWS “big data” technologies
Work with data science team to assess and troubleshoot potential data quality issues at input and transformation
Independently handling all projects with minimal supervision
Contribute towards building thought leadership in the Pricing domain
Rich experience as an ETL developer with strong data architecture knowledge around data warehousing concepts, SQL development and optimization, operational support models.
Bachelor’s / Master’s Degree in Information technology / Computer science / Computer applications
Experience performing root cause analysis on internal and external data and processes to answer specific question and identify opportunity for improvements
Strong analytical skills related to working with unstructured data
Experience in data mining tools/techniques and bigdata
Excellent verbal and written communication.
Comfortable working and interacting with senior client stakeholders
Good team player and demonstrates leadership by driving initiatives
Experience supporting and working with cross functional team in dynamic environment
'''
inference(job_desc=resume_summary, top_n=5)

100%|██████████| 23609/23609 [00:00<00:00, 24835.09it/s]

Input Job:

Identify, design, and implement internal process improvements Automating manual processes, optimizing data delivery, re-designing infrastructure for greater scalability
Build the infrastructure required for optimal data extraction, transformation and loading of data from variety of data sources using SQL/R/Phyton and AWS “big data” technologies
Work with data science team to assess and troubleshoot potential data quality issues at input and transformation
Independently handling all projects with minimal supervision
Contribute towards building thought leadership in the Pricing domain
Rich experience as an ETL developer with strong data architecture knowledge around data warehousing concepts, SQL development and optimization, operational support models.
Bachelor’s / Master’s Degree in Information technology / Computer science / Computer applications
Experience performing root cause analysis on internal and external data and processes to answer specific question and identify o




In [36]:
# Query 3: a construction site-supervisor profile. Prints the top 5 matches from each model.
resume_summary = '''
Supervisory and leadership skills
Knowledge of construction methods, standards, and regulations
Experience in managing resources and schedules
Excellent communication and interpersonal skills
Ability to read and interpret construction documents, blueprints, and drawings
Strong problem-solving and decision-making skills
Knowledge of computer applications such as MS Office and project management software
Degree or diploma in Civil Engineering or related field
3-5 years of experience as a Site Supervisor in the construction industry
'''
inference(job_desc=resume_summary, top_n=5)

100%|██████████| 23609/23609 [00:00<00:00, 25376.53it/s]

Input Job:

Supervisory and leadership skills
Knowledge of construction methods, standards, and regulations
Experience in managing resources and schedules
Excellent communication and interpersonal skills
Ability to read and interpret construction documents, blueprints, and drawings
Strong problem-solving and decision-making skills
Knowledge of computer applications such as MS Office and project management software
Degree or diploma in Civil Engineering or related field
3-5 years of experience as a Site Supervisor in the construction industry


Similar Jobs Returned by Fine-tuned Model:


Similarity score: 0.9090361595153809
Position Summary
The Intern Architect must be an energetic, self-motivated, pro-active, organized, and well-rounded individual that has a basic understanding of technical aspects of architecture. In addition, this individual must possess the skills required to successfully execute project tasks, have a strong work ethic, and be a dynamic team player. This role will 


