In [1]:
# !pip install huggingface_hub
# !pip install -U "accelerate>=0.26.0"
# !pip install transformers[torch]
# !pip install -q sentence-transformers torch pandas tqdm

In [2]:
# from huggingface_hub import login

# login("hf_Your_key")

In [3]:
# from huggingface_hub import snapshot_download

# # Choose where to save the files. Change if you like.
# local_dir = r"C:\Users\Jay\Desktop\GenAI\Candidate_recommandation\data"

# snapshot_download(
#     repo_id="netsol/resume-score-details",
#     repo_type="dataset",
#     local_dir=local_dir,
#     local_dir_use_symlinks=False,
#     # allow_patterns=["data/*.json"],  # Only get the data/*.json files!
# )

# print("✅ Download complete!")


In [4]:
import os
import glob
import json
import pandas as pd
import random
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Replace with the path where your dataset JSONs are extracted
DATA_PATH = r'C:\Users\Jay\Desktop\GenAI\Candidate_recommandation\data'

matched, mismatched = [], []

for path in tqdm(glob.glob(os.path.join(DATA_PATH, '*.json'))):
    with open(path, 'r', encoding='utf-8') as f:
        entry = json.load(f)
    jd = entry["input"].get("job_description", "")
    resume = entry["input"].get("resume", "")
    macro_scores = entry.get("output", {}).get("scores", {}).get("aggregated_scores", {}).get("macro_scores", None)

    # Heuristic: macro_scores >= 3.5 means "matched", <= 2.0 means "mismatched"
    if jd and resume and macro_scores is not None:
        if macro_scores >= 3.5:
            matched.append({'jd': jd, 'resume': resume, 'label': 1})
        elif macro_scores <= 2.0:
            mismatched.append({'jd': jd, 'resume': resume, 'label': 0})

print(f"Found {len(matched)} matched and {len(mismatched)} mismatched samples.")

100%|████████████████████████████████████████████████████████████████████████████| 1031/1031 [00:00<00:00, 1296.13it/s]

Found 741 matched and 39 mismatched samples.





In [6]:
# Balance and shuffle
random.shuffle(matched)
random.shuffle(mismatched)
balanced_data = matched[:200] + mismatched[:200]
random.shuffle(balanced_data)
df_balanced = pd.DataFrame(balanced_data)
print(df_balanced['label'].value_counts())
df_balanced.head()

label
1    200
0     39
Name: count, dtype: int64


Unnamed: 0,jd,resume,label
0,### Business Development Manager\n\n**Fictiona...,Muhammad Bilal Amin \n+92-323-321-5863 | mbil...,1
1,**Business Development Manager** \nJoin [Fict...,Fresh Electrical Engineering graduate with str...,0
2,Join **TechVentures Inc.** as a **Senior Proje...,\nMuhammad Tayyab Aftab \n \nPROFESSIONAL ...,1
3,**Title: Business Development Manager** \n\n*...,SULTAN MOUZAM\nCHUGHTAI \nB2B|B2C SALES & MARK...,1
4,### Business Development Manager\n\nJoin **Inn...,"Addres\nJohar Town, Lahore\nPhone\n+92 334 705...",1


In [7]:
# Prepare SBERT Training Examples
train_examples = [
    InputExample(texts=[row.jd, row.resume], label=float(row.label))
    for _, row in df_balanced.iterrows()
]
print(f"Total training pairs: {len(train_examples)}")

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

Total training pairs: 239


In [8]:
# Load Base Model
base_model_name = 'all-MiniLM-L6-v2'  # You can try others from SBERT docs
model = SentenceTransformer(base_model_name)

In [9]:
# Fine-Tune the Model

train_loss = losses.CosineSimilarityLoss(model=model)

epochs = 3  # More epochs if you want (2-4 typical for this size)
def loss_printer(score, epoch, step, total_steps):
    print(f"Epoch {epoch} Step {step}/{total_steps} | Training Loss: {score:.4f}", flush=True)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=epochs,
    warmup_steps=50,
    show_progress_bar=True,
    callback=loss_printer
)



Step,Training Loss


In [12]:
# Save the Fine-Tuned Model

save_path = './fine-tuned-sbert-resume-matcher-binary'
model.save(save_path)
print(f"Model saved to {save_path}")



Model saved to ./fine-tuned-sbert-resume-matcher-binary


In [13]:
# Quick Test: Check Your Model

model = SentenceTransformer(save_path)
# Try a relevant pair and an irrelevant pair
jd_ex = df_balanced[df_balanced['label']==1].iloc[0]
resume_ex_good = jd_ex.resume
jd_text = jd_ex.jd
# Pick a mismatched resume
resume_ex_bad = df_balanced[df_balanced['label']==0].iloc[0].resume

emb_jd = model.encode([jd_text])[0]
emb_good = model.encode([resume_ex_good])[0]
emb_bad = model.encode([resume_ex_bad])[0]

from numpy import dot
from numpy.linalg import norm
sim_good = dot(emb_jd, emb_good) / (norm(emb_jd) * norm(emb_good))
sim_bad = dot(emb_jd, emb_bad) / (norm(emb_jd) * norm(emb_bad))
print(f"Similarity (Good Pair): {sim_good:.4f}")
print(f"Similarity (Bad Pair): {sim_bad:.4f}")

Similarity (Good Pair): 0.8546
Similarity (Bad Pair): 0.3498
