In [22]:
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer , util

In [23]:
# Leave `device` unset so the library selects a GPU when one is usable and
# falls back to CPU otherwise; pinning "cuda" made this cell fail on
# CPU-only machines.
model = SentenceTransformer("all-MiniLM-L6-v2")


In [24]:
# Single place to repoint the notebook at a different copy of the datasets.
DATA_DIR = Path(r"D:\Projects\ATS_Model")

# Each source names its text and label columns differently, so every frame is
# renamed to the shared `resume_text` / `category` schema before merging.
df_bert = pd.read_csv(DATA_DIR / "cs_resume_dataset_realistic.csv")
df_1 = pd.read_csv(DATA_DIR / "Resume.csv")
df_2 = pd.read_csv(DATA_DIR / "UpdatedResumeDataSet.csv")
df_3 = pd.read_csv(DATA_DIR / "Preprocessed_Data.csv")
df_bert = df_bert.rename(columns={'SKILLS': 'resume_text', 'JOB_CATEGORY': 'category'})
df_1 = df_1.rename(columns={'Resume_str': 'resume_text', 'Category': 'category'})
df_2 = df_2.rename(columns={'Resume': 'resume_text', 'Category': 'category'})
df_3 = df_3.rename(columns={'Text': 'resume_text', 'Category': 'category'})

merged_df = pd.concat([df_bert, df_1, df_2, df_3], ignore_index=True)
merged_df = merged_df.drop(columns=['ID','Resume_html'])

# A row without text cannot be embedded, and a row without a label cannot be
# assigned to a class centroid, so both are removed before building the
# model inputs. X_bert and y_bert stay index-aligned because they are taken
# from the same filtered frame.
merged_df = merged_df.dropna(subset=['resume_text', 'category']).reset_index(drop=True)

# astype(str) guards against non-string cells reaching the tokenizer.
X_bert = merged_df['resume_text'].astype(str).tolist()
y_bert = merged_df['category'].tolist()

In [25]:
# Embed every resume text in one call. Non-tensor output is requested because
# the centroid step below stacks the vectors with np.vstack.
X_embeddings = model.encode(
    X_bert,
    convert_to_tensor=False,
    show_progress_bar=True,
)

Batches:   0%|          | 0/2089 [00:00<?, ?it/s]

In [32]:
# Bucket each embedding under its category label.
class_embeddings = defaultdict(list)
for label, emb in zip(y_bert, X_embeddings):
    class_embeddings[label].append(emb)

# One centroid per category: the element-wise mean of that category's embeddings.
centroids = {}
for label, embs in class_embeddings.items():
    centroids[label] = np.vstack(embs).mean(axis=0)

def score_resume(text, model=model, centroids=centroids):
    """Score a resume against every category centroid.

    Args:
        text: Resume text to embed.
        model: Encoder exposing ``encode``. The default is the object bound
            to ``model`` when this cell is executed.
        centroids: Mapping of category label to centroid embedding. The
            default is the object bound to ``centroids`` when this cell is
            executed, so re-run this cell after rebuilding the centroids.

    Returns:
        dict mapping each label to its cosine similarity with the resume,
        rescaled from -1..1 to 0..1, ordered from best to worst match.
    """
    query_vec = model.encode([text], convert_to_tensor=False)[0]

    ranked = [
        (label, (util.cos_sim(query_vec, centroid).item() + 1) / 2)
        for label, centroid in centroids.items()
    ]
    # Highest similarity first; dicts keep insertion order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return dict(ranked)

In [33]:
# Sample input: a comma-separated skills list used as the resume text in the
# demo cells below.
resume_text = "Python, Java, C, C++, HTML, CSS, JavaScript, SQL, NoSQL, VS Code, Git, GitHub, Figma, WSL,: Linux, Node.js, Express.js, JWT, Passport.js, React.js, Vite, Socket.io , WebRTC, PDFKit,Mongoose, Bootstrap, REST API, EJS, Postman, Docker, AWS Lambda,MySQL, MongoDB, DynamoDB, AWS, Render, NGINX, Apache"
# Similarity of the sample against every category, ordered best-first by score_resume.
scores = score_resume(resume_text)


In [34]:
def score_for_job(text, job_name, model=model, centroids=centroids):
    """Score one resume against a single job category.

    Args:
        text: Resume text to embed.
        job_name: Category label; must be a key of ``centroids``.
        model: Encoder exposing ``encode``. The default is the object bound
            to ``model`` when this cell is executed.
        centroids: Mapping of category label to centroid embedding. The
            default is the object bound to ``centroids`` when this cell is
            executed, so re-run this cell after rebuilding the centroids.

    Returns:
        Cosine similarity between the resume embedding and the category
        centroid, rescaled from -1..1 to 0..1.

    Raises:
        ValueError: If ``job_name`` is not a key of ``centroids``.
    """
    # Validate first so an unknown category fails immediately instead of
    # after paying for a model forward pass.
    if job_name not in centroids:
        raise ValueError(f"Job '{job_name}' not found in categories.")

    q = model.encode([text], convert_to_tensor=False)[0]
    s = util.cos_sim(q, centroids[job_name]).item()
    return (s + 1) / 2   # normalized 0–1


In [53]:
def closest_job_name(query, model, labels):
    """Find the label whose embedding is most similar to a free-text query.

    Args:
        query: Free-text job name to resolve (e.g. an abbreviation).
        model: Encoder exposing ``encode(..., convert_to_tensor=True)``.
        labels: Iterable of candidate label strings. Any iterable is
            accepted (list, dict keys, ...); it is materialised once.

    Returns:
        Tuple ``(best_label, similarity)`` where ``similarity`` is the raw
        cosine similarity (not rescaled) between the query and ``best_label``.

    Raises:
        ValueError: If ``labels`` is empty.
    """
    # Materialise so the result can be indexed even when a view or
    # generator is passed in.
    labels = list(labels)
    if not labels:
        raise ValueError("labels must contain at least one job name.")

    q_emb = model.encode([query], convert_to_tensor=True)
    label_embs = model.encode(labels, convert_to_tensor=True)

    # Row 0 holds the similarities of the single query against every label.
    sims = util.cos_sim(q_emb, label_embs)[0]
    best_idx = sims.argmax().item()

    return labels[best_idx], sims[best_idx].item()
    

# Example usage: resolve a free-text abbreviation to the nearest category label.
job_labels = list(centroids.keys())
query = "sde"
match, score = closest_job_name(query, model, job_labels)

# Print the query that was actually searched; the message previously
# hard-coded 'Frontend' regardless of the input.
print(f"Closest job to '{query}' is '{match}' with similarity {score:.4f}")


Closest job to 'Frontend' is 'SDET' with similarity 0.7176


In [None]:
# Free-text job name; it does not have to be an exact category label because
# it is resolved to the nearest label below.
job_name = "testing"   # try a specific category name from your dataset

# Resolve the free-text name to the closest existing category label.
# NOTE(review): `scoreo` (name-to-label similarity) is never used afterwards.
match, scoreo = closest_job_name(job_name, model, job_labels)

# Score the sample resume against the resolved category. The message prints
# the free-text `job_name`, not the resolved `match`.
score = score_for_job(resume_text, match)
print(f"Match with {job_name}: {score*100:.1f}%")


Match with testing: 63.1%


In [55]:
# `scores` comes back from score_resume ordered best-first, so the first
# three items are the three strongest category matches.
top3 = list(scores.items())[:3]

print("Top 3 independent matches:")
# NOTE: the loop variable `score` overwrites the notebook-level `score`
# assigned in earlier cells.
for job, score in top3:
    print(f"{job}: {score*100:.1f}%")


Top 3 independent matches:
Full Stack Developer: 90.7%
Frontend Developer: 88.1%
Backend Developer: 87.2%


In [56]:
import pickle

# Persist the per-category centroids together with the list of category
# labels so they can be loaded later without re-encoding the dataset.
ats_payload = {
    "centroids": centroids,
    "job_labels": list(centroids.keys()),
}
with open("ats_data.pkl", "wb") as f:
    pickle.dump(ats_payload, f)
