In [2]:
pip install sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.54.0-py3-none-any.whl.metadata (41 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.7.1-cp313-cp313-win_amd64.whl.metadata (28 kB)
Collecting scikit-learn (from sentence-transformers)
  Using cached scikit_learn-1.7.1-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting scipy (from sentence-transformers)
  Downloading scipy-1.16.1-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.34.2-py3-none-any.whl.metadata (14 kB)
Collecting Pillow (from sentence-transformers)
  Using cached pillow-11.3.0-cp313-cp313-win_amd64.whl.metadata (9.2 kB)
Collecting filelock (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Using cached filelock-3.18.0-py3-none-any.whl.metad

In [3]:

import pandas as pd
import re
from sentence_transformers import SentenceTransformer, util

df = pd.read_csv("../data/output_csvs/evaluated_resumes.csv")

job_description = """
We are seeking a Data Scientist with experience in Python, SQL, Machine Learning, NLP, and AWS. 
The ideal candidate should be proficient in building predictive models, handling large datasets, 
and working with cloud services. Strong communication and presentation skills are required.
"""
def extract_keywords(text):

    words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
    stopwords = {"the","and","are","with","for","our","this","that","from","your","you","but"}
    keywords = [w for w in words if w not in stopwords]
    return list(set(keywords))

jd_keywords = extract_keywords(job_description)

print("🔑 JD Keywords:", jd_keywords)


model = SentenceTransformer('all-MiniLM-L6-v2')

def compute_match_score(resume_text, jd_text):
    emb_resume = model.encode(resume_text, convert_to_tensor=True)
    emb_jd = model.encode(jd_text, convert_to_tensor=True)
    similarity = util.cos_sim(emb_resume, emb_jd).item()
    return round(similarity * 100, 2)  # Scale to percentage


match_scores, matched_keywords, missing_keywords = [], [], []

for _, row in df.iterrows():
    resume_text = str(row.get("Raw_Text", ""))

    
    score = compute_match_score(resume_text, job_description)

    resume_words = set(re.findall(r'\b[a-zA-Z]{3,}\b', resume_text.lower()))
    found = [kw for kw in jd_keywords if kw in resume_words]
    missing = [kw for kw in jd_keywords if kw not in resume_words]

    match_scores.append(score)
    matched_keywords.append(", ".join(found))
    missing_keywords.append(", ".join(missing))

print("✅ Matching complete.")

# === Save Output ===
df["Match_Score"] = match_scores
df["Matched_Keywords"] = matched_keywords
df["Missing_Keywords"] = missing_keywords

output_path = "../data/output_csvs/matched_resumes.csv"
df.to_csv(output_path, index=False)
print(f"📁 Saved matched results to: {output_path}")


  from .autonotebook import tqdm as notebook_tqdm


🔑 JD Keywords: ['ideal', 'predictive', 'working', 'presentation', 'handling', 'strong', 'python', 'scientist', 'aws', 'building', 'large', 'datasets', 'sql', 'communication', 'learning', 'machine', 'seeking', 'data', 'candidate', 'proficient', 'nlp', 'required', 'skills', 'cloud', 'services', 'experience', 'models', 'should']


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
  return forward_call(*args, **kwargs)


✅ Matching complete.
📁 Saved matched results to: ../data/output_csvs/matched_resumes.csv


In [15]:
import pandas as pd
import re
from sentence_transformers import SentenceTransformer, util
import google.generativeai as genai

# === Configure Gemini ===
genai.configure(api_key="YOUR_GEMINI_API_KEY")
gemini_model = genai.GenerativeModel("gemini-pro")

# === Load Evaluated Resumes ===
df = pd.read_csv("../data/output_csvs/evaluated_resumes.csv")

# === Sample Job Description ===
job_description = """
We are seeking a Data Scientist with experience in Python, SQL, Machine Learning, NLP, and AWS. 
The ideal candidate should be proficient in building predictive models, handling large datasets, 
and working with cloud services. Strong communication and presentation skills are required.
"""

# === Extract Keywords ===
def extract_keywords(text):
    words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
    stopwords = {"the","and","are","with","for","our","this","that","from","your","you","but"}
    return list(set([w for w in words if w not in stopwords]))

jd_keywords = extract_keywords(job_description)

print("🔑 JD Keywords:", jd_keywords)

# === Sentence Transformer Model ===
model = SentenceTransformer('all-MiniLM-L6-v2')

def compute_match_score(resume_text, jd_text):
    emb_resume = model.encode(resume_text, convert_to_tensor=True)
    emb_jd = model.encode(jd_text, convert_to_tensor=True)
    similarity = util.cos_sim(emb_resume, emb_jd).item()
    return round(similarity * 100, 2)

# === Gemini Feedback Generator ===
def generate_match_feedback(resume_name, score, missing_keywords):
    prompt = f"""
    You are an expert career coach. Analyze why this resume scored {score}% match with the JD.
    Missing keywords: {missing_keywords}.
    Provide 3–4 bullet point suggestions to improve alignment for {resume_name}.
    """
    try:
        response = gemini_model.generate_content(prompt)
        return response.text
    except Exception as e:
        return f"Error generating feedback: {e}"

# === Matching Loop ===
match_scores, matched_keywords, missing_keywords, ai_feedback = [], [], [], []

for _, row in df.iterrows():
    resume_text = str(row.get("Raw_Text", ""))
    name = str(row.get("Name", "Candidate"))

    # 1. Semantic similarity
    score = compute_match_score(resume_text, job_description)

   # 2. Enhanced keyword match (fuzzy + synonyms)
    matched, missing = fuzzy_match_keywords(resume_text, jd_keywords)


    # 3. Gemini feedback
    feedback = generate_match_feedback(name, score, ", ".join(missing))

    match_scores.append(score)
    matched_keywords.append(", ".join(found))
    missing_keywords.append(", ".join(missing))
    ai_feedback.append(feedback)

print("✅ Matching & AI feedback complete.")

# === Save Results ===
df["Match_Score"] = match_scores
df["Matched_Keywords"] = matched_keywords
df["Missing_Keywords"] = missing_keywords
df["Match_Feedback"] = ai_feedback

output_path = "../data/output_csvs/matched_resumes.csv"
df.to_csv(output_path, index=False)
print(f"📁 Final output saved to: {output_path}")


🔑 JD Keywords: ['ideal', 'predictive', 'working', 'presentation', 'handling', 'strong', 'python', 'scientist', 'aws', 'building', 'large', 'datasets', 'sql', 'communication', 'learning', 'machine', 'seeking', 'data', 'candidate', 'proficient', 'nlp', 'required', 'skills', 'cloud', 'services', 'experience', 'models', 'should']


  return forward_call(*args, **kwargs)


✅ Matching & AI feedback complete.
📁 Final output saved to: ../data/output_csvs/matched_resumes.csv


In [16]:
!pip install fuzzywuzzy python-Levenshtein

import re
from fuzzywuzzy import fuzz

# === Expanded Synonym Map ===
synonym_map = {
    "ml": "machine learning",
    "nlp": "natural language processing",
    "aws": "amazon web services",
    "sql": "structured query language",
    "python": "python",
    "predictive": "predictive modeling",
    "model": "predictive modeling"
}

# === Extract Keywords with Synonyms ===
def extract_keywords(text):
    words = re.findall(r'\b[a-zA-Z]{2,}\b', text.lower())
    stopwords = {"the","and","are","with","for","our","this","that","from","your","you","but"}
    filtered = [w for w in words if w not in stopwords]
    expanded = []
    for w in filtered:
        expanded.append(w)
        if w in synonym_map:
            expanded.append(synonym_map[w])
    return list(set(expanded))

jd_keywords = extract_keywords(job_description)

# === Enhanced Matching Function ===
def fuzzy_match_keywords(resume_text, jd_keywords):
    resume_words = re.findall(r'\b[a-zA-Z]{2,}\b', resume_text.lower())
    matched, missing = [], []
    for kw in jd_keywords:
        found = any(fuzz.ratio(kw, rw) > 80 for rw in resume_words)  # fuzzy threshold 80%
        if found:
            matched.append(kw)
        else:
            missing.append(kw)
    return matched, missing




In [19]:
#  Filter Top Resumes by Threshold ===
threshold = 40  # Set your desired cutoff (e.g., 70%)

top_resumes = df[df["Match_Score"] >= threshold].sort_values(by="Match_Score", ascending=False)

print(f"✅ Found {len(top_resumes)} resumes with Match_Score >= {threshold}%")

# Save filtered resumes separately
top_output_path = "../data/output_csvs/top_matched_resumes.csv"
top_resumes.to_csv(top_output_path, index=False)

print(f"📁 Top matched resumes saved to: {top_output_path}")


✅ Found 6 resumes with Match_Score >= 40%
📁 Top matched resumes saved to: ../data/output_csvs/top_matched_resumes.csv


In [20]:
import shutil
import os

# === Auto-Copy Top Resumes ===
source_dir = "../data/Resumes/"        # Your main resume folder
target_dir = "../data/top_resumes/"    # New folder for shortlisted resumes
os.makedirs(target_dir, exist_ok=True)

for filename in top_resumes["Filename"]:
    src = os.path.join(source_dir, filename)
    dst = os.path.join(target_dir, filename)
    if os.path.exists(src):
        shutil.copy(src, dst)

print(f"📂 Copied {len(top_resumes)} top resumes to: {target_dir}")


📂 Copied 6 top resumes to: ../data/top_resumes/


In [21]:
# === Generate Recruiter Summary Report ===
summary_path = "../data/output_csvs/top_resume_summary.md"

with open(summary_path, "w", encoding="utf-8") as f:
    f.write("# 📄 Top Resume Summary Report\n\n")
    f.write(f"**Total Resumes Shortlisted:** {len(top_resumes)}\n\n")

    for idx, row in top_resumes.iterrows():
        f.write(f"## {row['Name']} (Match Score: {row['Match_Score']}%)\n")
        f.write(f"- **Matched Keywords:** {row['Matched_Keywords']}\n")
        f.write(f"- **Missing Keywords:** {row['Missing_Keywords']}\n")
        f.write(f"- **AI Feedback:**\n  {row['Match_Feedback']}\n\n")
        f.write("---\n\n")

print(f"📝 Summary report generated: {summary_path}")


📝 Summary report generated: ../data/output_csvs/top_resume_summary.md
