In [1]:
import spacy
from spacy.matcher import PhraseMatcher
nlp = spacy.load("en_core_web_sm")

SKILLS = [
    "Python", "Java", "C++", "SQL",
    "Machine Learning", "Deep Learning",
    "Data Analysis", "Pandas", "NumPy",
    "TensorFlow", "PyTorch",
    "AWS", "Docker", "Kubernetes",
    "Git", "Linux"
]
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
patterns = [nlp.make_doc(skill) for skill in SKILLS]
matcher.add("TECHNICAL_SKILLS", patterns)

def extract_skills(resume_text):
    doc = nlp(resume_text)
    matches = matcher(doc)
    return sorted({doc[start:end].text for _, start, end in matches})
resume_text = """
Software engineer skilled in Python, Java, and SQL.
Worked on Machine Learning using TensorFlow and PyTorch.
Experience with AWS, Docker, Git, and Linux.
"""

print(extract_skills(resume_text))


['AWS', 'Docker', 'Git', 'Java', 'Linux', 'Machine Learning', 'PyTorch', 'Python', 'SQL', 'TensorFlow']


In [2]:
import spacy
from spacy.matcher import PhraseMatcher
nlp = spacy.load("en_core_web_sm")
SKILL_LIST = [
    "Python", "Java", "C++", "SQL",
    "Machine Learning", "Deep Learning",
    "Data Analysis", "Pandas", "NumPy",
    "TensorFlow", "PyTorch",
    "AWS", "Docker", "Git", "Linux"
]
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
matcher.add("SKILLS", [nlp.make_doc(skill) for skill in SKILL_LIST])

def match_skills(text):
    doc = nlp(text)
    matches = matcher(doc)
    return sorted(set(doc[start:end].text for _, start, end in matches))
text = """
I have experience in Python, SQL, and Data Analysis.
Worked with AWS and Docker, and built ML models using TensorFlow.
"""
print(match_skills(text))


['AWS', 'Data Analysis', 'Docker', 'Python', 'SQL', 'TensorFlow']


In [3]:
import spacy
from spacy.pipeline import EntityRuler
nlp = spacy.load("en_core_web_sm")
ruler = nlp.add_pipe("entity_ruler", before="ner")
skill_patterns = [
    {"label": "SKILL", "pattern": "Python"},
    {"label": "SKILL", "pattern": "Java"},
    {"label": "SKILL", "pattern": "SQL"},
    {"label": "SKILL", "pattern": "Machine Learning"},
    {"label": "SKILL", "pattern": "Deep Learning"},
    {"label": "SKILL", "pattern": "Data Analysis"},
    {"label": "SKILL", "pattern": "TensorFlow"},
    {"label": "SKILL", "pattern": "PyTorch"},
    {"label": "SKILL", "pattern": "AWS"},
    {"label": "SKILL", "pattern": "Docker"},
    {"label": "SKILL", "pattern": "Git"},
    {"label": "SKILL", "pattern": "Linux"}
]
ruler.add_patterns(skill_patterns)

def extract_skills_ner(text):
    doc = nlp(text)
    return sorted({ent.text for ent in doc.ents if ent.label_ == "SKILL"})
job_description = """
We are looking for a data engineer with strong Python and SQL skills.
Experience in Machine Learning, AWS, Docker, and TensorFlow is required.
Knowledge of Git and Linux is a plus.
"""
print(extract_skills_ner(job_description))


['AWS', 'Docker', 'Git', 'Linux', 'Machine Learning', 'Python', 'SQL', 'TensorFlow']


In [4]:
def normalize_skills(skills):
    """
    skills: iterable of extracted skill strings
    returns: sorted list of unique, lowercase skills
    """
    return sorted({skill.strip().lower() for skill in skills})
extracted_skills = [
    "Python", "python", "SQL", "Machine Learning",
    "machine learning", "AWS", "aws"
]
print(normalize_skills(extracted_skills))



['aws', 'machine learning', 'python', 'sql']


In [5]:
def store_skills(technical_skills, soft_skills):
    """
    technical_skills: iterable of technical skills
    soft_skills: iterable of soft skills
    returns: dictionary with categorized skills
    """
    return {
        "technical_skills": sorted({skill.strip().lower() for skill in technical_skills}),
        "soft_skills": sorted({skill.strip().lower() for skill in soft_skills})
    }
tech_skills = ["Python", "SQL", "AWS", "Docker", "python"]
soft_skills = ["Communication", "Teamwork", "Problem Solving", "communication"]

skills_dict = store_skills(tech_skills, soft_skills)
print(skills_dict)


{'technical_skills': ['aws', 'docker', 'python', 'sql'], 'soft_skills': ['communication', 'problem solving', 'teamwork']}


In [7]:
def merge_skills(matcher_skills, ner_skills):
    """
    matcher_skills: list of skills from spaCy PhraseMatcher
    ner_skills: list of skills from spaCy NER / EntityRuler
    returns: sorted, unique, lowercase skill list
    """
    merged = set()

    for skill in matcher_skills + ner_skills:
        merged.add(skill.strip().lower())

    return sorted(merged)
skills_from_matcher = ["Python", "SQL", "AWS", "Docker"]
skills_from_ner = ["python", "Machine Learning", "AWS", "TensorFlow"]

merged_skills = merge_skills(skills_from_matcher, skills_from_ner)
print(merged_skills)


['aws', 'docker', 'machine learning', 'python', 'sql', 'tensorflow']


In [8]:
SKILL_NORMALIZATION_MAP = {
    "ml": "machine learning",
    "machine learning": "machine learning",
    "dl": "deep learning",
    "deep learning": "deep learning",
    "nlp": "natural language processing",
    "natural language processing": "natural language processing",
    "ai": "artificial intelligence",
    "artificial intelligence": "artificial intelligence",
    "js": "javascript",
    "javascript": "javascript"
}

def resolve_skill_conflicts(skills):
    """
    skills: iterable of skill strings
    returns: sorted list of canonical skill names
    """
    resolved = set()

    for skill in skills:
        key = skill.strip().lower()
        canonical = SKILL_NORMALIZATION_MAP.get(key, key)
        resolved.add(canonical)

    return sorted(resolved)
raw_skills = [
    "ML", "Machine Learning", "DL",
    "Python", "NLP", "Natural Language Processing",
    "AI", "Artificial Intelligence"
]

print(resolve_skill_conflicts(raw_skills))


['artificial intelligence', 'deep learning', 'machine learning', 'natural language processing', 'python']


In [10]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-MiniLM-L6-v2')
sentences = ["I know Python and machine learning.", "Experience with SQL databases."]
master_skills = ["python", "sql", "machine learning", "deep learning"]
sentence_embeddings = model.encode(sentences, convert_to_tensor=True)
skill_embeddings = model.encode(master_skills, convert_to_tensor=True)
cos_scores = util.cos_sim(sentence_embeddings, skill_embeddings)
threshold = 0.6
matches = {}
for i, sentence in enumerate(sentences):
    matched_skills = [master_skills[j] for j, score in enumerate(cos_scores[i]) if score > threshold]
    matches[sentence] = matched_skills

print(matches)


{'I know Python and machine learning.': ['python'], 'Experience with SQL databases.': ['sql']}


In [11]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-MiniLM-L6-v2')
sentences = ["I know Python and ML.", "I enjoy team building.", "Experienced in SQL."]
master_skills = ["python", "machine learning", "sql", "deep learning"]
sentence_embeddings = model.encode(sentences, convert_to_tensor=True)
skill_embeddings = model.encode(master_skills, convert_to_tensor=True)
cos_scores = util.cos_sim(sentence_embeddings, skill_embeddings)
threshold = 0.6
matches = {}
for i, sentence in enumerate(sentences):
    matched_skills = [master_skills[j] for j, score in enumerate(cos_scores[i]) if score > threshold]
    matches[sentence] = matched_skills

print(matches)


{'I know Python and ML.': ['python'], 'I enjoy team building.': [], 'Experienced in SQL.': ['sql']}


In [12]:
merged_skills = ["python", "communication", "ml", "teamwork", "sql"]
skill_mapping = {
    "technical": ["python", "java", "sql", "machine learning", "docker", "git", "ml"],
    "soft": ["communication", "teamwork", "leadership", "problem solving"]
}
categorized_skills = {"technical_skills": [], "soft_skills": []}

for skill in merged_skills:
    if skill.lower() in skill_mapping["technical"]:
        categorized_skills["technical_skills"].append(skill)
    elif skill.lower() in skill_mapping["soft"]:
        categorized_skills["soft_skills"].append(skill)

print(categorized_skills)


{'technical_skills': ['python', 'ml', 'sql'], 'soft_skills': ['communication', 'teamwork']}


In [13]:
import spacy
from spacy.matcher import PhraseMatcher
from sentence_transformers import SentenceTransformer, util
nlp = spacy.load("en_core_web_sm")
master_skills = ["python", "java", "sql", "machine learning", "deep learning",
                 "docker", "git", "communication", "teamwork", "leadership", "problem solving"]
abbreviation_map = {"ml": "machine learning"}
skill_mapping = {
    "technical": ["python", "java", "sql", "machine learning", "deep learning", "docker", "git"],
    "soft": ["communication", "teamwork", "leadership", "problem solving"]
}
matcher = PhraseMatcher(nlp.vocab)
patterns = [nlp(skill) for skill in master_skills]
matcher.add("SKILLS", patterns)
bert_model = SentenceTransformer('all-MiniLM-L6-v2')

def extract_skills(resume_text, similarity_threshold=0.6):
    doc = nlp(resume_text)
    matcher_matches = matcher(doc)
    skills_matcher = [doc[start:end].text.lower() for _, start, end in matcher_matches]
    skills_ner = [ent.text.lower() for ent in doc.ents if ent.label_ in ["ORG", "PRODUCT", "SKILL"]]
    merged_skills = set(skills_matcher + skills_ner)
    merged_skills = [abbreviation_map.get(skill, skill) for skill in merged_skills]
    sentence_embeddings = bert_model.encode([resume_text], convert_to_tensor=True)
    skill_embeddings = bert_model.encode(master_skills, convert_to_tensor=True)
    cos_scores = util.cos_sim(sentence_embeddings, skill_embeddings)[0]

    for i, score in enumerate(cos_scores):
        if score > similarity_threshold:
            merged_skills.append(master_skills[i].lower())
    merged_skills = list(set(merged_skills))
    categorized_skills = {"technical_skills": [], "soft_skills": []}
    for skill in merged_skills:
        if skill in skill_mapping["technical"]:
            categorized_skills["technical_skills"].append(skill)
        elif skill in skill_mapping["soft"]:
            categorized_skills["soft_skills"].append(skill)

    return categorized_skills
resume_text = "Experienced in Python, ML, SQL and great teamwork skills."
skills = extract_skills(resume_text)
print(skills)


{'technical_skills': ['machine learning', 'sql'], 'soft_skills': ['teamwork']}


In [14]:
standardization_map = {
    "ml": "machine learning",
    "machine learning": "machine learning",
    "deep learning": "deep learning",
    "python": "python",
    "sql": "sql",
    "teamwork": "teamwork",
    "communication": "communication"
}
extracted_skills = ["ML", "machine learning", "Python", "SQL", "teamwork"]
standardized_skills = set()
for skill in extracted_skills:
    skill_lower = skill.lower()
    standardized_skills.add(standardization_map.get(skill_lower, skill_lower))

print(standardized_skills)


{'machine learning', 'teamwork', 'python', 'sql'}


In [15]:
spacy_skills = ["python", "machine learning", "sql"]
bert_skills = ["python", "ml", "deep learning"]
standardization_map = {
    "ml": "machine learning",
    "machine learning": "machine learning",
    "deep learning": "deep learning",
    "python": "python",
    "sql": "sql"
}
spacy_skills_std = set(standardization_map.get(skill.lower(), skill.lower()) for skill in spacy_skills)
bert_skills_std = set(standardization_map.get(skill.lower(), skill.lower()) for skill in bert_skills)
final_skills = spacy_skills_std.union(bert_skills_std - spacy_skills_std)

print(final_skills)


{'machine learning', 'python', 'sql', 'deep learning'}


In [16]:
import json
final_skills = {
    "technical_skills": ["python", "machine learning", "sql", "deep learning"],
    "soft_skills": ["teamwork", "communication"]
}
with open("final_skills.json", "w") as f:
    json.dump(final_skills, f, indent=4)

print("Saved final_skills.json successfully!")


Saved final_skills.json successfully!


In [17]:
ground_truth_skills = {"python", "machine learning", "sql", "teamwork", "communication"}
spacy_skills = {"python", "machine learning", "sql"}
bert_skills = {"python", "ml", "deep learning", "teamwork"}
combined_skills = {"python", "machine learning", "sql", "deep learning", "teamwork", "communication"}
standardization_map = {
    "ml": "machine learning",
    "machine learning": "machine learning",
    "deep learning": "deep learning",
    "python": "python",
    "sql": "sql",
    "teamwork": "teamwork",
    "communication": "communication"
}
bert_skills_std = set(standardization_map.get(skill.lower(), skill.lower()) for skill in bert_skills)
def evaluate(predicted, ground_truth):
    predicted_set = set(predicted)
    ground_truth_set = set(ground_truth)

    true_positives = len(predicted_set & ground_truth_set)
    false_positives = len(predicted_set - ground_truth_set)
    false_negatives = len(ground_truth_set - predicted_set)

    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1
pipelines = {
    "spaCy-only": spacy_skills,
    "BERT-only": bert_skills_std,
    "Combined": combined_skills
}

print("Skill Extraction Pipeline Comparison:")
for name, skills in pipelines.items():
    precision, recall, f1 = evaluate(skills, ground_truth_skills)
    print(f"{name}: Precision={precision:.2f}, Recall={recall:.2f}, F1={f1:.2f}")


Skill Extraction Pipeline Comparison:
spaCy-only: Precision=1.00, Recall=0.60, F1=0.75
BERT-only: Precision=0.75, Recall=0.60, F1=0.67
Combined: Precision=0.83, Recall=1.00, F1=0.91
