In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm

In [None]:
# 📓 Resume Text Cleaning and JD Matching

import pandas as pd
import re
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the spaCy pipeline once; the entity-extraction helper below reuses it.
spacy_model_name = "en_core_web_sm"
nlp = spacy.load(spacy_model_name)



In [None]:
def clean_text(text):
    """Normalize free text for bag-of-words comparison.

    The text is lowercased, every character that is neither a word character
    (letter, digit, underscore) nor whitespace is replaced by a space, and
    runs of whitespace are collapsed to a single space.

    Args:
        text: The string to clean. ``None`` is treated as an empty string.

    Returns:
        The cleaned, lowercased string with no leading or trailing whitespace.
    """
    if text is None:
        return ""
    lowered = text.lower()
    # Substitute a space instead of deleting the character: deleting fuses
    # tokens that were separated only by punctuation ("python/flask" would
    # become "pythonflask") and such fused tokens never match the other text.
    spaced = re.sub(r'[^\w\s]', ' ', lowered)
    # Collapse the extra spaces introduced above and trim both ends.
    return " ".join(spaced.split())


In [None]:
def extract_keywords(text, labels=("ORG", "PERSON", "SKILL", "WORK_OF_ART")):
    """Return the text of named entities whose label is in ``labels``.

    The text is run through the module-level ``nlp`` pipeline and the
    entities it reports are filtered by label, preserving document order
    (duplicates are kept).

    Args:
        text: Raw text to analyse. Empty or ``None`` input yields ``[]``
            without invoking the pipeline.
        labels: Entity labels to keep. A single label may be given as a
            plain string. The default matches the original hard-coded list.

    Returns:
        A list of entity strings.

    NOTE(review): "SKILL" does not appear to be part of the label scheme of
    the stock ``en_core_web_sm`` model, so it presumably only matches when a
    custom component (e.g. an entity ruler) adds it - confirm. Labels such as
    "PRODUCT" can be requested through ``labels`` if needed.
    """
    if not text:
        return []
    # A bare string would otherwise be treated as a collection of characters.
    if isinstance(labels, str):
        labels = (labels,)
    wanted = frozenset(labels)
    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ in wanted]

In [None]:
# Sample job description (JD) used as the reference text.
# The triple-quoted literal starts and ends with a newline character.
jd = """
We are looking for a skilled Python developer with experience in machine learning, natural language processing (NLP), and REST API integration. Familiarity with spaCy, Scikit-learn, and Flask is preferred.
"""

In [None]:
# Three sample resumes, one string per candidate.
resumes = [
    (
        "I am a Python developer with 3 years experience in NLP and building REST APIs. "
        "I've used Flask, Scikit-learn and spaCy in various projects."
    ),
    (
        "Experienced front-end developer with React, Vue.js and some exposure to backend APIs. "
        "No NLP or Python skills."
    ),
    (
        "Junior data analyst with some Python experience "
        "and basic ML understanding using scikit-learn."
    ),
]

In [None]:
# Run the job description and every resume through the same cleaner.
jd_clean = clean_text(jd)
resumes_clean = list(map(clean_text, resumes))

In [None]:
# Build one TF-IDF matrix over the JD and the resumes.
# The JD goes first, so it ends up as row 0 of the matrix.
all_docs = [jd_clean, *resumes_clean]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(all_docs)

In [None]:
# Cosine similarity between the first row of the matrix and each remaining row.
jd_vector = tfidf_matrix[:1]
resume_vectors = tfidf_matrix[1:]
similarities = cosine_similarity(jd_vector, resume_vectors)[0]
for i in range(len(similarities)):
    score = similarities[i]
    # Report resumes with 1-based numbering.
    print(f"Resume {i+1} similarity score: {score:.2f}")

In [None]:
# Named entities found in the raw (uncleaned) job description text.
jd_keywords = extract_keywords(jd)
print("\n🔍 JD Keywords:", jd_keywords)