In [1]:
# ===============================
# Resume Screening NLP Project
# ===============================

import os
import re
import pandas as pd
from docx import Document
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords


In [2]:
# -------------------------------
# Step 1: Read resumes from folder
# -------------------------------

folder = "Resumes"

texts = []
names = []

for file in os.listdir(folder):

    if file.lower().endswith(".docx"):

        path = os.path.join(folder, file)

        doc = Document(path)

        # combine all paragraphs into single text
        text = "\n".join([p.text for p in doc.paragraphs])

        texts.append(text)
        names.append(file)


print(f" Total resumes loaded: {len(texts)}")
print("\nSample preview:\n")
print(texts[0][:400])


 Total resumes loaded: 228

Sample preview:

Name: Abiral Pandey
Email: abiral.pandey88@gmail.com
Phone: 940-242-3303
Current Location: Woonsocket, Rhode Island
Visa Status: US Citizen

SUMMARY:
Dynamic individual with 6 years of software development experience in design, development, deployment, maintenance, production and support of web - based and Client-Server business applications using OOP and Java/J2EE technologies.
Exposure to all ph


In [3]:
# -------------------------------
# Step 2: Text Cleaning
# -------------------------------

stop_words = set(stopwords.words("english"))

def clean_text(text):

    text = text.lower()                    # lowercase
    text = re.sub(r'[^a-z\s]', ' ', text)  # remove symbols

    words = text.split()

    # remove stopwords
    words = [w for w in words if w not in stop_words]

    return " ".join(words)


cleaned_texts = [clean_text(t) for t in texts]

print("Before:\n", texts[0][:200])
print("\nAfter:\n", cleaned_texts[0][:200])


Before:
 Name: Abiral Pandey
Email: abiral.pandey88@gmail.com
Phone: 940-242-3303
Current Location: Woonsocket, Rhode Island
Visa Status: US Citizen

SUMMARY:
Dynamic individual with 6 years of software develo

After:
 name abiral pandey email abiral pandey gmail com phone current location woonsocket rhode island visa status us citizen summary dynamic individual years software development experience design developme


In [4]:
# -------------------------------
# Step 3: Job description input
# -------------------------------

job_description = """
python sql machine learning pandas flask data analysis
"""

job_description = clean_text(job_description)


# -------------------------------
# Step 4: TF-IDF Vectorization
# -------------------------------

docs = cleaned_texts + [job_description]

vectorizer = TfidfVectorizer()

tfidf_matrix = vectorizer.fit_transform(docs)


# -------------------------------
# Step 5: Cosine similarity
# -------------------------------

scores = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])[0]


# -------------------------------
# Step 6: Create ranking table
# -------------------------------

df = pd.DataFrame({
    "Resume": names,
    "Score": scores
})

df["Match %"] = (df["Score"] * 100).round(2)

df = df.sort_values("Score", ascending=False).reset_index(drop=True)


In [5]:
# -------------------------------
# Extra: Skill extraction
# -------------------------------

skills = [
    "python", "sql", "machine learning", "pandas",
    "flask", "java", "excel", "tableau",
    "powerbi", "aws"
]

def extract_skills(text):
    found = [s for s in skills if s in text]
    return ", ".join(found)


df["Skills Found"] = [extract_skills(t) for t in cleaned_texts]


In [6]:
# -------------------------------
# Step 7: Final output
# -------------------------------

print("\n Top Candidates:\n")

display(df.head(10))


# save result for HR
df.to_csv("shortlisted_candidates.csv", index=False)

print("\n File saved: shortlisted_candidates.csv")



 Top Candidates:



Unnamed: 0,Resume,Score,Match %,Skills Found
0,Vijay Bhargav.docx,0.093124,9.31,"sql, java, excel, aws"
1,Kashyap K. Vora resume.docx,0.089017,8.9,"python, sql, java, excel, aws"
2,vema reddy.docx,0.064865,6.49,excel
3,Manohar Reddy.docx,0.06411,6.41,"java, excel"
4,Vivek.BSA.docx,0.056996,5.7,"sql, java, excel"
5,Madhu_BA_AW.DOCX,0.056063,5.61,"sql, java, excel, tableau"
6,Amar Sr BSA.docx,0.05326,5.33,"sql, java, excel, tableau"
7,Mani_Hadoop.docx,0.051147,5.11,"sql, java, excel, aws"
8,manish_singh_resume.docx,0.050943,5.09,"sql, java, excel, tableau, aws"
9,Mohammad Resume.docx,0.050049,5.0,"sql, excel"



 File saved: shortlisted_candidates.csv
