In [1]:
import PyPDF2



In [2]:
def extract_text_from_pdf(file_path):
    text = ""
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            text += page.extract_text()
    return text

In [3]:

sample_text = extract_text_from_pdf(r"C:\Users\lakshmi\OneDrive\Desktop\ml projects\data\data\DESIGNER\11807040.pdf")

print(sample_text[:500])

FORMS DESIGNER
Professional Summary
Professionally trained 
[job title]
 
with experience ensuring high standards of culturally competent care for wide variety of patients with diverse
needs. Responsible 
[job title]
 
with excellent communication skills demonstrated by 
[number]
 
years of experience in healthcare.
Skills
Strong clinical judgment
High level of autonomy
Patient/family focused
Patient evaluation/intervention
Professional bedside manner
Adept at prioritizing/managing deadlines
Saf


In [4]:
import os

In [5]:
resume_folder = r"C:\Users\lakshmi\OneDrive\Desktop\ml projects\data\data\DESIGNER"

resumes = []

for file in os.listdir(resume_folder):
    if file.endswith(".pdf"):
        full_path = os.path.join(resume_folder, file)
        text = extract_text_from_pdf(full_path)
        resumes.append(text)

len(resumes)

107

In [6]:
job_description = """
Looking for candidates with skills in Python, Machine Learning, Data Analysis, SQL, Statistics, and Problem Solving.
"""

job_description

'\nLooking for candidates with skills in Python, Machine Learning, Data Analysis, SQL, Statistics, and Problem Solving.\n'

In [7]:
documents = resumes + [job_description]

len(documents)

108

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

tfidf_matrix = vectorizer.fit_transform(documents)

tfidf_matrix.shape

(108, 8322)

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_scores = cosine_similarity(
    tfidf_matrix[-1],    # Job Description (last row)
    tfidf_matrix[:-1]    # All resumes
)

similarity_scores.shape

(1, 107)

In [11]:
scores = similarity_scores.flatten()

scores[:10]

array([0.03928298, 0.05326723, 0.11089013, 0.07702872, 0.05692772,
       0.06534277, 0.1019987 , 0.06828086, 0.11823834, 0.0568362 ])

In [12]:
ranked_indices = scores.argsort()[::-1]

ranked_indices[:10]

array([29, 66,  8, 42, 90,  2, 77, 59, 33, 25])

In [13]:
for i in ranked_indices[:5]:
    print("Candidate Score:", scores[i])
    print("-" * 40)

Candidate Score: 0.13815303119069688
----------------------------------------
Candidate Score: 0.12449355263967507
----------------------------------------
Candidate Score: 0.1182383395586597
----------------------------------------
Candidate Score: 0.11478942555639313
----------------------------------------
Candidate Score: 0.11223783286882029
----------------------------------------


In [14]:
required_skills = [
    "python",
    "machine learning",
    "data analysis",
    "sql",
    "statistics"
]

In [15]:
def find_missing_skills(resume_text, skills):
    resume_text = resume_text.lower()
    missing = [skill for skill in skills if skill not in resume_text]
    return missing

In [16]:
top_candidate_index = ranked_indices[0]

top_resume_text = resumes[top_candidate_index]

missing_skills = find_missing_skills(top_resume_text, required_skills)

missing_skills

['python', 'machine learning', 'data analysis', 'sql', 'statistics']

In [18]:
import os

dataset_folder = r"C:\Users\lakshmi\OneDrive\Desktop\ml projects\data\data"

resumes = []

for category in os.listdir(dataset_folder):
    category_path = os.path.join(dataset_folder, category)
    
    if os.path.isdir(category_path):
        for file in os.listdir(category_path):
            if file.endswith(".pdf"):
                full_path = os.path.join(category_path, file)
                text = extract_text_from_pdf(full_path)
                
                if text:          # avoids blank PDFs
                    resumes.append(text)

len(resumes)

2483

In [19]:
job_description = """
Looking for a creative UI/UX Designer with experience in Figma,
wireframing, prototyping, user research, design systems,
visual hierarchy, and interaction design.
"""

documents = resumes + [job_description]

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

tfidf_matrix.shape

(2484, 42698)

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_scores = cosine_similarity(
    tfidf_matrix[-1],      # JD
    tfidf_matrix[:-1]      # Resumes
)

scores = similarity_scores.flatten()

In [22]:
ranked_indices = scores.argsort()[::-1]

scores[ranked_indices[:10]]

array([0.34534173, 0.30014227, 0.16825739, 0.15917263, 0.15688471,
       0.15231213, 0.15031007, 0.15029234, 0.14955441, 0.13845279])

In [24]:
top_n = 10
top_candidates = ranked_indices[:top_n]

top_candidates

array([1357, 1334, 1281, 1254,  364, 1306, 1311, 1326, 1333, 1338])

In [25]:
for rank, idx in enumerate(top_candidates, start=1):
    print(f"Rank {rank}")
    print("Similarity Score:", scores[idx])
    print("-" * 50)

Rank 1
Similarity Score: 0.34534173212533814
--------------------------------------------------
Rank 2
Similarity Score: 0.3001422675025145
--------------------------------------------------
Rank 3
Similarity Score: 0.1682573858733621
--------------------------------------------------
Rank 4
Similarity Score: 0.15917262539866775
--------------------------------------------------
Rank 5
Similarity Score: 0.15688470752423533
--------------------------------------------------
Rank 6
Similarity Score: 0.15231213406885524
--------------------------------------------------
Rank 7
Similarity Score: 0.15031007109352412
--------------------------------------------------
Rank 8
Similarity Score: 0.15029234040631836
--------------------------------------------------
Rank 9
Similarity Score: 0.14955441034803746
--------------------------------------------------
Rank 10
Similarity Score: 0.1384527899738154
--------------------------------------------------


In [26]:
for rank, idx in enumerate(top_candidates[:3], start=1):
    print(f"\n===== TOP CANDIDATE {rank} =====")
    print(resumes[idx][:1000])   # first part of resume
    print("\n")


===== TOP CANDIDATE 1 =====
Y
FREELANCE UX/UI INTERACTION DESIGNER
Summary
Combined with work experience in 
Architecture
 
to move forward with a new career in 
User-Centered Design
 . The priority is to create great
experiences for users while always keeping in mind 
company needs and goals
 .
Experience
Freelance UX/UI Interaction Designer
 
City
 
, 
State
 
Company Name
 
/
 
Apr 2017
 
to 
Current
LegalChat App DesignÂ 
Created Interactive 
Prototypes
 , Low to High-Fidelity 
Wireframes
 , logic flows, 
visual design
 
and Icon
Provided content strategy, UI design direction, 
User experience design
Freelance UX Researcher
 
City
 
, 
State
 
Company Name
 
/
 
Apr 2017
 
to 
May 2017
UsabilityÂ Testing for Airbrush App
Scheduled and conducted remote 
usability test
 s to observe user behavior on interaction
Discovered 
insights
 
and created 
infographic
 
report for company
UX/UI Designer
 
City
 
, 
State
 
Company Name
 
/
 
Mar 2017
 
to 
Apr 2017
Website Redesign
 
forÂ Bay

In [27]:
jd_text = job_description.lower()

jd_words = jd_text.split()

jd_words[:20]

['looking',
 'for',
 'a',
 'creative',
 'ui/ux',
 'designer',
 'with',
 'experience',
 'in',
 'figma,',
 'wireframing,',
 'prototyping,',
 'user',
 'research,',
 'design',
 'systems,',
 'visual',
 'hierarchy,',
 'and',
 'interaction']

In [28]:
common_words = {"looking", "for", "with", "and", "the", "a", "of", "in"}

required_skills_dynamic = [word for word in jd_words if word not in common_words]

required_skills_dynamic[:20]

['creative',
 'ui/ux',
 'designer',
 'experience',
 'figma,',
 'wireframing,',
 'prototyping,',
 'user',
 'research,',
 'design',
 'systems,',
 'visual',
 'hierarchy,',
 'interaction',
 'design.']

In [29]:
for rank, idx in enumerate(top_candidates[:5], start=1):
    print(f"\n===== Candidate Rank {rank} =====")
    print("Similarity Score:", scores[idx])
    
    missing = find_missing_skills(resumes[idx], required_skills_dynamic)
    
    print("Missing JD Keywords:", missing[:10])
    print("-" * 60)


===== Candidate Rank 1 =====
Similarity Score: 0.34534173212533814
Missing JD Keywords: ['creative', 'figma,', 'wireframing,', 'prototyping,', 'systems,', 'hierarchy,', 'design.']
------------------------------------------------------------

===== Candidate Rank 2 =====
Similarity Score: 0.3001422675025145
Missing JD Keywords: ['ui/ux', 'figma,', 'wireframing,', 'research,', 'systems,', 'hierarchy,', 'interaction']
------------------------------------------------------------

===== Candidate Rank 3 =====
Similarity Score: 0.1682573858733621
Missing JD Keywords: ['ui/ux', 'figma,', 'wireframing,', 'prototyping,', 'systems,', 'visual', 'hierarchy,', 'interaction', 'design.']
------------------------------------------------------------

===== Candidate Rank 4 =====
Similarity Score: 0.15917262539866775
Missing JD Keywords: ['ui/ux', 'experience', 'figma,', 'wireframing,', 'prototyping,', 'user', 'research,', 'systems,', 'hierarchy,', 'interaction']
---------------------------------------