In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy

1. DATA LOADING

In [42]:
data = pd.read_csv('job_title_des.csv')
data.head(1)

Unnamed: 0.1,Unnamed: 0,Job Title,Job Description
0,0,Flutter Developer,We are looking for hire experts flutter develo...


2. DATA CLEANING

In [43]:
def cleanText(text):
  if pd.isna(text) or str(text).strip() == "":
    return ""
  text = ' '.join(str(text).split()).lower()
  return text

for chunk in data:
  if 'Job Description' in data.columns:
    data['Cleaned_description'] = data['Job Description'].apply(cleanText)
    print(data['Cleaned_description'].iloc[0])  # Use iloc for proper indexing
  else:
    print("No 'Job Description' column found in the data")


we are looking for hire experts flutter developer. so you are eligible this post then apply your resume. job types: full-time, part-time salary: ₹20,000.00 - ₹40,000.00 per month benefits: flexible schedule food allowance schedule: day shift supplemental pay: joining bonus overtime pay experience: total work: 1 year (preferred) housing rent subsidy: yes industry: software development work remotely: temporarily due to covid-19
we are looking for hire experts flutter developer. so you are eligible this post then apply your resume. job types: full-time, part-time salary: ₹20,000.00 - ₹40,000.00 per month benefits: flexible schedule food allowance schedule: day shift supplemental pay: joining bonus overtime pay experience: total work: 1 year (preferred) housing rent subsidy: yes industry: software development work remotely: temporarily due to covid-19
we are looking for hire experts flutter developer. so you are eligible this post then apply your resume. job types: full-time, part-time sal

In [44]:
#TfidfVectorizer Initializes
tfidf = TfidfVectorizer(stop_words='english',max_features=100)

#fit on JD
tfidf_matrix = tfidf.fit_transform(data['Cleaned_description'])

#get top keywords per jd
feature_names = tfidf.get_feature_names_out()
top_keywords = []
for i in range(len(data)):
  sorted_indices = tfidf_matrix[i].toarray().argsort()[0][-10:]
  top_keywords.append([feature_names[idx] for idx in sorted_indices])

data['top_keywords'] = top_keywords
data.head(2)


Unnamed: 0.1,Unnamed: 0,Job Title,Job Description,Cleaned_description,top_keywords
0,0,Flutter Developer,We are looking for hire experts flutter develo...,we are looking for hire experts flutter develo...,"[year, developer, salary, looking, day, indust..."
1,1,Django Developer,PYTHON/DJANGO (Developer/Lead) - Job Code(PDJ ...,python/django (developer/lead) - job code(pdj ...,"[communication, environment, high, testing, sq..."


In [45]:
pip install spacy pdfplumber python-multipart



3. RESUME PARSING

In [46]:
import pdfplumber
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load('en_core_web_sm')

skills_list = ["Python", "Machine Learning","Pytorch","AWS","Power Bi", "SQL"]

matcher = PhraseMatcher(nlp.vocab,attr = 'LOWER')
patterns = [nlp(text) for text in skills_list]
matcher.add("Skills", patterns)

def extract_text_from_pdf(pdf_path):
  with pdfplumber.open(pdf_path) as pdf:
    text = ' '.join(page.extract_text()for page in pdf.pages if page.extract_text())
  return text


def extract_skills(resume_text):
  doc = nlp(resume_text)
  skills = set()
  for match_id,start,end in matcher(doc):
    skills.add(doc[start:end].text)
  return list(skills)

#Example usage
pdf_path = "Jatin'sResume_Data_Science.pdf"
resume_text = extract_text_from_pdf(pdf_path)
skills = extract_skills(resume_text)
print("Resume Skills : ", skills)

Resume Skills :  ['Python', 'machine learning', 'PyTorch', 'Machine Learning', 'SQL']


3.2 SIMILARITY MATCHING

In [47]:
from sklearn.metrics.pairwise import cosine_similarity

def calculate_matchscore(jd_text,resume_text):
  jd_vector = tfidf.transform([jd_text])
  resume_vector = tfidf.transform([resume_text])
  return cosine_similarity(jd_vector,resume_vector)[0][0]

jd_sample = data['Cleaned_description'].iloc[2]
match_score = calculate_matchscore(jd_sample,resume_text)
print(f"Match Score : {match_score:.2%}")


Match Score : 76.85%


4.1 IDENTIFING MISSING KEYWORDS

In [48]:
# Text Vectorization
def get_missing_keywords(jd_text,resume_text,tfidf_vectorizor):
  jd_vector = tfidf_vectorizor.transform([jd_text])
  resume_vector = tfidf_vectorizor.transform([resume_text])

# Get TF-IDF scores for JD keywords
  jd_scores = zip(tfidf_vectorizor.get_feature_names_out(),jd_vector.toarray()[0])

#sort them in high to low
  sorted_jd = sorted(jd_scores,key=lambda x:x[1],reverse=True)

# Find top JD keywords missing in resume
  missing_keywords = []
  for word,score in sorted_jd[:20]:
    if word not in resume_text.lower():
      missing_keywords.append(word)
  return missing_keywords[:5]
missing = get_missing_keywords(jd_sample,resume_text,tfidf)
print(missing)

['development', 'software', 'ability', 'responsibilities', 'team']


Step 4.2: Generate Actionable Suggestions


In [49]:
def generate_suggestions(missing_keywords,resume_text):
  suggestions = []
  for keyword in missing_keywords:
    if keyword == "pandas":
          suggestions.append(f"Add 'Proficient in {keyword} for data manipulation' to Skills section")
    elif keyword == "machine learning":
          suggestions.append(f"Highlight ML projects: 'Built {keyword} models using Scikit-learn'")
    else:
          suggestions.append(f"Consider adding '{keyword}' to your resume")
    return suggestions

# Example
suggestions = generate_suggestions(missing, resume_text)
print("Suggestions:\n- " + "\n- ".join(suggestions))


Suggestions:
- Consider adding 'development' to your resume
