In [1]:
import os
import PyPDF2
from tqdm import tqdm
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

ModuleNotFoundError: No module named 'PyPDF2'

In [None]:
# --- Extract Resume Texts ---
RESUME_FOLDER = "data/data"


In [None]:
def extract_text_from_pdf(file_path):
    text = ""
    try:
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                text += page.extract_text() or ""
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
    return text


In [None]:
resume_texts = []
resume_files = []

all_files = [os.path.join(root, file)
             for root, dirs, files in os.walk(RESUME_FOLDER)
             for file in files if file.lower().endswith(".pdf")]

for file_path in tqdm(all_files, desc="Extracting PDFs"):
    text = extract_text_from_pdf(file_path)
    if text.strip():
        resume_texts.append(text)
        resume_files.append(file_path)

df_resumes = pd.DataFrame({"file": resume_files, "text": resume_texts})
print(f"Loaded {len(df_resumes)} resumes")

Extracting PDFs: 100%|██████████| 2484/2484 [06:07<00:00,  6.76it/s]

Loaded 2483 resumes





In [None]:
# --- Sample Job Description ---
job_description = """
Looking for a Data Scientist with Python, Machine Learning,
Deep Learning, and NLP experience.
"""

In [None]:
# --- TF-IDF Vectorization ---
vectorizer = TfidfVectorizer(stop_words='english')
all_texts = [job_description] + df_resumes['text'].tolist()
tfidf_matrix = vectorizer.fit_transform(all_texts)


In [None]:
# --- Compute similarity scores ---
similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
df_resumes['score'] = similarity_scores

In [None]:
df_resumes

Unnamed: 0,file,text,score
0,data/data/AGRICULTURE/37201447.pdf,ADULT EDUCATION INSTRUCTOR\nSummary\nSeasoned ...,0.060369
1,data/data/AGRICULTURE/12674256.pdf,FINANCIAL SALES CONSULTANT\nProfessional Summa...,0.000682
2,data/data/AGRICULTURE/29968330.pdf,EXTENSION METHODOLOGIST\nProfile\nSelf-motivat...,0.006205
3,data/data/AGRICULTURE/81042872.pdf,RESEARCH SCIENTIST\nSummary\nHighly motivated ...,0.064153
4,data/data/AGRICULTURE/20006992.pdf,"FRONT DESK CLERK (FEE BASIS, JOHN D DINGELL VA...",0.013501
...,...,...,...
2478,data/data/AUTOMOBILE/11257723.pdf,GENERAL LIABILITY CLAIM REPRESENTATIVE\nSummar...,0.001381
2479,data/data/AUTOMOBILE/11797122.pdf,AUTOMOBILE TRANSPORTER\nProfessional Summary\n...,0.004288
2480,data/data/AUTOMOBILE/22946204.pdf,"Highlights\nProg. Languages: \nC (5+ yrs), Pyt...",0.090598
2481,data/data/AUTOMOBILE/18932512.pdf,CUSTOMER RELATIONS SPECIALIST\nSummary\nTo obt...,0.005227


In [None]:
# --- Optional: Train a model to predict scores ---
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix[1:], similarity_scores, test_size=0.2, random_state=42)
regressor = RandomForestRegressor(n_estimators=100, random_state=42)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred):.4f}")

Mean Squared Error: 0.0000


In [None]:
# --- Rank resumes ---
ranked_resumes = df_resumes.sort_values(by='score', ascending=False)
print("\nTop 5 resumes:")
for idx, row in ranked_resumes.head(5).iterrows():
    print(f"{row['file']} — Score: {row['score']:.2f}")


Top 5 resumes:
data/data/ARTS/12777487.pdf — Score: 0.15
data/data/BANKING/34953092.pdf — Score: 0.13
data/data/DESIGNER/18835363.pdf — Score: 0.13
data/data/ENGINEERING/12011623.pdf — Score: 0.12
data/data/TEACHER/36206485.pdf — Score: 0.11


In [None]:
def calculate_teacher_score(resume_text):
    # --- Skills match ---
    skill_keywords = ["teaching", "classroom management", "lesson planning", "curriculum", "education", "pedagogy"]
    skills_score = sum(1 for skill in skill_keywords if skill.lower() in resume_text.lower()) / len(skill_keywords)
    
    # --- Experience match ---
    import re
    years = re.findall(r'(\d+)\s+years?', resume_text.lower())
    exp_score = min(sum(int(y) for y in years) / 20, 1)  # normalize to 0-1, max 20 years
    
    # --- Education match ---
    edu_keywords = ["bachelor", "master", "phd", "education degree", "teaching certification"]
    edu_score = sum(1 for edu in edu_keywords if edu in resume_text.lower()) / len(edu_keywords)
    
    # --- Weighted final score ---
    final_score = 0.4*skills_score + 0.4*exp_score + 0.2*edu_score
    return final_score

# Apply to your resumes
df_resumes['teacher_score'] = df_resumes['text'].apply(calculate_teacher_score)
df_resumes = df_resumes.sort_values(by='teacher_score', ascending=False)

print("\nTop 5 Teacher resumes:")
for idx, row in df_resumes.head(5).iterrows():
    print(f"{row['file']} — Teacher Score: {row['teacher_score']:.2f}")



Top 5 Teacher resumes:
data/data/TEACHER/21773106.pdf — Teacher Score: 0.79
data/data/AGRICULTURE/37201447.pdf — Teacher Score: 0.77
data/data/ARTS/37220856.pdf — Teacher Score: 0.77
data/data/AGRICULTURE/29897742.pdf — Teacher Score: 0.77
data/data/ARTS/12386670.pdf — Teacher Score: 0.75
