In [5]:
import pandas as pd
import numpy as np
import re
import docx2txt
import PyPDF2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import os

# Function to extract text from resumes
def extract_text(file_path):
    """Return the plain text of a resume file.

    The file type is chosen from the extension, compared case-insensitively
    so that ``CV.PDF`` is handled the same way as ``cv.pdf``:

    * ``.pdf``  -- every page is read through ``PyPDF2.PdfReader`` and the
      page texts are concatenated, each one followed by a single space.
    * ``.docx`` -- delegated to ``docx2txt.process``.
    * anything else -- an empty string.

    Parameters
    ----------
    file_path : str or os.PathLike
        Location of the resume on disk.

    Returns
    -------
    str
        The extracted text for ``.pdf`` files, whatever
        ``docx2txt.process`` returns for ``.docx`` files, and ``""`` for
        any other extension.

    Raises
    ------
    OSError
        Propagated from ``open`` when a ``.pdf`` path cannot be opened
        (for example ``FileNotFoundError``).
    """
    # Only the lower-cased copy is used for dispatch; the path itself is
    # passed on unchanged.
    lowered = str(file_path).lower()

    if lowered.endswith('.pdf'):
        with open(file_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            # `or ""` keeps a page that yields no text (None or empty) from
            # breaking the concatenation; the trailing space per page is
            # kept so the output layout matches the per-page "+ ' '" form.
            return "".join((page.extract_text() or "") + " " for page in reader.pages)

    if lowered.endswith('.docx'):
        return docx2txt.process(file_path)

    return ""

# Load resumes from a folder
resume_folder = "/content/"  # Change to your folder
resumes = []  # extracted text, one entry per accepted file
names = []    # file names, kept index-aligned with `resumes`

# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# `resumes`/`names` identical from run to run, so anything computed from that
# order later (e.g. a seeded train/test split) is reproducible.
for file in sorted(os.listdir(resume_folder)):
    if file.endswith((".pdf", ".docx")):
        resumes.append(extract_text(os.path.join(resume_folder, file)))
        names.append(file)

# Job posting that every resume is scored against
job_description = "We are looking for a Data Scientist with experience in Python, Machine Learning, and NLP."

# TF-IDF features: the vocabulary and weights are fitted on the resumes only,
# then the job description is mapped into that same feature space.
vectorizer = TfidfVectorizer(stop_words="english")
resume_tfidf = vectorizer.fit_transform(raw_documents=resumes)
job_tfidf = vectorizer.transform(raw_documents=[job_description])

# Cosine similarity between the job description and each resume;
# row 0 is the single job-description row, giving one score per resume.
similarity_scores = cosine_similarity(job_tfidf, resume_tfidf)[0]

# Pair every file name with its score, then order best match first.
ranked_resumes = list(zip(names, similarity_scores))
ranked_resumes.sort(key=lambda pair: pair[1], reverse=True)

# Print the ranking, numbered from 1
print("Ranked Resumes based on Job Description Match:\n")
for rank, (name, score) in enumerate(ranked_resumes, start=1):
    print(f"{rank}. {name} - Similarity Score: {score:.2f}")

# Optional: Train a Machine Learning model (Example: RandomForest)
# Labels are taken from `similarity_scores`, whose order matches the rows of
# `resume_tfidf`. Deriving them from the score-sorted `ranked_resumes` list
# would attach each feature row to a different resume's label.
labels = (similarity_scores > 0.5).astype(int)  # 1 if similarity > 0.5, else 0
# NOTE(review): these labels are thresholded similarity scores, not
# independent judgments, and with very few resumes the 20% test split can be
# a single sample -- treat the reported accuracy as a smoke test only.
X_train, X_test, y_train, y_test = train_test_split(resume_tfidf.toarray(), labels, test_size=0.2, random_state=42)
model = RandomForestClassifier(random_state=42)  # seeded so repeated runs build the same forest
model.fit(X_train, y_train)
accuracy = model.score(X_test, y_test)
print(f"\nResume Ranking Model Accuracy: {accuracy:.2f}")


Ranked Resumes based on Job Description Match:

1. resume_1.pdf - Similarity Score: 0.74
2. resume_2.pdf - Similarity Score: 0.05

Resume Ranking Model Accuracy: 0.00


In [4]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
