In [3]:
#project
# Step 1: Install packages & download spaCy model
!pip install -q pymupdf spacy scikit-learn pandas fpdf
!python -m spacy download en_core_web_sm

# Step 2: Import libraries
import fitz  # PyMuPDF
import spacy
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fpdf import FPDF

# Load the small English spaCy pipeline (downloaded in Step 1) once at module
# level; preprocess_text() below reuses this shared `nlp` object on every call.
nlp = spacy.load("en_core_web_sm")

# Step 3: Helper functions

def create_pdf(text, filename):
    """Render plain text into a one-font PDF file.

    Every newline-separated line of ``text`` is written as its own paragraph
    in 12 pt Arial. Lines wider than the printable area are wrapped onto
    additional rows instead of running past the right margin.

    Args:
        text: The text to render. Characters that cannot be encoded as
            Latin-1 are replaced with ``?`` before being written.
        filename: Path of the PDF file to write.
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    for line in text.split('\n'):
        # The built-in Arial font is Latin-1 only; an unencodable character
        # would otherwise raise UnicodeEncodeError when the file is written.
        printable = line.encode("latin-1", "replace").decode("latin-1")
        # Start every paragraph at the left margin regardless of where the
        # previous multi_cell left the cursor.
        pdf.set_x(pdf.l_margin)
        # multi_cell wraps long lines; cell() would let them overflow the page.
        pdf.multi_cell(0, 10, printable, align='L')
    pdf.output(filename)

def extract_text_from_pdf(file_path):
    """Return the text of all pages of a PDF, joined by single spaces.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        One string containing each page's ``get_text()`` output, in page
        order, separated by a space.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(file_path) as doc:
        return " ".join(page.get_text() for page in doc)

def preprocess_text(text):
    """Normalise raw text into a space-separated string of lemmas.

    The text is lower-cased and run through the shared ``nlp`` pipeline; only
    tokens that are purely alphabetic and are not stop words contribute their
    lemma to the result.
    """
    lemmas = []
    for tok in nlp(text.lower()):
        # Skip punctuation, numbers and other non-alphabetic tokens, as well
        # as stop words.
        if tok.is_alpha and not tok.is_stop:
            lemmas.append(tok.lemma_)
    return " ".join(lemmas)

# Step 4: Create sample PDFs for resumes and job description

resume1_text = """John Doe
Experienced data scientist with expertise in Python, machine learning, and data analysis."""

resume2_text = """Jane Smith
Software engineer skilled in Java, cloud computing, and DevOps tools."""

job_description_text = """We are looking for a data scientist with strong Python skills and experience in machine learning."""

# Pair every output file with the text it should contain, then write each
# pair to disk through create_pdf().
sample_documents = (
    ("resume1.pdf", resume1_text),
    ("resume2.pdf", resume2_text),
    ("job_description.pdf", job_description_text),
)
for pdf_name, pdf_body in sample_documents:
    create_pdf(pdf_body, pdf_name)

print("Sample PDFs created: resume1.pdf, resume2.pdf, job_description.pdf")

# Step 5: Extract, preprocess texts

resume_files = ["resume1.pdf", "resume2.pdf"]
job_desc_file = "job_description.pdf"

# Candidates are identified by the file name of their resume.
candidate_names = list(resume_files)

# One cleaned text per resume, in the same order as resume_files.
resume_texts = [
    preprocess_text(extract_text_from_pdf(resume_path))
    for resume_path in resume_files
]

# The job description goes through exactly the same extract -> clean steps.
job_raw_text = extract_text_from_pdf(job_desc_file)
job_text = preprocess_text(job_raw_text)

# Step 6: Vectorize and rank resumes

# Fit a single TF-IDF vocabulary over every resume plus the job description,
# which is deliberately placed last in the corpus.
texts = [*resume_texts, job_text]
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(texts)

# Rows [0, resume_count) are the resumes; row resume_count is the job text.
resume_count = len(resume_texts)
resume_vectors = vectors[:resume_count]
job_vector = vectors[resume_count]

# cosine_similarity yields one column (the job description) with one row per
# resume; flatten it into a 1-D array of scores.
scores = cosine_similarity(resume_vectors, job_vector).reshape(-1)

# Step 7: Create ranking dataframe and save CSV

# Build the table and order it best match first in one chained expression.
ranking_columns = {'Candidate': candidate_names, 'Score': scores}
df = pd.DataFrame(ranking_columns).sort_values(by='Score', ascending=False)

# Persist the ranking without the index column.
df.to_csv('report.csv', index=False)

print("Resume Ranking Results:")
display(df)

# Step 8: Download the CSV report
# google.colab only exists inside Colab. Guard the import so the notebook
# still runs to completion in a local Jupyter kernel, where report.csv has
# already been written to disk by Step 7.
try:
    from google.colab import files
except ImportError:
    print("Not running in Google Colab; report.csv was saved to the working directory.")
else:
    # In Colab, trigger a browser download of the generated report.
    files.download('report.csv')


  Preparing metadata (setup.py) ... [?25l[?25hdone
[33m  DEPRECATION: Building 'fpdf' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'fpdf'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m  Building wheel for fpdf (setup.py) ... [?25l[?25hdone
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m99.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[

Unnamed: 0,Candidate,Score
0,resume1.pdf,0.524608
1,resume2.pdf,0.0


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

