In [1]:
!pip uninstall -y fitz
!pip install --upgrade pymupdf



In [16]:
!pip install pdfplumber
import pdfplumber



In [2]:
import fitz  # from PyMuPDF
from pdf2image import convert_from_path
import pytesseract


In [3]:
!pip install crewai crewai-tools



In [5]:
!pip install tools



In [17]:
import fitz
import pandas as pd
import re
import uuid
import os
from crewai import Agent, Task

In [18]:
def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                text += page.extract_text() or ""
        return text if text.strip() else "No text found"
    except Exception as e:
        return f" Error reading {pdf_path}: {e}"


In [19]:
SKILLS_DB = ["Python", "TensorFlow", "PyTorch", "SQL", "Excel", "Machine Learning",
             "Deep Learning", "Data Science", "Java", "C++", "JavaScript", "HTML", "CSS",
             "AWS", "Docker", "Kubernetes"]

EDU_PATTERNS = r"(B\.?Tech|M\.?Tech|B\.?Sc|M\.?Sc|MBA|Ph\.?D|Bachelor|Master|Diploma)"

KEYWORDS = ["AI", "ML", "NLP", "Computer Vision", "Blockchain", "Web Development", "Cloud"]

CATEGORY_MAP = {
    "AI/ML": ["Python", "TensorFlow", "PyTorch", "Machine Learning", "Deep Learning", "NLP"],
    "Data Science": ["Python", "SQL", "Excel", "Data Science"],
    "Web Development": ["JavaScript", "HTML", "CSS"],
    "Cloud/DevOps": ["AWS", "Docker", "Kubernetes"]
}

In [20]:
def extract_skills(text):
    return [s for s in SKILLS_DB if re.search(rf"\b{s}\b", text, re.IGNORECASE)]

def extract_education(text):
    return re.findall(EDU_PATTERNS, text, re.IGNORECASE)

def extract_keywords(text):
    return [kw for kw in KEYWORDS if re.search(rf"\b{kw}\b", text, re.IGNORECASE)]

def map_categories(signals):
    cats = []
    for cat, kws in CATEGORY_MAP.items():
        if any(s in signals for s in kws):
            cats.append(cat)
    return cats

In [21]:
def generate_integrity_token():
    return str(uuid.uuid4())

In [22]:
signal_agent = Agent(
    role="Signal Discovery Agent",
    goal="Extract skills, education, and keywords from resumes (PDF) and map them to categories",
    backstory="An expert in parsing resumes and discovering structured signals",
)

In [23]:
def signal_discovery_from_pdfs(pdf_folder="/content"):
    results = []
    for pdf_file in os.listdir(pdf_folder):
        if pdf_file.lower().endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder, pdf_file)
            print(f"ðŸ“„ Processing: {pdf_file}")

            # Extract text with pdfplumber
            text = extract_text_from_pdf(pdf_path)
            print("Extracted text preview:", text[:300])

            skills = extract_skills(text)
            edu = extract_education(text)
            keywords = extract_keywords(text)
            categories = map_categories(skills + keywords)

            results.append({
                "File": pdf_file,
                "Skills": skills,
                "Education": edu,
                "Keywords": keywords,
                "Categories": categories,
                "Integrity_Token": generate_integrity_token()
            })
    return pd.DataFrame(results)


In [24]:
signal_task = Task(
    description="Run signal discovery on resume PDFs in the resumes folder",
    agent=signal_agent,
    expected_output="Structured table of skills, education, keywords, mapped categories, with integrity token"
)

In [25]:
df_output = signal_discovery_from_pdfs("/content")
print(df_output)

df_output.to_csv("processed_resumes.csv", index=False)


ðŸ“„ Processing: Kabir_resume_3.0.pdf
Extracted text preview: KABIR KOHLI
New Delhi,India
(cid:211) +91-8587883802  kabirkohliyuvi@gmail.com  Linkedin (cid:135) Github Ë† Google SkillBoost Ë† Codechef
EDUCATION
University School of Automation and Robotics, GGSIP University November 2021 â€“ July 2025
Bachelorâ€™s of Technology in Automation and Robotics - 8.1(3rd Ye
ðŸ“„ Processing: Kabir_resume_AI.pdf
Extracted text preview: KABIR KOHLI
New Delhi, India
(cid:131) +91-8587883802 # kabirkohliyuvi@gmail.com Linkedin Â§ Github â€¡ Google SkillBoost â€” Codechef
EDUCATION
University School of Automation and Robotics, GGSIP University November 2021 â€“ July 2026
Bachelor of Technology in Automation and Robotics New Delhi, India
TECH
ðŸ“„ Processing: Kabir_resume_2.0.pdf
Extracted text preview: KABIR KOHLI
New Delhi,India
(cid:211) +91-8587883802  kabirkohliyuvi@gmail.com  Linkedin (cid:135) Github Ë† Google SkillBoost Ë† Codechef
EDUCATION
University School of Automation and Robotics, GGSIP