In [2]:
#!/usr/bin/env python3
"""
job_parser.py

A modular, rule-based job description parser that extracts:
- title
- role_summary
- responsibilities (list)
- technical_requirements (list)
- education
- experience (min_years)
- employment_type
- location
- salary
- raw_text

Usage:
    python job_parser.py

Notes:
 - This implementation prioritizes deterministic rules and heuristics.
 - It's intentionally self-contained and works without heavy ML dependencies.
 - Hooks are left as TODOs where you can plug spaCy / transformer / LLM enhancements later.
"""

import re
import json
import math

# ---------- CONFIG / Vocabulary ----------
# You can expand these lists to improve recall
TECH_VOCAB = [
    "python","java","javascript","typescript","c++","c#","golang","go","ruby","rust",
    "nodejs","node","node.js","django","flask","fastapi","spring","express","react",
    "angular","vue","aws","azure","gcp","docker","kubernetes","postgresql","mysql",
    "mongodb","redis","html","css","graphql","rest","grpc","git","sql","tensorflow",
    "pytorch","spark","hadoop","elasticsearch","airflow","rabbitmq","kafka","ci/cd",
    "terraform","ansible","linux","bash"
]

EDUCATION_KEYWORDS = [
    "bachelor", "b.sc", "bsc", "ba", "bs", "b.eng", "beng", "master", "m.sc", "msc", "ms",
    "mba", "phd", "doctorate", "associate", "high school", "secondary"
]

EMPLOYMENT_TYPES = [
    "full[- ]time", "part[- ]time", "contract", "temporary", "internship", "intern", "freelance",
    "remote", "onsite", "hybrid"
]

RESPONSIBILITY_HEADINGS = [
    "responsibilities", "what you'll do", "what you will do", "you will", "you'll be responsible",
    "role and responsibilities", "what we need from you", "what we're looking for"
]

QUALIFICATION_HEADINGS = [
    "requirements", "qualifications", "what we are looking for", "what you'll need", "must have",
    "what you bring", "skills and qualifications", "must have"
]

SALARY_CURRENCY = r'(\$|usd|£|eur|€)\s?[\d,]+(?:\.\d+)?'
SALARY_RANGE = r'((?:' + SALARY_CURRENCY + r')(?:\s*(?:-|to)\s*(?:' + SALARY_CURRENCY + r'))?)'
YEARS_RE = re.compile(r'(\d+)(?:\+|\s*\+)?\s*(?:years|yrs|year)', flags=re.I)
SALARY_RE = re.compile(SALARY_RANGE, flags=re.I)
EDUCATION_RE = re.compile(r'|'.join(re.escape(k) for k in EDUCATION_KEYWORDS), flags=re.I)


In [3]:

def split_into_blocks(text: str):
    """
    Split on double newlines and preserve bullets.
    Return list of blocks; each block is dict {heading: optional, text: ...}
    Heuristic: If a block's first line looks like a heading, treat it separately.
    """
    blocks = []
    # Normalize bullets into new lines (so bullets become their own lines)
    text = re.sub(r'•|\u2022', '-', text)
    # Split paragraphs by two or more newlines
    paragraphs = [p.strip() for p in re.split(r'\n{2,}', text) if p.strip()]
    for p in paragraphs:
        lines = [l.strip() for l in p.splitlines() if l.strip()]
        heading = None
        body = p
        if lines:
            first = lines[0]
            # heuristics for heading
            if (first.endswith(":") or first.isupper() or len(first.split()) <= 6 and
                any(k in first.lower() for k in ["responsib", "require", "qualification", "what", "role"])):
                heading = first.rstrip(":").strip()
                body = "\n".join(lines[1:]).strip()
        blocks.append({"heading": heading, "text": body or ""})
    return blocks

def extract_bullets(text: str):
    """Return bullet lines from a text block."""
    lines = [l.strip() for l in text.splitlines() if l.strip()]
    bullets = []
    for l in lines:
        if re.match(r'^[-*\u2022]\s+', l) or l.startswith(("•", "-", "*")) or l.startswith(("•",)):
            bullets.append(re.sub(r'^[-*\u2022]\s+', '', l).strip())
        else:
            # treat full sentence lines that start with a verb as a bullet candidate
            if re.match(r'^(Design|Build|Implement|Develop|Manage|Own|Lead|Collaborate|Work)\b', l, flags=re.I):
                bullets.append(l)
    return bullets

# ---------- FIELD EXTRACTORS ----------

def find_section_by_heading(blocks, keywords):
    """Return combined text from blocks whose heading contains any of the keywords."""
    matched = []
    for b in blocks:
        if b.get("heading"):
            h = b["heading"].lower()
            if any(k in h for k in keywords):
                matched.append(b["text"] or "")
    return "\n\n".join(matched).strip()

def find_section_by_keyword_presence(blocks, keywords):
    """Return text from blocks where the content contains heading-keyword anywhere (fallback)."""
    matched = []
    for b in blocks:
        if b["text"] and any(k in b["text"].lower() for k in keywords):
            matched.append(b["text"])
    return "\n\n".join(matched).strip()

def extract_title(text: str, blocks):
    """Candidate title extraction: heading, first line, or meta-like patterns."""
    # If first block has heading and looks like a title
    if blocks:
        first_block = blocks[0]
        if first_block.get("heading") and len(first_block["heading"].split()) <= 8:
            return first_block["heading"]
    # try first non-empty line as title if it's short
    first_line = next((l for l in text.splitlines() if l.strip()), "")
    if first_line and len(first_line.split()) <= 12 and len(first_line) < 120:
        # filter out lines that look like company names or "Job posted x days ago"
        if not re.search(r'posted|ago|apply|“|”', first_line, flags=re.I):
            return first_line.strip()
    # fallback: search for "Job title - Company" patterns
    m = re.search(r'^(?P<title>.+?)\s*[-|@]\s*(?P<company>[A-Z][A-Za-z0-9 &,.]+)$', first_line)
    if m:
        return m.group("title").strip()
    return None

def extract_role_summary(blocks):
    """
    Role summary is often the short paragraph near the top that doesn't have a heading.
    We pick the first block with moderate length and no heading (or a heading like 'About the role')
    """
    for b in blocks[:3]:
        if not b.get("heading") and 40 < len(b.get("text","")) < 600:
            return b["text"].strip()
    # check blocks with heading 'about' or 'about the role'
    for b in blocks:
        if b.get("heading") and "about" in b["heading"].lower():
            return b["text"].strip()
    return None

def extract_responsibilities(blocks):
    # 1. Heading matches
    resp_text = find_section_by_heading(blocks, RESPONSIBILITY_HEADINGS)
    if not resp_text:
        # 2. fallback: qualification-like headings but look for bullet verbs
        resp_text = find_section_by_heading(blocks, ["what you'll", "you will", "you'll be"])
    if not resp_text:
        # 3. fallback: look for paragraphs containing many verbs (heuristic)
        candidate = []
        for b in blocks:
            txt = b.get("text","")
            if txt:
                # count lines starting with verbs
                bullets = extract_bullets(txt)
                if len(bullets) >= 2:
                    candidate.append(txt)
        resp_text = "\n\n".join(candidate)
    bullets = []
    if resp_text:
        bullets = extract_bullets(resp_text)
        # if still empty, split by sentences
        if not bullets:
            bullets = [s.strip() for s in re.split(r'\.\s+', resp_text) if len(s.strip())>10]
    # unique-preserve-order
    seen = set()
    out = []
    for b in bullets:
        k = b.lower()
        if k not in seen:
            seen.add(k)
            out.append(b.strip().rstrip('.'))
    return out

def extract_technical_requirements(text, blocks):
    """
    Match tech vocab tokens; return sorted by frequency
    """
    text_lower = text.lower()
    found = {}
    # search for tech vocab words as whole words
    for tech in TECH_VOCAB:
        pattern = r'\b' + re.escape(tech.lower()) + r'\b'
        if re.search(pattern, text_lower):
            found[tech] = found.get(tech, 0) + len(re.findall(pattern, text_lower))
    # also look in 'skills' or 'tech' sections
    skills_section = find_section_by_heading(blocks, ["skills", "tech", "technology", "technical requirements", "technical skills"])
    if skills_section:
        for tech in TECH_VOCAB:
            if re.search(r'\b' + re.escape(tech.lower()) + r'\b', skills_section.lower()):
                found[tech] = found.get(tech, 0) + len(re.findall(r'\b' + re.escape(tech.lower()) + r'\b', skills_section.lower()))
    # sort by frequency
    ordered = sorted(found.items(), key=lambda kv: (-kv[1], kv[0]))
    return [k for k,v in ordered]

def extract_education(text):
    m = EDUCATION_RE.search(text or "")
    if not m:
        return None
    found = m.group(0)
    # simple normalization
    found_norm = found.lower().replace(".", "").strip()
    if "bachelor" in found_norm or found_norm in ["bsc", "bsc"]:
        degree = "Bachelor's"
    elif "master" in found_norm or found_norm in ["msc", "ms"]:
        degree = "Master's"
    elif "mba" in found_norm:
        degree = "MBA"
    elif "phd" in found_norm or "doctor" in found_norm:
        degree = "PhD"
    else:
        degree = found.capitalize()
    # check for 'required' or 'preferred'
    required = bool(re.search(r'(required|must have|must possess|required:)', text, flags=re.I))
    return {"degree": degree, "raw": found, "required": required}

def extract_experience(text):
    # look for first explicit years mention
    m = YEARS_RE.search(text or "")
    if m:
        try:
            years = int(m.group(1))
            return {"min_years": years, "raw": m.group(0)}
        except Exception:
            pass
    # fallback: look for 'senior/mid/junior' heuristics
    if re.search(r'\bsenior\b|\blead\b', text, flags=re.I):
        return {"min_years": 5, "raw": "senior"}
    if re.search(r'\bmid\b|\bmid[- ]senior\b', text, flags=re.I):
        return {"min_years": 3, "raw": "mid"}
    if re.search(r'\bjunior\b|\bentry[- ]level\b|\bentry level\b', text, flags=re.I):
        return {"min_years": 0, "raw": "junior"}
    return None

def extract_employment_type(text):
    for pattern in EMPLOYMENT_TYPES:
        if re.search(pattern, text, flags=re.I):
            # normalize common ones
            raw = re.search(pattern, text, flags=re.I).group(0)
            if re.search(r'full', raw, flags=re.I):
                return "Full-time"
            if re.search(r'part', raw, flags=re.I):
                return "Part-time"
            if re.search(r'contract', raw, flags=re.I):
                return "Contract"
            if re.search(r'intern', raw, flags=re.I):
                return "Internship"
            if re.search(r'freelance', raw, flags=re.I):
                return "Freelance"
            if re.search(r'remote', raw, flags=re.I):
                return "Remote"
            return raw
    return None

def extract_location(text):
    # quick heuristics: look for "Remote", "Hybrid", or city, state patterns like "San Francisco, CA"
    if re.search(r'\bremote\b', text, flags=re.I):
        return "Remote"
    # match "City, ST" e.g., "New York, NY"
    m = re.search(r'([A-Z][a-zA-Z]+(?:[ \-][A-Z][a-zA-Z]+)*),\s*([A-Z]{2})\b', text)
    if m:
        return f"{m.group(1)}, {m.group(2)}"
    # look for "United States" or country names
    if re.search(r'\b(United States|USA|Canada|UK|United Kingdom)\b', text):
        return re.search(r'\b(United States|USA|Canada|UK|United Kingdom)\b', text).group(0)
    return None

def extract_salary(text):
    m = SALARY_RE.search(text or "")
    if not m:
        # look for patterns like "$120k - $150k" or "120k-150k"
        m2 = re.search(r'\$\s?([\d,]+)k', text or "", flags=re.I)
        if m2:
            try:
                val = int(m2.group(1).replace(',', '')) * 1000
                return {"min": val, "max": val, "currency": "$", "raw": m2.group(0)}
            except:
                pass
        return None
    raw = m.group(1)
    # attempt to extract numeric min/max and currency
    nums = re.findall(r'(\$|usd|£|€)?\s?([\d,]+(?:\.\d+)?)', raw, flags=re.I)
    values = []
    currency = None
    for cur, num in nums:
        if cur:
            currency = cur.strip()
        try:
            v = float(num.replace(',', ''))
            values.append(v)
        except:
            pass
    if not values:
        return {"raw": raw}
    if len(values) == 1:
        v = values[0]
        # guess period text (annum/hour)
        period = "year" if re.search(r'per year|/year|yr|year|annum', text, flags=re.I) else None
        return {"min": values[0], "max": values[0], "currency": currency or "$", "period": period, "raw": raw}
    if len(values) >= 2:
        mn = min(values)
        mx = max(values)
        return {"min": mn, "max": mx, "currency": currency or "$", "raw": raw}
    return {"raw": raw}

# ---------- MAIN PARSING FUNCTION ----------

def parse_job_description(raw_input: str, source_name: str = "unknown"):
    """
    Main pipeline:
    1. Clean HTML
    2. Split into blocks
    3. Extract fields using rules / heuristics
    4. Return structured dict
    """
    text = raw_input
    blocks = split_into_blocks(text)
    title = extract_title(text, blocks)
    role_summary = extract_role_summary(blocks)
    responsibilities = extract_responsibilities(blocks)
    technical_requirements = extract_technical_requirements(text, blocks)
    education = extract_education(text)
    experience = extract_experience(text)
    employment_type = extract_employment_type(text)
    location = extract_location(text)
    salary = extract_salary(text)

    # canonical JSON
    out = {
        "source": source_name,
        "title": title,
        "role_summary": role_summary,
        "responsibilities": responsibilities,
        "technical_requirements": technical_requirements,
        "education": education,
        "experience": experience,
        "employment_type": employment_type,
        "location": location,
        "salary": salary,
        "raw_text": text
    }
    return out


In [4]:

# ---------- DEMO & TESTS ----------

EXAMPLE_LINKEDIN = """
Senior Backend Engineer

We are building an industry-first analytics platform that powers
real-time decisions for users. As a Senior Backend Engineer,
you'll design and implement scalable APIs that serve millions of requests.

Responsibilities:
- Design and implement microservices in Python.
- Build high-throughput data pipelines using Kafka and Spark.
- Write tests and participate in code reviews.
- Work closely with product and data science teams.

Qualifications:
- Bachelor's degree in Computer Science or equivalent experience.
- 5+ years of experience building backend systems.
- Hands-on with Python, Django, PostgreSQL, Docker, Kubernetes.
- Experience with AWS (EC2, S3, Lambda) is a plus.

Location: San Francisco, CA (Hybrid)
Employment type: Full-time
Salary: $150,000 - $180,000 a year
"""

EXAMPLE_UPWORK = """
Project Title: Build a REST API with authentication

Hi — I need a small REST API built using FastAPI or Flask. The API should
allow users to sign up, login, and CRUD operations for a "tasks" resource.

Requirements:
* Python (FastAPI preferred)
* PostgreSQL
* JWT-based authentication
* Docker setup for deployment

Budget: $1,200 fixed price
Duration: 2-3 weeks
This is a remote freelance project.
"""

def demo():
    print("=== Demo: LinkedIn-like job description ===")
    res1 = parse_job_description(EXAMPLE_LINKEDIN, source_name="linkedin_example")
    print(json.dumps(res1, indent=2))

    print("\n=== Demo: Upwork-like job description ===")
    res2 = parse_job_description(EXAMPLE_UPWORK, source_name="upwork_example")
    print(json.dumps(res2, indent=2))

if __name__ == "__main__":
    demo()


=== Demo: LinkedIn-like job description ===
{
  "source": "linkedin_example",
  "title": "Senior Backend Engineer",
  "role_summary": "We are building an industry-first analytics platform that powers\nreal-time decisions for users. As a Senior Backend Engineer,\nyou'll design and implement scalable APIs that serve millions of requests.",
  "responsibilities": [
    "Design and implement microservices in Python",
    "Build high-throughput data pipelines using Kafka and Spark",
    "Write tests and participate in code reviews",
    "Work closely with product and data science teams"
  ],
  "technical_requirements": [
    "python",
    "aws",
    "django",
    "docker",
    "kafka",
    "kubernetes",
    "postgresql",
    "spark"
  ],
  "education": {
    "degree": "Ba",
    "raw": "Ba",
    "required": false
  },
  "experience": {
    "min_years": 5,
    "raw": "5+ years"
  },
  "employment_type": "Full-time",
  "location": "San Francisco, CA",
  "salary": {
    "min": 150000.0,
    "max