In [1]:
# The idea is to get a small labeled data to train a classifier to properly parse the job description content

In [1]:
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib
import numpy as np
import re
import pandas as pd

In [2]:
# ---------- Utilities ----------
def split_sentences_and_bullets(text):
    text = text.replace("â€¢", "-")
    paras = [p.strip() for p in re.split(r'\n{2,}', text) if p.strip()]
    chunks = []
    for p in paras:
        if re.search(r'(^|\n)[\-\*\u2022]\s+', p):
            for ln in [l.strip() for l in p.splitlines() if l.strip()]:
                ln = re.sub(r'^[\-\*\u2022]\s+', '', ln).strip()
                if ln: chunks.append(ln)
        else:
            sents = re.split(r'(?<=[\.\!\?])\s+', p)
            for s in sents:
                s = s.strip()
                if s: chunks.append(s)
    return [c for c in chunks if len(c) > 8]

# ---------- Train ----------
def train_embedding_classifier(X_texts, y_labels, model_name='all-MiniLM-L6-v2', save_path='sbert_jd_clf.joblib'):
    # 1) embed
    embedder = SentenceTransformer(model_name)
    X_emb = embedder.encode(X_texts, show_progress_bar=True)
    # 2) classifier (logistic)
    clf = LogisticRegression(max_iter=1000, class_weight='balanced')
    X_tr, X_val, y_tr, y_val = train_test_split(X_emb, y_labels, test_size=0.2, random_state=42, stratify=y_labels)
    clf.fit(X_tr, y_tr)
    preds = clf.predict(X_val)
    print("Validation report:\n", classification_report(y_val, preds, zero_division=0))
    # save pipeline: embedder + clf
    joblib.dump({'embedder_name': model_name, 'clf': clf}, save_path)
    print("Saved to", save_path)
    return embedder, clf

# ---------- Inference ----------
def load_pipeline(path='sbert_jd_clf.joblib'):
    obj = joblib.load(path)
    embedder = SentenceTransformer(obj['embedder_name'])
    clf = obj['clf']
    return embedder, clf

def predict_chunks(embedder, clf, chunks, proba_threshold=0.30):
    emb = embedder.encode(chunks)
    probs = clf.predict_proba(emb)
    classes = clf.classes_
    out = []
    for i, c in enumerate(chunks):
        row = {classes[j]: float(probs[i, j]) for j in range(len(classes))}
        pred = classes[np.argmax(probs[i])]
        out.append({'text': c, 'pred': pred, 'probs': row})
    return out


In [3]:
# Load data and train the model
df = pd.read_csv('jd_labels_starter.csv')
X, y = df['text'], df['label']

embedder, clf= train_embedding_classifier(X, y)

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation report:
                 precision    recall  f1-score   support

     EDUCATION       1.00      1.00      1.00         3
    EXPERIENCE       1.00      1.00      1.00         3
         OTHER       1.00      0.67      0.80         3
RESPONSIBILITY       0.62      0.83      0.71         6
       SUMMARY       0.00      0.00      0.00         2
          TECH       1.00      1.00      1.00         5
         TITLE       0.67      1.00      0.80         2

      accuracy                           0.83        24
     macro avg       0.76      0.79      0.76        24
  weighted avg       0.80      0.83      0.80        24

Saved to sbert_jd_clf.joblib


In [5]:
# Load the model and predict
# test
jd = """Senior Backend Engineer

Responsibilities:
- Design APIs in Python and maintain Kubernetes deployments.
- Design and implement microservices in Python and maintain Kubernetes deployments.
- Write tests and collaborate with frontend teams.

Qualifications:
- Bachelor's degree in CS or equivalent practical experience.
- 3+ years in backend engineering with PostgreSQL and Redis.
Tech: Docker, PostgreSQL, Redis.
"""

chunks = split_sentences_and_bullets(jd)

embedder, clf = load_pipeline()
res = predict_chunks(embedder, clf, chunks)
import json
print(json.dumps(res, indent=2))


[
  {
    "text": "Senior Backend Engineer",
    "pred": "TITLE",
    "probs": {
      "EDUCATION": 0.10102192574394628,
      "EXPERIENCE": 0.17758058638110968,
      "OTHER": 0.09025477002970411,
      "RESPONSIBILITY": 0.09791597765084976,
      "SUMMARY": 0.05951692952150772,
      "TECH": 0.09672447799708607,
      "TITLE": 0.3769853326757965
    }
  },
  {
    "text": "Responsibilities:",
    "pred": "OTHER",
    "probs": {
      "EDUCATION": 0.09668299277753689,
      "EXPERIENCE": 0.07721930771861686,
      "OTHER": 0.2767166644309528,
      "RESPONSIBILITY": 0.19627273898647388,
      "SUMMARY": 0.14517192718864275,
      "TECH": 0.11012198278718716,
      "TITLE": 0.09781438611058964
    }
  },
  {
    "text": "Design APIs in Python and maintain Kubernetes deployments.",
    "pred": "TECH",
    "probs": {
      "EDUCATION": 0.053890908843502665,
      "EXPERIENCE": 0.09294414687275653,
      "OTHER": 0.09392777918933408,
      "RESPONSIBILITY": 0.23932053857025185,
      "SUM