In [22]:
# distilbert_job_category_classifier.py
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os


In [2]:
# -------------------------
# CONFIG
# -------------------------
# Every tunable value for the run is collected here so later cells do not
# hard-code them.
MODEL_NAME = "distilbert-base-uncased"  # checkpoint name passed to from_pretrained
TEXT_COL = "processed_text"             # column holding the model input text
LABEL_COL = "merged_category"           # column holding the target category
MAX_LEN = 256                           # max_length handed to the tokenizer
BATCH_SIZE = 16
EPOCHS = 3
LR = 2e-5                               # learning rate given to AdamW
SEED = 42                               # same value the data split uses as random_state
SAVE_DIR = "./distilbert_job_model"

# Seed the NumPy and PyTorch global RNGs so randomly initialised weights and
# DataLoader shuffling are repeatable between runs of the notebook.
np.random.seed(SEED)
torch.manual_seed(SEED)

os.makedirs(SAVE_DIR, exist_ok=True)

In [3]:
# Quick look at the columns available in the cleaned CSV.
# (pandas is already imported as `pd` in the notebook's import cell.)
df = pd.read_csv("data_cleaned.csv")
print(list(df.columns))


['job_id', 'job_title', 'company', 'descriptions', 'State', 'merged_category', 'category', 'subcategory', 'role_clean', 'type_clean', 'salary', 'processed_title+desc', 'processed_text']


In [4]:
# -------------------------
# 1. Load dataset
# -------------------------

# Columns used as model input and as classification target.
TEXT_COL = "processed_text"
LABEL_COL = "merged_category"

# Read the CSV and keep only rows where both the text and the label are present;
# the index is renumbered after the drop.
required_cols = [TEXT_COL, LABEL_COL]
df = (
    pd.read_csv("data_cleaned.csv")
    .dropna(subset=required_cols)
    .reset_index(drop=True)
)

n_categories = df[LABEL_COL].nunique()
print(f"✅ Loaded {len(df)} job postings, {n_categories} categories.")
print("Columns:", df.columns.tolist())


✅ Loaded 65878 job postings, 14 categories.
Columns: ['job_id', 'job_title', 'company', 'descriptions', 'State', 'merged_category', 'category', 'subcategory', 'role_clean', 'type_clean', 'salary', 'processed_title+desc', 'processed_text']


In [5]:
# -------------------------
# 2. Encode labels and split dataset
# -------------------------
# Turn category names into integer ids and store the fitted encoder in SAVE_DIR.
le = LabelEncoder()
df["label"] = le.fit_transform(df[LABEL_COL])
encoder_path = os.path.join(SAVE_DIR, "label_encoder.pkl")
joblib.dump(le, encoder_path)

# Stratified 70% / 20% / 10% split: hold out 30% first, then hand one third
# of that hold-out to the test set and the rest to validation.
train_df, temp_df = train_test_split(
    df,
    test_size=0.3,
    stratify=df["label"],
    random_state=42,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=1/3,
    stratify=temp_df["label"],
    random_state=42,
)
print(f"✅ Split complete — Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")


✅ Split complete — Train: 46114, Val: 13176, Test: 6588


In [6]:
# -------------------------
# 3. Dataset class & tokenizer
# -------------------------
# Tokenizer loaded for the checkpoint named by MODEL_NAME. It is kept at module
# level because JobDataset.__getitem__ looks up the name `tokenizer` when it
# encodes a text.
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)

class JobDataset(Dataset):
    """Dataset that tokenizes one text per lookup and pairs it with its label.

    Args:
        texts: Indexable sequence of input strings.
        labels: Indexable sequence of integer class ids, aligned with ``texts``.
        tokenizer: Optional callable used to encode a text. When omitted, the
            module-level ``tokenizer`` is looked up at access time.
        max_len: Optional padding/truncation length. When omitted, the
            module-level ``MAX_LEN`` is looked up at access time.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer=None, max_len=None):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self._tokenizer = tokenizer
        self._max_len = max_len

    def __len__(self):
        """Number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one example."""
        # Fall back to the notebook globals only when nothing was injected, so
        # existing two-argument construction keeps working unchanged.
        encode = self._tokenizer if self._tokenizer is not None else tokenizer
        max_length = self._max_len if self._max_len is not None else MAX_LEN
        tokens = encode(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )
        return {
            # squeeze(0) removes only the leading axis; a bare squeeze() would
            # also collapse the sequence axis whenever its length is 1.
            "input_ids": tokens["input_ids"].squeeze(0),
            "attention_mask": tokens["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Wrap each split's texts and integer labels in a JobDataset.
train_ds, val_ds, test_ds = (
    JobDataset(split[TEXT_COL].tolist(), split["label"].tolist())
    for split in (train_df, val_df, test_df)
)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(train_ds, shuffle=True, batch_size=BATCH_SIZE)
val_loader, test_loader = (
    DataLoader(ds, batch_size=BATCH_SIZE) for ds in (val_ds, test_ds)
)


In [7]:
# -------------------------
# 4. Model setup
# -------------------------
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# One output per distinct encoded label.
num_labels = df["label"].nunique()

model = DistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=num_labels
)
model = model.to(device)

# The schedule covers one scheduler step per training batch for EPOCHS epochs,
# with the first 10% of those steps used as warmup.
total_steps = EPOCHS * len(train_loader)
warmup_steps = int(0.1 * total_steps)

optimizer = AdamW(model.parameters(), lr=LR)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# -------------------------
# 5. Training & evaluation functions
# -------------------------
def train_epoch(model, loader):
    """Run one training pass over ``loader`` and return the mean per-batch loss.

    Relies on the module-level ``optimizer``, ``scheduler`` and ``device``.
    Gradients are clipped to a norm of 1.0 before every optimizer step, and
    the scheduler is stepped once per batch.
    """
    model.train()
    running_loss = 0
    progress = tqdm(loader, desc="Training")
    for batch in progress:
        optimizer.zero_grad()
        input_ids, mask, labels = (
            batch[key].to(device)
            for key in ("input_ids", "attention_mask", "labels")
        )
        # Passing labels makes the model return the loss alongside the logits.
        loss = model(input_ids, attention_mask=mask, labels=labels).loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        running_loss += loss.item()
    mean_loss = running_loss / len(loader)
    return mean_loss

def eval_epoch(model, loader):
    """Evaluate ``model`` on ``loader`` without tracking gradients.

    Relies on the module-level ``device``.

    Returns:
        A tuple ``(mean_batch_loss, preds, true_labels)`` where the two lists
        hold the predicted and reference class ids collected batch by batch.
    """
    model.eval()
    loss_sum = 0
    preds = []
    true_labels = []
    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating"):
            input_ids, mask, labels = (
                batch[key].to(device)
                for key in ("input_ids", "attention_mask", "labels")
            )
            outputs = model(input_ids, attention_mask=mask, labels=labels)
            loss_sum += outputs.loss.item()
            # Highest-scoring class along dim 1 is the prediction.
            batch_preds = outputs.logits.argmax(dim=1)
            preds.extend(batch_preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    mean_loss = loss_sum / len(loader)
    return mean_loss, preds, true_labels


In [25]:
# Environment report: PyTorch build and, when CUDA is usable, the CUDA version
# and the name of GPU 0. (torch is imported in the notebook's import cell.)
cuda_ok = torch.cuda.is_available()
print("PyTorch version:", torch.__version__)
print("CUDA available:", cuda_ok)
if cuda_ok:
    print("CUDA version:", torch.version.cuda)
    print("GPU name:", torch.cuda.get_device_name(0))


PyTorch version: 2.7.1+cu118
CUDA available: True
CUDA version: 11.8
GPU name: NVIDIA GeForce RTX 3060 Ti


In [None]:
# -------------------------
# 6. Training loop 
# -------------------------
# Each epoch runs one training pass and one validation pass through the
# train_epoch / eval_epoch helpers defined earlier in this notebook (they show
# tqdm progress bars), then saves the model whenever validation loss improves.
best_val_loss = float('inf')

# Bind a fresh optimizer and LR schedule to the current `model` and the current
# EPOCHS value. Re-running this cell with objects left over from an earlier run
# would otherwise train with an already-consumed schedule, or with an optimizer
# that still holds the parameters of a model object that has since been replaced.
# train_epoch reads these two names from module scope.
optimizer = AdamW(model.parameters(), lr=LR)
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)

# Report every encoder class, in encoder order, even if one is missing from a
# given evaluation pass; this keeps target_names aligned with the rows.
label_ids = list(range(len(le.classes_)))

for epoch in range(EPOCHS):
    print(f"\n===== Epoch {epoch+1}/{EPOCHS} =====")

    # --- Training ---
    avg_train_loss = train_epoch(model, train_loader)
    print(f"Average training loss: {avg_train_loss:.4f}")

    # --- Validation ---
    avg_val_loss, val_preds, val_labels = eval_epoch(model, val_loader)
    print(f"Validation loss: {avg_val_loss:.4f}")

    # --- Metrics ---
    print("Validation classification report:")
    print(classification_report(val_labels, val_preds, labels=label_ids, target_names=le.classes_))

    # --- Save best model ---
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        model.save_pretrained(SAVE_DIR)
        tokenizer.save_pretrained(SAVE_DIR)
        print(f"✅ Saved best model at epoch {epoch+1}")


In [None]:
# -------------------------
# 7. Test evaluation
# -------------------------
# Reload the checkpoint written to SAVE_DIR by the training loop and score it
# on the held-out test loader.
# NOTE: this rebinds `model` to a new object. An optimizer created earlier from
# the previous model's parameters does not update this one; build a new
# optimizer/scheduler before training any further.
print("\nLoading best model for testing...")
model = DistilBertForSequenceClassification.from_pretrained(SAVE_DIR).to(device)
test_loss, test_preds, test_true = eval_epoch(model, test_loader)

# Pin the label set to every encoder class, in encoder order. Without this the
# report rows and matrix axes only cover classes that occur in the test labels
# or predictions, which would no longer match le.classes_ used for the names.
label_ids = list(range(len(le.classes_)))

print(f"\nTest loss: {test_loss:.4f}")
print(classification_report(test_true, test_preds, labels=label_ids, target_names=le.classes_))

# Confusion Matrix (rows = true class, columns = predicted class)
cm = confusion_matrix(test_true, test_preds, labels=label_ids)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=False,
    cmap="Blues",
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    ax=ax,
)
ax.set_title("Confusion Matrix - DistilBERT Job Category Classification")
ax.set_xlabel("Predicted Category")
ax.set_ylabel("True Category")
fig.tight_layout()
fig.savefig(os.path.join(SAVE_DIR, "confusion_matrix.png"))
plt.show()

print("\n✅ Training complete. Model saved to:", SAVE_DIR)

In [29]:
# -------------------------
# CONFIG
# -------------------------
# Repeats the run settings with EPOCHS raised from 3 to 10.
# NOTE: the model-setup cell derives the scheduler's total_steps from EPOCHS,
# so a scheduler built before this cell ran is still sized for the old value
# and has to be rebuilt for the new one to take effect.
SAVE_DIR = "./distilbert_job_model"
os.makedirs(SAVE_DIR, exist_ok=True)

MODEL_NAME = "distilbert-base-uncased"
TEXT_COL, LABEL_COL = "processed_text", "merged_category"
MAX_LEN, BATCH_SIZE = 256, 16
EPOCHS = 10
LR = 2e-5

In [31]:
# -------------------------
# 6. Training loop (fixed version)
# -------------------------
# torch and classification_report come from the notebook's import cell.
best_val_loss = float('inf')

# Rebuild the optimizer and LR schedule for the `model` object and EPOCHS value
# in effect right now. The ones created in the model-setup cell were sized with
# the EPOCHS value at that time and hold the parameters of the model that
# existed then; after the test-evaluation cell rebinds `model`, stepping that
# old optimizer no longer changes the model being trained here.
optimizer = AdamW(model.parameters(), lr=LR)
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)

# Report every encoder class, in encoder order, so target_names always lines
# up with the rows even if a class is missing from a validation pass.
label_ids = list(range(len(le.classes_)))

# --- Sanity checks ---
print("Model device:", next(model.parameters()).device)
# With a warmup schedule the rate printed here is the value at step 0, before
# any warmup step has been taken; LR is the configured base rate.
for g in optimizer.param_groups:
    print("Initial learning rate:", g['lr'])

for epoch in range(EPOCHS):
    print(f"\n===== Epoch {epoch+1}/{EPOCHS} =====")

    # --- Training ---
    model.train()
    train_loss = 0.0

    for step, batch in enumerate(train_loader):
        optimizer.zero_grad()

        # Move batch to correct device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Forward + backward
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()

        # Gradient check (every 200 steps): report the mean absolute gradient
        # of the first parameter that received one.
        if step % 200 == 0:
            for name, param in model.named_parameters():
                if param.grad is not None:
                    print(f"Grad check [{name}] mean={param.grad.abs().mean().item():.6f}")
                    break

        # Gradient clipping + optimization
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()  # must come after optimizer.step()

        train_loss += loss.item()

        # Print progress
        if step % 100 == 0:
            print(f"Step {step}/{len(train_loader)}, Loss: {loss.item():.4f}")

    avg_train_loss = train_loss / len(train_loader)
    print(f"Average training loss: {avg_train_loss:.4f}")

    # --- Validation ---
    model.eval()
    val_loss = 0.0
    val_preds, val_labels = [], []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

            preds = torch.argmax(outputs.logits, dim=1)
            val_preds.extend(preds.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    avg_val_loss = val_loss / len(val_loader)
    print(f"Validation loss: {avg_val_loss:.4f}")
    print("Validation classification report:")
    print(classification_report(val_labels, val_preds, labels=label_ids, target_names=le.classes_))

    # --- Save best model ---
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        model.save_pretrained(SAVE_DIR)
        tokenizer.save_pretrained(SAVE_DIR)
        print(f"✅ Saved best model at epoch {epoch+1}")

print("\nTraining complete.")


Model device: cuda:0
Initial learning rate: 0.0

===== Epoch 1/10 =====
Grad check [distilbert.embeddings.word_embeddings.weight] mean=0.000009
Step 0/2883, Loss: 0.2213
Step 100/2883, Loss: 0.0024
Grad check [distilbert.embeddings.word_embeddings.weight] mean=0.000004
Step 200/2883, Loss: 0.0029
Step 300/2883, Loss: 0.0214
Grad check [distilbert.embeddings.word_embeddings.weight] mean=0.000011
Step 400/2883, Loss: 0.0502
Step 500/2883, Loss: 0.2860
Grad check [distilbert.embeddings.word_embeddings.weight] mean=0.000005
Step 600/2883, Loss: 0.0117
Step 700/2883, Loss: 0.4676
Grad check [distilbert.embeddings.word_embeddings.weight] mean=0.000003
Step 800/2883, Loss: 0.0029
Step 900/2883, Loss: 0.0108
Grad check [distilbert.embeddings.word_embeddings.weight] mean=0.000003
Step 1000/2883, Loss: 0.0050
Step 1100/2883, Loss: 0.0033
Grad check [distilbert.embeddings.word_embeddings.weight] mean=0.000004
Step 1200/2883, Loss: 0.0107
Step 1300/2883, Loss: 0.0028
Grad check [distilbert.embeddi

KeyboardInterrupt: 

In [None]:
# -------------------------
# 7. Test evaluation
# -------------------------
# Reload the checkpoint written to SAVE_DIR by the training loop and score it
# on the held-out test loader.
# NOTE: this rebinds `model` to a new object. An optimizer created earlier from
# the previous model's parameters does not update this one; build a new
# optimizer/scheduler before training any further.
print("\nLoading best model for testing...")
model = DistilBertForSequenceClassification.from_pretrained(SAVE_DIR).to(device)
test_loss, test_preds, test_true = eval_epoch(model, test_loader)

# Pin the label set to every encoder class, in encoder order. Without this the
# report rows and matrix axes only cover classes that occur in the test labels
# or predictions, which would no longer match le.classes_ used for the names.
label_ids = list(range(len(le.classes_)))

print(f"\nTest loss: {test_loss:.4f}")
print(classification_report(test_true, test_preds, labels=label_ids, target_names=le.classes_))

# Confusion Matrix (rows = true class, columns = predicted class)
cm = confusion_matrix(test_true, test_preds, labels=label_ids)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=False,
    cmap="Blues",
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    ax=ax,
)
ax.set_title("Confusion Matrix - DistilBERT Job Category Classification")
ax.set_xlabel("Predicted Category")
ax.set_ylabel("True Category")
fig.tight_layout()
fig.savefig(os.path.join(SAVE_DIR, "confusion_matrix.png"))
plt.show()

print("\n✅ Training complete. Model saved to:", SAVE_DIR)