# Job Classifier LLM
This notebook was run on Google Colab with a GPU runtime.

In [None]:
!pip uninstall transformers -y
!pip install transformers --upgrade


Found existing installation: transformers 4.55.0
Uninstalling transformers-4.55.0:
  Successfully uninstalled transformers-4.55.0
Collecting transformers
  Downloading transformers-4.55.0-py3-none-any.whl.metadata (39 kB)
Downloading transformers-4.55.0-py3-none-any.whl (11.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m57.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
Successfully installed transformers-4.55.0


In [None]:
!pip install transformers datasets evaluate accelerate

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting accelerate
  Downloading accelerate-1.10.0-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x

# Load Job Posting Dataset

In [None]:
from datasets import load_dataset

# Read the raw job postings from the local JSON file.
job_dataset = load_dataset("json", data_files="dataset.json")
# Show the shape of the loaded data as a quick sanity check.
job_dataset.shape

In [None]:
# Hold out 20% of the postings as the test split. The seed is fixed (same value as the
# later train/validation split) so the split is reproducible after a kernel restart.
job_dataset = job_dataset['train'].train_test_split(test_size=0.2, seed=42)
job_dataset

In [None]:
def combine_text(example):
    """Join the free-text fields of one posting into a single ``text`` field.

    The fields are concatenated in the order title, company_profile,
    description, requirements, benefits, separated by single spaces.

    Returns:
        dict with the single key ``"text"`` holding the combined string.
    """
    field_order = ("title", "company_profile", "description", "requirements", "benefits")
    return {"text": " ".join(example[field] for field in field_order)}

In [None]:
# Keep only postings in which every free-text field is present and non-empty.
job_dataset = job_dataset.filter(
    lambda row: all(
        row[field]
        for field in ("title", "company_profile", "description", "requirements", "benefits")
    )
)

In [None]:
# Add the combined "text" column produced by combine_text to the dataset.
job_dataset = job_dataset.map(combine_text)
job_dataset

In [None]:
# Inspect the combined text of the first training example.
job_dataset['train'][0]['text']

In [None]:
# Rename the target column from "fraudulent" to "labels".
job_dataset = job_dataset.rename_column("fraudulent", "labels")
job_dataset

In [None]:
# Show the column schema of the training split.
job_dataset['train'].features

In [None]:
import re
from bs4 import BeautifulSoup
import html

def clean_text(batch):
    """Clean every string in ``batch["text"]`` and return them under ``"text"``.

    Each text goes through, in order: HTML parsing with tag removal
    (BeautifulSoup ``get_text``), HTML entity unescaping, removal of
    ``#URL_...#`` placeholder tokens, and collapsing of whitespace runs to a
    single space with leading/trailing whitespace trimmed.
    """
    def _clean_one(raw):
        # Strip markup first, then decode any entities left in the plain text.
        plain = html.unescape(BeautifulSoup(raw, "html.parser").get_text())
        without_urls = re.sub(r"#URL_[^#]+#", "", plain)
        return re.sub(r"\s+", " ", without_urls).strip()

    return {"text": [_clean_one(raw) for raw in batch["text"]]}

# Clean the "text" column in batches (clean_text receives a batch of rows per call).
job_dataset = job_dataset.map(clean_text, batched=True)


In [None]:
# Inspect the cleaned text of the first training example.
job_dataset['train'][0]['text']

In [None]:
# Carve a validation split (20%, fixed seed) out of the training data and carry
# the existing test split over unchanged.

job_dataset_clean = job_dataset['train'].train_test_split(test_size=0.2, seed=42)
# The held-out part returned under "test" becomes the validation split.
job_dataset_clean['validation'] = job_dataset_clean.pop('test')
job_dataset_clean['test'] = job_dataset['test']

job_dataset_clean

In [None]:
# Write each split to its own file, named job-post-<split>.jsonl.
for split, dataset in job_dataset_clean.items():
    dataset.to_json(f"job-post-{split}.jsonl")

Reload the Saved Splits

In [None]:
# Map each split name to the file it was written to, then load all three splits
# back from disk into a single dataset object.
data_files = {
    "train": "job-post-train.jsonl",
    "validation": "job-post-validation.jsonl",
    "test": "job-post-test.jsonl",
}
job_dataset = load_dataset("json", data_files=data_files)

job_dataset

In [None]:
from transformers import AutoTokenizer

# Pretrained checkpoint name; reused later when the classification model is loaded.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Tokenize

In [None]:
def tokenize_and_split(examples):
    """Tokenize a batch of texts, splitting long texts into several rows.

    Texts are truncated to 128 tokens and the overflowing tokens are returned
    as additional rows. Every input column (including the label) is repeated
    for each row produced from the same original example, so all output
    columns have the same length.
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
    # For each output row, the index of the input example it came from.
    source_index = encoded.pop("overflow_to_sample_mapping")
    for column, column_values in examples.items():
        encoded[column] = [column_values[src] for src in source_index]
    return encoded

In [None]:
# Tokenize in batches; tokenize_and_split may return more rows than it receives.
tokenized_dataset = job_dataset.map(
    tokenize_and_split, batched=True
)

In [None]:
# Display the tokenized dataset summary.
tokenized_dataset

Train Model with Only Text Features

In [None]:
# Drop every original column except the label and the raw text; the tokenizer
# outputs added by tokenization are not in this list and therefore stay.
to_remove_cols = [
    name
    for name in job_dataset["train"].column_names
    if name not in {"labels", "text"}
]
tokenized_dataset = tokenized_dataset.remove_columns(to_remove_cols)
tokenized_dataset


In [None]:
# Show the column schema of the tokenized training split.
tokenized_dataset['train'].features

Start Fine Tuning

In [None]:
from transformers import DataCollatorWithPadding

# Collator built from the tokenizer; passed to the trainer to pad each batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Class index -> readable name, and the inverse lookup derived from it.
id2label = dict(enumerate(("REAL", "FAKE")))
label2id = {name: index for index, name in id2label.items()}

In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

# Pick the GPU when one is available and fall back to CPU otherwise, so this cell
# does not raise on a runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Binary sequence classifier initialised from the pretrained checkpoint, with
# readable label names attached to the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint, num_labels=2, id2label=id2label, label2id=label2id
)
model.to(device)
print(torch.cuda.is_available())

In [None]:
import evaluate
import numpy as np
from sklearn.metrics import matthews_corrcoef, balanced_accuracy_score, confusion_matrix

# Load the evaluation metrics once so compute_metrics can reuse them on every call.
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
# NOTE(review): roc_auc_metric is not referenced by compute_metrics in the visible cells.
roc_auc_metric = evaluate.load("roc_auc")

def compute_metrics(eval_preds):
    """Compute classification metrics from a ``(logits, labels)`` pair.

    The predicted class is the argmax over the last axis of the logits.

    Returns:
        dict with accuracy, macro-averaged F1 / precision / recall, and the
        Matthews correlation coefficient ("mcc").
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    pair = {"predictions": predictions, "references": labels}

    return {
        "accuracy": accuracy_metric.compute(**pair)["accuracy"],
        "f1_macro": f1_metric.compute(average="macro", **pair)["f1"],
        "precision_macro": precision_metric.compute(average="macro", **pair)["precision"],
        "recall_macro": recall_metric.compute(average="macro", **pair)["recall"],
        "mcc": matthews_corrcoef(labels, predictions),
    }


In [None]:
import torch

def compute_class_weights_from_counts(count_class0, count_class1):
    """
    Compute sklearn-style ("balanced") class weights given counts for each class.

    Each weight is ``total / (num_classes * count)``, so the rarer class gets the
    larger weight and a perfectly balanced dataset yields ``[1.0, 1.0]``.

    Args:
        count_class0 (int): Number of samples in class 0
        count_class1 (int): Number of samples in class 1
    Returns:
        torch.tensor of shape [2] (dtype float32) with weights for [class0, class1]
    Raises:
        ValueError: if either count is zero or negative; the weight of an absent
            class is undefined and negative counts would give negative weights.
    """
    counts = (count_class0, count_class1)
    if any(count <= 0 for count in counts):
        raise ValueError(
            f"class counts must be positive, got class0={count_class0}, class1={count_class1}"
        )
    total = sum(counts)
    num_classes = len(counts)
    weights = [total / (num_classes * count) for count in counts]
    return torch.tensor(weights, dtype=torch.float)


In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    # Evaluate on the validation split at the end of every epoch.
    eval_strategy="epoch",
    logging_steps=50,
    save_steps=500,
    # Turn off external experiment loggers explicitly; this is the supported
    # replacement for the deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
import torch
import torch.nn.functional as F
from collections import Counter
from transformers import Trainer

def focal_loss_with_class_weights(logits, labels, class_weights=None, gamma=2.0):
    """
    Class-weighted focal loss, averaged over the batch.

    The per-sample loss is ``w[y] * (1 - p_t) ** gamma * (-log p_t)``, where
    ``p_t`` is the softmax probability assigned to the true class and ``w[y]``
    is the weight of the true class (1 when no weights are given).

    Args:
        logits: [batch_size, num_classes]
        labels: [batch_size]
        class_weights: optional torch.tensor of shape [num_classes]
        gamma: focusing exponent; ``gamma=0`` reduces to (weighted) cross-entropy
            averaged over the batch.
    Returns:
        Scalar tensor holding the mean loss.
    """
    if class_weights is not None:
        # Weights may have been created on CPU; cross_entropy needs them on the logits' device.
        class_weights = class_weights.to(logits.device)

    # p_t has to come from the UNWEIGHTED cross-entropy. exp(-weighted_ce) would be
    # p_t ** w[y], which distorts the focal term for every class whose weight != 1.
    plain_ce = F.cross_entropy(logits, labels, reduction="none")
    pt = torch.exp(-plain_ce)  # probability of true class
    focal_term = (1 - pt) ** gamma

    if class_weights is None:
        weighted_ce = plain_ce
    else:
        weighted_ce = F.cross_entropy(logits, labels, weight=class_weights, reduction="none")
    return (focal_term * weighted_ce).mean()

class CustomLossTrainer(Trainer):
    """Trainer whose loss is computed by a user-supplied function.

    Args:
        loss_fn: optional callable ``loss_fn(logits, labels)`` returning a scalar
            loss. When omitted, the stock ``Trainer`` loss is used.
        *args, **kwargs: forwarded unchanged to ``Trainer``.
    """

    def __init__(self, *args, loss_fn=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fn = loss_fn

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model and score its logits with ``self.loss_fn``.

        Returns ``(loss, outputs)`` when ``return_outputs`` is true, otherwise
        just the loss.
        """
        if self.loss_fn is None:
            # No custom loss configured: defer to the default behaviour instead of
            # failing with "'NoneType' object is not callable".
            return super().compute_loss(
                model,
                inputs,
                return_outputs=return_outputs,
                num_items_in_batch=num_items_in_batch,
            )

        # Labels are removed from the inputs so the loss comes from loss_fn
        # rather than from the model's forward pass.
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss = self.loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss



# Derive the class weights from the label counts of the rows the model is actually
# trained on (the tokenized training split) instead of hard-coded counts, so the
# weights stay correct when the split or the tokenization changes.
label_counts = Counter(tokenized_dataset["train"]["labels"])
# compute_class_weights_from_counts already returns a float tensor; wrapping it in
# torch.tensor() again only triggers a copy-construct UserWarning.
weights = compute_class_weights_from_counts(label_counts[0], label_counts[1])


trainer = CustomLossTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    # Focal loss with the class weights computed above.
    loss_fn=lambda logits, labels: focal_loss_with_class_weights(
        logits, labels, class_weights=weights, gamma=2.0
    ),
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

  weights = torch.tensor(weights, dtype=torch.float)


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro,Mcc
1,0.1812,0.23768,0.958738,0.766675,0.743908,0.795031,0.536509
2,0.1492,0.296379,0.975625,0.829508,0.874892,0.794716,0.664791
3,0.0647,0.238815,0.978095,0.849723,0.888116,0.81882,0.703531


TrainOutput(global_step=19167, training_loss=0.2080575880797317, metrics={'train_runtime': 2702.1275, 'train_samples_per_second': 56.74, 'train_steps_per_second': 7.093, 'total_flos': 5077409156785152.0, 'train_loss': 0.2080575880797317, 'epoch': 3.0})