# Gumtree Australia Job Description Classification with DistilBERT
This notebook loads the Gumtree Australia dataset, preprocesses it, tokenizes the job descriptions with the DistilBERT tokenizer, and then fine-tunes a DistilBERT text classification model to predict each listing's category.

In [1]:
# Step 1: Install dependencies
!pip install -q transformers datasets scikit-learn pandas

In [2]:
# Step 2: Upload and read the dataset
import io

from google.colab import files
import pandas as pd

# files.upload() opens a browser file picker and returns {filename: file contents}.
uploaded = files.upload()
if not uploaded:
    # Without this guard, next(iter(...)) below fails with a bare StopIteration.
    raise ValueError("No file was uploaded; please select the Gumtree CSV file.")
filename = next(iter(uploaded))

# Parse the uploaded bytes directly instead of re-opening `filename` from disk:
# Colab saves an upload under a different name when that name is already taken.
df = pd.read_csv(io.BytesIO(uploaded[filename]), encoding='ISO-8859-1')
df = df[['job_description', 'category']].dropna()

# Drop rows whose text or label is only whitespace, then renumber the rows last so
# the cleaned frame ends up with a contiguous 0..n-1 index.
has_text = df['job_description'].astype(str).str.strip() != ''
has_label = df['category'].astype(str).str.strip() != ''
df = df[has_text & has_label].reset_index(drop=True)

Saving Gumtree_australia.csv to Gumtree_australia.csv


In [3]:
# Step 3: Label encoding
# Sort the category names so the label ids are deterministic (independent of row
# order) and use the same ordering as the mapping rebuilt later in this notebook.
label2id = {label: idx for idx, label in enumerate(sorted(df['category'].unique()))}
id2label = {idx: label for label, idx in label2id.items()}
# assign() returns a new frame, which avoids pandas' SettingWithCopyWarning when
# `df` is itself the result of a boolean filter.
df = df.assign(label=df['category'].map(label2id))

In [4]:
# Step 4: Convert to Hugging Face Dataset
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# A fixed random_state keeps the 80/20 split reproducible between runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Keep only the model inputs. preserve_index=False stops the shuffled pandas index
# from being stored as an extra `__index_level_0__` column in each split.
model_columns = ['job_description', 'label']
train_dataset = Dataset.from_pandas(train_df[model_columns], preserve_index=False)
test_dataset = Dataset.from_pandas(test_df[model_columns], preserve_index=False)
dataset_dict = DatasetDict({'train': train_dataset, 'test': test_dataset})

In [5]:
# Step 5: Tokenization
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

def tokenize_function(example):
    """Tokenize a batch of job descriptions.

    Args:
        example: Batch mapping passed in by `Dataset.map(..., batched=True)`; its
            'job_description' entry holds the texts to encode.

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's maximum
        length. No padding is applied here: padding every row to the maximum length
        up front inflates the saved dataset and every training batch, so padding is
        left to the data collator, which pads each batch to its own longest row.
    """
    return tokenizer(example['job_description'], truncation=True)

tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/6528 [00:00<?, ? examples/s]

Map:   0%|          | 0/1632 [00:00<?, ? examples/s]

In [6]:
# Step 6: Save for later use
import json

# Persist the tokenized splits as a Hugging Face dataset directory.
tokenized_datasets.save_to_disk("gumtree_tokenized")

# Write each label mapping to its own JSON file (label2id first, then id2label).
for json_path, mapping in (("label2id.json", label2id), ("id2label.json", id2label)):
    with open(json_path, "w") as out_file:
        json.dump(mapping, out_file)

Saving the dataset (0/1 shards):   0%|          | 0/6528 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1632 [00:00<?, ? examples/s]

In [7]:
import pandas as pd
import json

# Load your dataset (adjust filename if needed)
df = pd.read_csv("Gumtree_australia.csv", encoding="ISO-8859-1")

# Drop rows with missing category or job_description
df = df.dropna(subset=["category", "job_description"])

# Also drop rows where either field is only whitespace, matching the cleaning applied
# when the dataset was first loaded. Otherwise a blank category would become a class
# of its own and shift every label id written below.
has_text = df["job_description"].astype(str).str.strip() != ""
has_label = df["category"].astype(str).str.strip() != ""
df = df[has_text & has_label].reset_index(drop=True)

# Create label mappings (sorted, so ids do not depend on row order).
# NOTE(review): these files overwrite any label maps saved earlier, so the ids written
# here must match the `label` column stored in the tokenized dataset — verify that the
# encoding step uses the same sorted ordering.
label2id = {label: idx for idx, label in enumerate(sorted(df['category'].unique()))}
id2label = {idx: label for label, idx in label2id.items()}

# Map labels to new 'label' column (assign() avoids SettingWithCopyWarning)
df = df.assign(label=df["category"].map(label2id))

# Save to JSON (JSON stores the integer keys of id2label as strings)
with open("label2id.json", "w") as f:
    json.dump(label2id, f)

with open("id2label.json", "w") as f:
    json.dump(id2label, f)

print("✅ label2id.json and id2label.json created and saved.")


✅ label2id.json and id2label.json created and saved.


In [8]:
import shutil
import zipfile

from google.colab import files

# Zip the tokenized folder and the label mapping files.
# With root_dir set, the archive holds the *contents* of gumtree_tokenized at its
# top level (there is no enclosing gumtree_tokenized/ directory inside the zip).
shutil.make_archive(
    base_name="gumtree_tokenized_bundle",
    format="zip",
    root_dir="gumtree_tokenized",
)

# Include JSON files in the zip by appending them to the archive created above.
with zipfile.ZipFile("gumtree_tokenized_bundle.zip", mode="a") as bundle:
    for mapping_file in ("label2id.json", "id2label.json"):
        bundle.write(mapping_file)

# Trigger a browser download of the finished bundle.
files.download("gumtree_tokenized_bundle.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
!pip install transformers datasets evaluate

from google.colab import files
uploaded = files.upload()  # Upload label2id.json and id2label.json manually

# Load mappings
import json

with open("label2id.json", "r") as f:
    label2id = json.load(f)

with open("id2label.json", "r") as f:
    id2label = json.load(f)



import zipfile

with zipfile.ZipFile("gumtree_tokenized_bundle (1).zip", 'r') as zip_ref:
    zip_ref.extractall()




Collecting evaluate
  Downloading evaluate-0.4.4-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.4-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.4


Saving gumtree_tokenized_bundle (1).zip to gumtree_tokenized_bundle (1).zip
Saving label2id.json to label2id (1).json
Saving id2label.json to id2label (1).json


In [12]:
from datasets import DatasetDict, Dataset, load_from_disk
from transformers import DistilBertTokenizerFast
import pandas as pd
import json
from sklearn.model_selection import train_test_split

# Load CSV (use correct encoding if needed)
df = pd.read_csv("Gumtree_australia.csv", encoding="ISO-8859-1")

# Keep necessary columns and drop nulls
df = df[['job_description', 'category']].dropna()

# Also drop rows whose text or label is only whitespace: an empty description carries
# no signal and a blank category would otherwise become a class of its own.
has_text = df['job_description'].astype(str).str.strip() != ''
has_label = df['category'].astype(str).str.strip() != ''
df = df[has_text & has_label].reset_index(drop=True)

# Encode category labels (sorted, so the ids do not depend on row order)
label2id = {label: idx for idx, label in enumerate(sorted(df['category'].unique()))}
id2label = {idx: label for label, idx in label2id.items()}
# assign() returns a new frame, avoiding SettingWithCopyWarning on the filtered df
df = df.assign(label=df['category'].map(label2id))

# Train-test split (fixed random_state keeps the 80/20 split reproducible)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# preserve_index=False stops the shuffled pandas index from being stored as an extra
# `__index_level_0__` column in each split.
model_columns = ['job_description', 'label']
train_dataset = Dataset.from_pandas(train_df[model_columns], preserve_index=False)
test_dataset = Dataset.from_pandas(test_df[model_columns], preserve_index=False)
dataset_dict = DatasetDict({'train': train_dataset, 'test': test_dataset})

# Tokenize
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tokenize_function(example):
    """Tokenize a batch of job descriptions.

    Args:
        example: Batch mapping passed in by `Dataset.map(..., batched=True)`; its
            "job_description" entry holds the texts to encode.

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's maximum
        length. No padding is applied here: padding every row to the maximum length
        up front inflates the saved dataset and every training batch, so padding is
        left to the data collator, which pads each batch to its own longest row.
    """
    return tokenizer(example["job_description"], truncation=True)

tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

# Save dataset
tokenized_datasets.save_to_disk("gumtree_tokenized")

# Save label maps (JSON stores the integer keys of id2label as strings)
with open("label2id.json", "w") as f:
    json.dump(label2id, f)

with open("id2label.json", "w") as f:
    json.dump(id2label, f)


Map:   0%|          | 0/6528 [00:00<?, ? examples/s]

Map:   0%|          | 0/1632 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6528 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1632 [00:00<?, ? examples/s]

In [15]:
from transformers import (
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Load label mappings (if not already loaded)
import json
with open("label2id.json", "r") as f:
    label2id = json.load(f)
with open("id2label.json", "r") as f:
    # JSON object keys are always strings; turn them back into integer class ids so
    # that id2label[predicted_id] lookups work with the model's integer predictions.
    id2label = {int(idx): label for idx, label in json.load(f).items()}

# Load model: pretrained DistilBERT encoder with a classification head sized to the
# number of categories; the label maps are passed through to the model config.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

# Metric callback handed to the Trainer
def compute_metrics(eval_pred):
    """Score a batch of predictions.

    Args:
        eval_pred: A (logits, labels) pair.

    Returns:
        dict with "accuracy" and the weighted-average "f1" of the argmax predictions.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted = np.argmax(scores, axis=-1)
    metrics = {}
    metrics["accuracy"] = accuracy_score(references, predicted)
    metrics["f1"] = f1_score(references, predicted, average="weighted")
    return metrics

import inspect

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    logging_dir="./logs",
    save_strategy="epoch",  # This is safe for most versions
    report_to="none"  # Disable W&B
)


# Data collator: pads each batch to the length of its longest example
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Recent transformers releases accept the tokenizer as `processing_class` and deprecate
# the older `tokenizer` keyword; pick whichever name the installed Trainer supports.
trainer_params = inspect.signature(Trainer.__init__).parameters
tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{tokenizer_kwarg: tokenizer},
)

# Start training
train_result = trainer.train()

# No evaluation schedule is configured above, so without this call `eval_dataset` and
# `compute_metrics` would never be used. Score the held-out split once training is done.
eval_metrics = trainer.evaluate()
eval_metrics


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


KeyboardInterrupt: 