# Training a Custom Intent Classification Model

This notebook demonstrates fine-tuning a HuggingFace Transformer on a dataset of user messages and intent labels. You can replace the sample dataset with your own CSV file containing `message,label` pairs.

In [None]:
# install required packages (only run once or if you get import errors)
!pip install transformers datasets torch scikit-learn evaluate

# Imports: os and pandas for file handling & data, datasets for making HF dataset,
# evaluate for metrics, transformers for the model/training, torch for tensors
import os
import pandas as pd
from datasets import Dataset
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch


Defaulting to user installation because normal site-packages is not writeable
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
Installing collected packages: evaluate
Successfully installed evaluate-0.4.6



[notice] A new release of pip is available: 25.1.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# Load the dataset from CSV when it is available, otherwise fall back to a
# tiny demo frame so the rest of the notebook can still execute.
# The CSV must contain two columns: `message` and `label`.
# Default location: <parent of the working directory>/data/intents.csv
csv_path = os.path.join(os.path.dirname(os.getcwd()), 'data', 'intents.csv')

if os.path.isfile(csv_path):
    df = pd.read_csv(csv_path)
else:
    # No real data supplied yet: use a small dummy dataframe for demonstration.
    df = pd.DataFrame({
        'message': ['how do i log in?', 'where are the quizzes?', 'tell me a joke'],
        'label': ['login', 'quizzes', 'misc']
    })

# Fail early with a clear message instead of a KeyError further down.
missing_cols = {'message', 'label'} - set(df.columns)
if missing_cols:
    raise ValueError(f"{csv_path} is missing required column(s): {sorted(missing_cols)}")

df.head()

In [None]:
# Convert the pandas dataframe to a HuggingFace Dataset.
# Mapping dictionaries between label strings and integer IDs; IDs follow the
# order in which each label first appears in the dataframe.
label_list = df['label'].unique().tolist()
label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for l, i in label2id.items()}

df['label_id'] = df['label'].map(label2id)  # add numeric label column

# Build the HF dataset from the message and numeric label columns.
# The target column is renamed to `labels`: Trainer drops dataset columns that
# the model's forward() does not accept, so a column called `label_id` would be
# discarded and training would fail because no loss can be computed.
hf_dataset = Dataset.from_pandas(df[['message', 'label_id']]).rename_column('label_id', 'labels')

# Tokenizer for the same pretrained checkpoint that is fine-tuned later
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')


def preprocess(examples):
    """Tokenize a batch of messages, padding/truncating each one to 128 tokens."""
    messages = examples['message']
    return tokenizer(
        messages,
        max_length=128,
        padding='max_length',
        truncation=True,
    )


# Apply the tokenizer to the whole dataset in batches
hf_dataset = hf_dataset.map(preprocess, batched=True)



# split into train/test sets (fixed seed so the split is reproducible across runs)
hf_dataset = hf_dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
# Build the classifier on top of the pretrained checkpoint.
# The classification head gets one output per distinct intent label, and the
# label <-> id mappings are stored in the model config.
num_labels = len(label_list)
label_config = {
    'num_labels': num_labels,
    'id2label': id2label,
    'label2id': label2id,
}
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', **label_config)

# Training arguments control the learning process and the output location.
# The per-epoch evaluation option was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases, and package versions are not
# pinned here, so use whichever keyword the installed version defines.
eval_strategy_arg = (
    'eval_strategy'
    if 'eval_strategy' in TrainingArguments.__dataclass_fields__
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./models/intent',        # where checkpoints will be saved
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    save_total_limit=2,                  # keep last two checkpoints
    **{eval_strategy_arg: 'epoch'},      # evaluate at end of each epoch
)

# Accuracy metric loaded through the evaluate library
metric = evaluate.load('accuracy')


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    `eval_pred` unpacks into (logits, labels); the predicted class is the
    index of the largest logit along the last axis.
    """
    logits, labels = eval_pred
    predicted_ids = logits.argmax(axis=-1)
    return metric.compute(references=labels, predictions=predicted_ids)

# Build the trainer object that handles the training and evaluation loop.
# All arguments, including compute_metrics, are passed inside the single call.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_dataset['train'],
    eval_dataset=hf_dataset['test'],
    compute_metrics=compute_metrics,
)

trainer  # display trainer configuration

In [None]:
# Run fine-tuning (this may take several minutes depending on data size and hardware)
trainer.train()

# Persist the fine-tuned model and its tokenizer side by side for later inference
final_model_dir = './models/intent'
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)


In [None]:
# Example inference on a new message using the trained model

def predict(text):
    """Return the predicted intent label for a single message.

    Args:
        text: the user message to classify.

    Returns:
        The label string from `id2label` for the highest-scoring class.
    """
    # Inference mode: disables dropout so predictions are deterministic.
    model.eval()
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)
    # After training the model may live on a GPU; inputs must be on the same device.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
    # No gradients are needed for prediction.
    with torch.no_grad():
        outputs = model(**inputs)
    pred = outputs.logits.argmax(dim=-1).item()
    return id2label[pred]

# Run a couple of sample sentences through the model and print each predicted intent
for sample_message in ("how do i log in?", "show me resources"):
    print(predict(sample_message))
