In [None]:
# Install HuggingFace libraries
!pip install transformers datasets evaluate
!pip install -U datasets  # this prevents local cache errors with datasets

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5
Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-4.0.0-py3-none-any.whl (494 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uni

### Load dataset

In [None]:
# Fetch the airline-queries dataset from the HuggingFace Hub. load_dataset merges
# every CSV file under the /train directory into one 'train' split with a single header.
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset(
    "lanehale1/airline-queries",
    data_dir='train',
    cache_dir=None,
)
raw_datasets

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


query_intent_booking.csv: 0.00B [00:00, ?B/s]

query_intent_general.csv: 0.00B [00:00, ?B/s]

query_intent_status.csv: 0.00B [00:00, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['query', 'intent'],
        num_rows: 1001
    })
})

In [None]:
# Show the first row, the last row, and two slices taken where the 'intent'
# value changes, so that every class ('intent') appears in the output
train_rows = raw_datasets['train']
train_rows[0], train_rows[389:391], train_rows[673:675], train_rows[1000]

({'query': 'Find ORD-ORH seats for 11-14', 'intent': 'booking'},
 {'query': ['Yakima-BIH book the 2nd of Dec flight',
   'What luggage is allowed on my flight?'],
  'intent': ['booking', 'general']},
 {'query': ['when does boarding start?', 'has f9 flight 8170 gotten in'],
  'intent': ['general', 'status']},
 {'query': 'give the status of mq1569', 'intent': 'status'})

In [None]:
# Change 'intent' labels to ClassLabel data type for the datasets library,
# and split the dataset 60/40 for 600 training rows (or 200 each class)
from datasets import ClassLabel, Value

# Sort the class names before building the ClassLabel so that each intent always
# gets the same integer id (booking=0, general=1, status=2), independent of the
# order in which the rows happen to appear in the source files.
intent_names = sorted(raw_datasets['train'].unique('intent'))

# Cast the 'intent' column to ClassLabel
raw_datasets['train'] = raw_datasets['train'].cast_column('intent', ClassLabel(names=intent_names))

# Stratify tries to split evenly across classes (or ClassLabels)
raw_datasets = raw_datasets['train'].train_test_split(test_size=0.4, seed=42, shuffle=True, stratify_by_column='intent')
raw_datasets

Casting the dataset:   0%|          | 0/1001 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['query', 'intent'],
        num_rows: 600
    })
    test: Dataset({
        features: ['query', 'intent'],
        num_rows: 401
    })
})

In [None]:
# Carve the held-out 'test' rows 40/60. Only the larger (60%) part, stored under
# the 'test' key of the result, is split further into validation and test sets.
held_out = raw_datasets['test']
eval_dataset = held_out.train_test_split(
    test_size=0.6,
    seed=42,
    shuffle=True,
    stratify_by_column='intent',
)
eval_dataset

DatasetDict({
    train: Dataset({
        features: ['query', 'intent'],
        num_rows: 160
    })
    test: Dataset({
        features: ['query', 'intent'],
        num_rows: 241
    })
})

In [None]:
# Halve the 'test' part of the previous split, stratified by intent:
# one half becomes the validation rows, the other half the final test rows
remaining = eval_dataset['test']
eval_dataset = remaining.train_test_split(
    test_size=0.5,
    seed=42,
    shuffle=True,
    stratify_by_column='intent',
)
eval_dataset

DatasetDict({
    train: Dataset({
        features: ['query', 'intent'],
        num_rows: 120
    })
    test: Dataset({
        features: ['query', 'intent'],
        num_rows: 121
    })
})

In [None]:
# Attach the two halves to raw_datasets: the 'train' half of the last split
# becomes 'validation', and its 'test' half replaces the old 'test' split
raw_datasets['validation'], raw_datasets['test'] = eval_dataset['train'], eval_dataset['test']
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['query', 'intent'],
        num_rows: 600
    })
    test: Dataset({
        features: ['query', 'intent'],
        num_rows: 121
    })
    validation: Dataset({
        features: ['query', 'intent'],
        num_rows: 120
    })
})

In [None]:
# Define checkpoint and tokenizer.
# The same checkpoint name is reused later to load the classification model,
# so the tokenizer and the model weights come from the same pretrained source.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    """Tokenize the "query" field of a (batched) example with the notebook's tokenizer.

    Truncation is enabled; no padding argument is passed here, so padding is left
    to whatever collates the batches later.
    """
    queries = example["query"]
    return tokenizer(queries, truncation=True)

# Tokenize every split in batches, then build the collator that pads the
# variable-length examples when they are grouped into a batch
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Map:   0%|          | 0/121 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

In [None]:
# Display tokenized_datasets
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['query', 'intent', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 600
    })
    test: Dataset({
        features: ['query', 'intent', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 121
    })
    validation: Dataset({
        features: ['query', 'intent', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 120
    })
})

In [None]:
# Display a sample from train dataset
raw_datasets["train"][1], tokenized_datasets["train"][1]

({'query': 'when does VX7375 arrive', 'intent': 2},
 {'query': 'when does VX7375 arrive',
  'intent': 2,
  'input_ids': [101, 2043, 2515, 1058, 2595, 2581, 24434, 2629, 7180, 102],
  'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]})

In [None]:
# Display a sample from validation dataset
raw_datasets["validation"][1], tokenized_datasets["validation"][1]

({'query': 'What floor is the bag carousel at?', 'intent': 1},
 {'query': 'What floor is the bag carousel at?',
  'intent': 1,
  'input_ids': [101, 2054, 2723, 2003, 1996, 4524, 27628, 2012, 1029, 102],
  'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]})

In [None]:
# Display a sample from test dataset
raw_datasets['test'][0], tokenized_datasets['test'][0]

({'query': 'give the status of ha 4929', 'intent': 2},
 {'query': 'give the status of ha 4929',
  'intent': 2,
  'input_ids': [101, 2507, 1996, 3570, 1997, 5292, 4749, 24594, 102],
  'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]})

In [None]:
# Drop the raw text column (the model only consumes token ids) and rename the
# ClassLabel column to 'labels'
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["query"])
    .rename_column("intent", "labels")
)
# Return torch tensors when rows are accessed
tokenized_datasets.set_format("torch")
# Display the resulting column names of each split
tuple(tokenized_datasets[split].column_names for split in ("train", "validation", "test"))

(['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
 ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
 ['labels', 'input_ids', 'token_type_ids', 'attention_mask'])

In [None]:
# Build PyTorch dataloaders for the train and validation splits
from torch.utils.data import DataLoader

# Both loaders share the batch size and the padding collator; only training shuffles
loader_kwargs = dict(batch_size=16, collate_fn=data_collator)
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, **loader_kwargs)
eval_dataloader = DataLoader(tokenized_datasets["validation"], **loader_kwargs)
len(train_dataloader), len(eval_dataloader)

(38, 8)

In [None]:
# Pull a single batch from the training dataloader and report each tensor's shape
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

{'labels': torch.Size([16]),
 'input_ids': torch.Size([16, 16]),
 'token_type_ids': torch.Size([16, 16]),
 'attention_mask': torch.Size([16, 16])}

In [None]:
# Pull the first batch from the validation dataloader and report each tensor's shape
batch = next(iter(eval_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

{'labels': torch.Size([16]),
 'input_ids': torch.Size([16, 16]),
 'token_type_ids': torch.Size([16, 16]),
 'attention_mask': torch.Size([16, 16])}

In [None]:
# Define a model
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Take the class names from the dataset's ClassLabel feature instead of hard-coding
# the number of intents, and store both mappings in the model config so that saved
# checkpoints report intent names rather than generic LABEL_n placeholders.
intent_names = raw_datasets["train"].features["intent"].names
id2label = dict(enumerate(intent_names))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(intent_names),  # one output per intent
    id2label=id2label,
    label2id=label2id,
    problem_type="single_label_classification",
)
model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
"""
The batch variable is a Python dictionary containing various inputs required by the model.

The **batch syntax unpacks this dictionary, treating each key-value pair as a keyword argument to be
passed to the model's forward method (which is implicitly called when you call model(...) directly).

The double asterisk (**) in outputs = model(**batch) is the dictionary unpacking operator.
"""
# Display batch loss parameter and logits shape.
# `batch` is whatever batch was fetched last; since it includes a 'labels' entry,
# the returned output carries a loss alongside the logits.
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(0.9918, grad_fn=<NllLossBackward0>) torch.Size([16, 3])


In [None]:
# Display the batch keys
print(list(batch.keys()))
# Display all batch data
batch

['labels', 'input_ids', 'token_type_ids', 'attention_mask']


{'labels': tensor([0, 1, 2, 2, 1, 2, 0, 1, 0, 0, 2, 2, 2, 0, 0, 0]), 'input_ids': tensor([[  101,  4638,  2065,  1045,  2064,  4875,  2188,  2279,  2733, 20967,
          1011, 28492,  2006,  7397,   102,     0],
        [  101,  2054,  2723,  2003,  1996,  4524, 27628,  2012,  1029,   102,
             0,     0,     0,     0,     0,     0],
        [  101,  2003,  1058,  2595, 28906,  2509,  2397,  1029,   102,     0,
             0,     0,     0,     0,     0,     0],
        [  101,  2038, 15797,  3462,  2620, 27531,  2620,  5407,  1999,   102,
             0,     0,     0,     0,     0,     0],
        [  101,  2129,  2521,  2003,  1996,  2149,  4796,   102,     0,     0,
             0,     0,     0,     0,     0,     0],
        [  101,  2106,  2035, 13910,  2937,  2102,  2250,  5818, 17465,  2272,
          1999,   102,     0,     0,     0,     0],
        [  101,  2424,  1037, 13118,  1011,  1052,  5638,  4440,  1019,  1011,
          1017,  1052,  1012,  1049,  1012,   102],
 

In [None]:
# Display all of unpacked batch 'outputs'
outputs

SequenceClassifierOutput(loss=tensor(0.9918, grad_fn=<NllLossBackward0>), logits=tensor([[-1.7511e-01, -8.6784e-01,  6.7238e-02],
        [-3.8049e-01, -9.7662e-01,  1.1470e-03],
        [-3.9503e-01, -1.1075e+00, -9.1438e-02],
        [-3.2383e-01, -5.9861e-01,  1.7746e-01],
        [-3.1069e-01, -5.9848e-01,  1.8766e-01],
        [-2.8466e-01, -8.4127e-01,  1.5803e-02],
        [-3.3826e-01, -1.0556e+00, -1.3491e-01],
        [-3.1407e-01, -6.0218e-01,  1.8838e-01],
        [-3.2804e-01, -1.1861e+00, -8.8705e-02],
        [-1.6174e-01, -1.0207e+00, -2.9589e-01],
        [-3.6478e-01, -7.6751e-01,  1.1319e-01],
        [-2.9808e-01, -5.6135e-01,  1.3090e-01],
        [-3.4530e-01, -9.4181e-01, -2.2653e-02],
        [-3.1895e-01, -1.1402e+00, -1.0373e-01],
        [-1.8144e-01, -1.0199e+00, -1.8427e-01],
        [-2.9808e-01, -1.0485e+00, -1.6419e-01]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

### Use HuggingFace Trainer

In [None]:
""" Set up training arguments """
import numpy as np
from evaluate import load
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score

# 1. Load accuracy and f1 metrics (via `load` from the evaluate library);
#    compute_metrics calls .compute() on these two objects
acc_metric = load("accuracy")
f1_metric = load("f1")

# 2. Define a compute_metrics function
def compute_metrics(eval_pred):
    """Turn a (logits, labels) pair into the metrics dictionary the Trainer expects.

    Args:
        eval_pred: a two-item sequence of model logits and reference label ids.

    Returns:
        dict with "accuracy" and weighted-average "f1".
    """
    scores, references = eval_pred
    # The predicted class is the position of the largest logit in each row
    predicted_ids = scores.argmax(axis=-1)
    accuracy_result = acc_metric.compute(predictions=predicted_ids, references=references)
    f1_result = f1_metric.compute(predictions=predicted_ids, references=references, average="weighted")
    return dict(accuracy=accuracy_result["accuracy"], f1=f1_result["f1"])

# Instantiate TrainingArguments without overrides to inspect the library defaults
training_args = TrainingArguments()

# Total Training Steps = (Dataset Size / (per_device_train_batch_size * gradient_accumulation_steps)) * num_train_epoch
num_train_rows = len(tokenized_datasets["train"])
rows_per_step = training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps
total_training_steps = num_train_rows / rows_per_step * training_args.num_train_epochs

# Show the inputs of the estimate followed by a few other default hyperparameters
[
    num_train_rows,
    training_args.per_device_train_batch_size,
    training_args.gradient_accumulation_steps,
    training_args.num_train_epochs,
    total_training_steps,
    training_args.learning_rate,
    training_args.weight_decay,
    training_args.warmup_ratio,
]

[600, 8, 1, 3.0, 225.0, 5e-05, 0.0, 0.0]

In [None]:
# Steps per epoch must be rounded UP, not to the nearest integer: when the dataset
# size is not a multiple of the batch size, the final smaller batch is still a step
# (the dataloader keeps it because dataloader_drop_last is False).
import math

batches_per_epoch = math.ceil(len(tokenized_datasets["train"]) / training_args.per_device_train_batch_size)
# With gradient accumulation, a trailing partial group of batches still yields an optimizer step
steps_per_epoch = math.ceil(batches_per_epoch / training_args.gradient_accumulation_steps)
num_epochs = training_args.num_train_epochs
total_training_steps = math.ceil(steps_per_epoch * num_epochs)

print(f"Steps per epoch (rounded): {steps_per_epoch} * {num_epochs} train epochs = {total_training_steps} training steps")

Steps per epoch (rounded): 75.0 * 3.0 train epochs = 225.0 training steps


In [None]:
# 3. Define training arguments
training_args = TrainingArguments(
    output_dir="./model_results",
    run_name='airline-chatbot-model',
    eval_strategy="steps",
    # Evaluate on the validation set every 15 steps
    eval_steps=15,
    # Save a model checkpoint every 75 steps (one per epoch at 600 rows / batch size 8)
    save_steps=75,
    # Log metrics every 5 steps
    logging_steps=5,
    learning_rate=2e-5,
    # Weight decay is a regularization technique that penalizes large weights,
    # encouraging the model to learn smaller, more generalized weights.
    # NOTE(review): the original comment equated this with adding an L2 term to the
    # loss; with an AdamW-style optimizer the decay is presumably applied directly to
    # the weights instead — confirm against the optimizer actually in use.
    weight_decay=0.01,
    # Warmup: the learning rate gradually increases from a very small value (close to
    # zero) up to learning_rate over the first 10% of training steps. This stabilizes
    # training and helps escape poor initializations.
    warmup_ratio=0.1,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Send logs to Weights & Biases
    report_to="wandb",
)

training_args

TrainingArguments(
_n_gpu=0,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=15,
eval_strategy=IntervalStrategy.STEPS,
eval_use_gather_object=False,


In [None]:
# 4. Define a trainer
from transformers import Trainer

# Wire the model, hyperparameters, datasets, padding collator and metric function together
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer

<transformers.trainer.Trainer at 0x7a558b557b50>

In [None]:
# 5. Train the model
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1
15,0.945,0.962083,0.608333,0.531134
30,0.7786,0.690853,0.866667,0.86354
45,0.5094,0.482219,0.925,0.923688
60,0.3428,0.312054,0.966667,0.96655
75,0.2675,0.176116,0.975,0.974886
90,0.1509,0.09973,0.983333,0.983289
105,0.0672,0.059976,0.991667,0.991673
120,0.0407,0.032811,1.0,1.0
135,0.0242,0.025377,1.0,1.0
150,0.0176,0.016389,1.0,1.0


TrainOutput(global_step=225, training_loss=0.23669729444715712, metrics={'train_runtime': 835.6267, 'train_samples_per_second': 2.154, 'train_steps_per_second': 0.269, 'total_flos': 14027234082624.0, 'train_loss': 0.23669729444715712, 'epoch': 3.0})

In [None]:
!ls

model_results  sample_data  wandb


In [None]:
!ls -lh ./model_results

total 12K
drwxr-xr-x 2 root root 4.0K Jul 14 17:17 checkpoint-150
drwxr-xr-x 2 root root 4.0K Jul 14 17:21 checkpoint-225
drwxr-xr-x 2 root root 4.0K Jul 14 17:12 checkpoint-75


In [None]:
# Save the trained model into the same directory that already holds the
# checkpoint-* folders, then list the directory to confirm what was written
trainer.save_model("./model_results")
!ls -lh ./model_results

total 419M
drwxr-xr-x 2 root root 4.0K Jul 14 17:17 checkpoint-150
drwxr-xr-x 2 root root 4.0K Jul 14 17:21 checkpoint-225
drwxr-xr-x 2 root root 4.0K Jul 14 17:12 checkpoint-75
-rw-r--r-- 1 root root  841 Jul 14 17:22 config.json
-rw-r--r-- 1 root root 418M Jul 14 17:22 model.safetensors
-rw-r--r-- 1 root root  125 Jul 14 17:22 special_tokens_map.json
-rw-r--r-- 1 root root 1.2K Jul 14 17:22 tokenizer_config.json
-rw-r--r-- 1 root root 695K Jul 14 17:22 tokenizer.json
-rw-r--r-- 1 root root 5.2K Jul 14 17:22 training_args.bin
-rw-r--r-- 1 root root 227K Jul 14 17:22 vocab.txt


In [None]:
!ls ./model_results/checkpoint-225/

config.json	   scheduler.pt		    trainer_state.json
model.safetensors  special_tokens_map.json  training_args.bin
optimizer.pt	   tokenizer_config.json    vocab.txt
rng_state.pth	   tokenizer.json


In [None]:
# Save the model locally to my machine (Colab-only: uses google.colab.files)
from google.colab import files

# NOTE(review): -r archives the whole directory, including every checkpoint-*
# folder with its optimizer/scheduler state, not just the final model files —
# confirm that the intermediate checkpoints are really wanted in the download.
!zip -r model_results.zip ./model_results
files.download('model_results.zip')

  adding: model_results/ (stored 0%)
  adding: model_results/checkpoint-75/ (stored 0%)
  adding: model_results/checkpoint-75/rng_state.pth (deflated 24%)
  adding: model_results/checkpoint-75/trainer_state.json (deflated 75%)
  adding: model_results/checkpoint-75/model.safetensors (deflated 7%)
  adding: model_results/checkpoint-75/scheduler.pt (deflated 56%)
  adding: model_results/checkpoint-75/tokenizer.json (deflated 71%)
  adding: model_results/checkpoint-75/config.json (deflated 51%)
  adding: model_results/checkpoint-75/tokenizer_config.json (deflated 75%)
  adding: model_results/checkpoint-75/vocab.txt (deflated 53%)
  adding: model_results/checkpoint-75/special_tokens_map.json (deflated 42%)
  adding: model_results/checkpoint-75/training_args.bin (deflated 51%)
  adding: model_results/checkpoint-75/optimizer.pt (deflated 28%)
  adding: model_results/model.safetensors (deflated 7%)
  adding: model_results/checkpoint-150/ (stored 0%)
  adding: model_results/checkpoint-150/rng_s

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!ls wandb

debug-internal.log  debug.log  latest-run  run-20250714_165041-ill9r3ep


In [None]:
# Save metrics to wandb
# NOTE(review): the Trainer was configured with report_to="wandb", so training and
# evaluation metrics were already being logged during trainer.train().
# NOTE(review): presumably this wandb.init() closes the Trainer's run and opens a
# new one — confirm that a second run is intended.
# NOTE(review): the glob ".wanddb/*" does not match the local run directory, which
# `!ls` shows is named "wandb"; the call returns an empty list. Looks like a typo —
# confirm which files (if any) should be uploaded.
import wandb

wandb.init(project="huggingface")
wandb.save(".wanddb/*")

0,1
eval/accuracy,▁▃▂▆▇▇███████████
eval/f1,▁▃▁▆▇████████████
eval/loss,█▆█▆▄▃▂▂▁▁▁▁▁▁▁▁▁
eval/runtime,▁▁▂█▂▂▂▁▂▁▁▁▂▂▁▁▁
eval/samples_per_second,██▆▁▆▅▆▇▆▇█▇▆▆▇██
eval/steps_per_second,██▆▁▆▅▆▇▆▇█▇▆▆▇██
train/epoch,▁▁▁▂▂▁▁▁▁▁▂▂▂▂▃▃▄▄▄▄▄▄▄▅▅▅▆▆▆▆▆▆▇▇▇█████
train/global_step,▁▁▁▂▂▁▁▁▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇█████
train/grad_norm,▅▃▄▄▄▆▅▅▄▄▄▅▅█▅▃▄▆▂▆▂▂▂▂▂▅▂▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,▂▄▇██▇▂▄▅▇██▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▃▃▃▃▃▂▂▂▂▁▁

0,1
eval/accuracy,1.0
eval/f1,1.0
eval/loss,0.01
eval/runtime,5.998
eval/samples_per_second,20.007
eval/steps_per_second,2.501
total_flos,14027234082624.0
train/epoch,3.0
train/global_step,225.0
train/grad_norm,0.17075




[]

In [None]:
# This doesn't seem to work but wandb already saves the data under project-name/Runs/model_results
# NOTE(review): it returns [] because no path matches ".wanddb/*" (the local
# directory is "wandb") — confirm the intended glob or remove this cell.
wandb.save(".wanddb/*")



[]

In [None]:
# Score the fine-tuned model on the held-out test split
results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
results

{'eval_loss': 0.012205369770526886,
 'eval_accuracy': 1.0,
 'eval_f1': 1.0,
 'eval_runtime': 15.6673,
 'eval_samples_per_second': 7.723,
 'eval_steps_per_second': 1.021,
 'epoch': 3.0}

In [None]:
# Run inference on the test split and check the shapes of the logits and the labels
predictions = trainer.predict(test_dataset=tokenized_datasets["test"])
logits_shape, labels_shape = predictions.predictions.shape, predictions.label_ids.shape
print(logits_shape, labels_shape)

(121, 3) (121,)


In [None]:
# Compare predicted class ids with the ground truth and show the test metrics
import numpy as np

# The predicted class of each row is the index of its largest logit
predicted_labels = np.argmax(predictions.predictions, axis=-1)
ground_truth_labels, metrics = predictions.label_ids, predictions.metrics

for caption, value in (
    ("Predicted labels:", predicted_labels),
    ("Ground truth labels:", ground_truth_labels),
    ("Metrics:", metrics),
):
    print(caption, value)

Predicted labels: [2 0 2 1 0 0 1 0 0 1 2 0 0 2 2 2 0 0 1 1 1 0 1 1 2 0 2 1 0 2 0 0 1 0 1 0 1
 2 2 2 0 0 1 1 2 0 0 0 0 1 1 1 2 2 2 2 2 2 2 1 2 0 0 0 0 1 0 1 0 0 0 0 2 0
 2 1 1 2 1 0 0 0 2 1 0 2 2 1 1 0 2 0 1 0 2 2 0 2 1 0 2 2 2 2 0 1 0 2 1 1 0
 1 0 2 2 2 0 1 1 0 2]
Ground truth labels: [2 0 2 1 0 0 1 0 0 1 2 0 0 2 2 2 0 0 1 1 1 0 1 1 2 0 2 1 0 2 0 0 1 0 1 0 1
 2 2 2 0 0 1 1 2 0 0 0 0 1 1 1 2 2 2 2 2 2 2 1 2 0 0 0 0 1 0 1 0 0 0 0 2 0
 2 1 1 2 1 0 0 0 2 1 0 2 2 1 1 0 2 0 1 0 2 2 0 2 1 0 2 2 2 2 0 1 0 2 1 1 0
 1 0 2 2 2 0 1 1 0 2]
Metrics: {'test_loss': 0.012205369770526886, 'test_accuracy': 1.0, 'test_f1': 1.0, 'test_runtime': 8.3246, 'test_samples_per_second': 14.535, 'test_steps_per_second': 1.922}


### Test model with classification pipeline

In [None]:
# Create a classification pipeline
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# Define ID to label mapping, plus its inverse. Passing both keeps the two mappings
# in the loaded config consistent; overriding id2label alone would leave label2id
# with whatever names the checkpoint was saved with.
id2label_mapping = {0: 'booking', 1: 'general', 2:'status'}
label2id_mapping = {label: idx for idx, label in id2label_mapping.items()}

model_path = "./model_results/checkpoint-225/"

# Reload the fine-tuned weights and tokenizer from the final training checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    id2label=id2label_mapping,
    label2id=label2id_mapping,
)
tokenizer = AutoTokenizer.from_pretrained(model_path)

question_classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

question = "How can I book a flight?"
question_classifier(question)

Device set to use cpu


[{'label': 'general', 'score': 0.9846746921539307}]

In [None]:
question = "When does UA 504 arrive?"
question_classifier(question)

[{'label': 'status', 'score': 0.9943144917488098}]

In [None]:
question_classifier("Book me a flight for tomorrow night LAX-SEA")

[{'label': 'booking', 'score': 0.9957055449485779}]

In [None]:
question_classifier("what is my name")

[{'label': 'general', 'score': 0.9770390391349792}]

In [None]:
question_classifier("reserve dallas chicago next week")

[{'label': 'booking', 'score': 0.9952319264411926}]

In [None]:
question_classifier("how late is united 2")

[{'label': 'status', 'score': 0.9124318361282349}]