In [None]:
# Install HuggingFace libraries
!pip install transformers datasets evaluate
!pip install -U datasets  # this prevents local cache errors with datasets

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5
Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-4.0.0-py3-none-any.whl (494 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uni

### Load dataset

In [None]:
# Pull the complete dataset from the Hugging Face Hub. load_dataset merges every
# file under the train/ directory into a single dataset that shares one header
# (the per-file headers are dropped).
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset(
    "lanehale1/airline-queries",
    data_dir="train",
    cache_dir=None,
)
raw_datasets

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


query_intent_booking.csv: 0.00B [00:00, ?B/s]

query_intent_general.csv: 0.00B [00:00, ?B/s]

query_intent_status.csv: 0.00B [00:00, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['query', 'intent'],
        num_rows: 1001
    })
})

In [None]:
# Display the different classes ('intents'): the first row, the last row, and two
# slices taken where the 'intent' value changes (booking -> general -> status)
raw_datasets['train'][0], raw_datasets['train'][389:391], raw_datasets['train'][673:675], raw_datasets['train'][1000]

({'query': 'Find ORD-ORH seats for 11-14', 'intent': 'booking'},
 {'query': ['Yakima-BIH book the 2nd of Dec flight',
   'What luggage is allowed on my flight?'],
  'intent': ['booking', 'general']},
 {'query': ['when does boarding start?', 'has f9 flight 8170 gotten in'],
  'intent': ['general', 'status']},
 {'query': 'give the status of mq1569', 'intent': 'status'})

In [None]:
# Convert the 'intent' column to the datasets ClassLabel type (needed for
# stratified splitting), then split 60/40 for 600 training rows (~200 per class)
from datasets import ClassLabel, Value

# Sort the class names so label ids are assigned alphabetically
# (0=booking, 1=general, 2=status) and do not depend on the order in which
# rows happen to appear in the source CSV files.
intent_names = sorted(raw_datasets['train'].unique('intent'))
raw_datasets['train'] = raw_datasets['train'].cast_column('intent', ClassLabel(names=intent_names))

# stratify_by_column keeps the class proportions the same in both splits.
# NOTE: this rebinds raw_datasets to the split result, so re-running this cell
# without reloading the dataset would split the 600-row train set again.
raw_datasets = raw_datasets['train'].train_test_split(test_size=0.4, seed=42, shuffle=True, stratify_by_column='intent')
raw_datasets

Casting the dataset:   0%|          | 0/1001 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['query', 'intent'],
        num_rows: 600
    })
    test: Dataset({
        features: ['query', 'intent'],
        num_rows: 401
    })
})

In [None]:
# Split the 401-row held-out set 40/60. The 60% part (241 rows, under the 'test'
# key) is what gets divided into validation and test sets in the next cell; the
# 160-row 'train' part is not used again below.
eval_dataset = raw_datasets['test'].train_test_split(
    test_size=0.6,
    seed=42,
    shuffle=True,
    stratify_by_column='intent',
)
eval_dataset

DatasetDict({
    train: Dataset({
        features: ['query', 'intent'],
        num_rows: 160
    })
    test: Dataset({
        features: ['query', 'intent'],
        num_rows: 241
    })
})

In [None]:
# Halve the 241 held-out rows: 120 validation rows (returned under the 'train'
# key) and 121 test rows (under the 'test' key), roughly 40 per class in each
holdout_rows = eval_dataset['test']
eval_dataset = holdout_rows.train_test_split(
    test_size=0.5,
    seed=42,
    shuffle=True,
    stratify_by_column='intent',
)
eval_dataset

DatasetDict({
    train: Dataset({
        features: ['query', 'intent'],
        num_rows: 120
    })
    test: Dataset({
        features: ['query', 'intent'],
        num_rows: 121
    })
})

In [None]:
# Store the final validation and test splits in raw_datasets. The 'train' half
# of eval_dataset becomes 'validation'; its 'test' half replaces the old
# 401-row 'test' entry.
raw_datasets.update(
    validation=eval_dataset['train'],
    test=eval_dataset['test'],
)
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['query', 'intent'],
        num_rows: 600
    })
    test: Dataset({
        features: ['query', 'intent'],
        num_rows: 121
    })
    validation: Dataset({
        features: ['query', 'intent'],
        num_rows: 120
    })
})

In [None]:
# Choose the pretrained checkpoint and load its matching tokenizer
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the 'query' text of the given rows, truncating over-long inputs.

    No padding is applied here; the data collator pads each batch instead.
    """
    encoded = tokenizer(example["query"], truncation=True)
    return encoded


# The collator pads the examples of a batch to a common length at load time
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Tokenize every split; batched=True hands the function many rows per call
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Map:   0%|          | 0/121 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

In [None]:
# Display tokenized_datasets: each split keeps 'query' and 'intent' and gains the
# tokenizer outputs (input_ids, token_type_ids, attention_mask)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['query', 'intent', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 600
    })
    test: Dataset({
        features: ['query', 'intent', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 121
    })
    validation: Dataset({
        features: ['query', 'intent', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 120
    })
})

In [None]:
# Display a sample from train dataset: the raw row next to its tokenized form
# ('intent' is now an integer class id)
raw_datasets["train"][1], tokenized_datasets["train"][1]

({'query': 'when does VX7375 arrive', 'intent': 2},
 {'query': 'when does VX7375 arrive',
  'intent': 2,
  'input_ids': [101, 2043, 2515, 1058, 2595, 2581, 24434, 2629, 7180, 102],
  'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]})

In [None]:
# Display a sample from validation dataset: the raw row next to its tokenized form
raw_datasets["validation"][1], tokenized_datasets["validation"][1]

({'query': 'What floor is the bag carousel at?', 'intent': 1},
 {'query': 'What floor is the bag carousel at?',
  'intent': 1,
  'input_ids': [101, 2054, 2723, 2003, 1996, 4524, 27628, 2012, 1029, 102],
  'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]})

In [None]:
# Display a sample from test dataset: the raw row next to its tokenized form
raw_datasets['test'][0], tokenized_datasets['test'][0]

({'query': 'give the status of ha 4929', 'intent': 2},
 {'query': 'give the status of ha 4929',
  'intent': 2,
  'input_ids': [101, 2507, 1996, 3570, 1997, 5292, 4749, 24594, 102],
  'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]})

In [None]:
# Prepare the splits for the model: drop the raw text column and rename the
# ClassLabel column to 'labels'
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["query"])
    .rename_column("intent", "labels")
)
# Return torch tensors when rows are accessed
tokenized_datasets.set_format("torch")
# Show the remaining columns of every split
tuple(tokenized_datasets[split].column_names for split in ("train", "validation", "test"))

(['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
 ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
 ['labels', 'input_ids', 'token_type_ids', 'attention_mask'])

In [None]:
# Build PyTorch dataloaders for the train and validation splits
from torch.utils.data import DataLoader

# Settings shared by both loaders: 16 rows per batch, padded by the collator
loader_kwargs = dict(batch_size=16, collate_fn=data_collator)

train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, **loader_kwargs)
eval_dataloader = DataLoader(tokenized_datasets["validation"], **loader_kwargs)

# Number of batches in each loader
len(train_dataloader), len(eval_dataloader)

(38, 8)

In [None]:
# Take one batch from the training loader and show the shape of each tensor
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

{'labels': torch.Size([16]),
 'input_ids': torch.Size([16, 16]),
 'token_type_ids': torch.Size([16, 16]),
 'attention_mask': torch.Size([16, 16])}

In [None]:
# Take the first batch from the validation loader and show the shape of each tensor
batch = next(iter(eval_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

{'labels': torch.Size([16]),
 'input_ids': torch.Size([16, 16]),
 'token_type_ids': torch.Size([16, 16]),
 'attention_mask': torch.Size([16, 16])}

In [None]:
# Define a model with a classification head sized to the dataset's classes
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Read the class names from the ClassLabel feature instead of hard-coding the
# count, and store the id <-> name mapping in the model config so it is saved
# with the model and its checkpoints.
label_names = tokenized_datasets["train"].features["labels"].names
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(label_names),  # 3 intents: booking, general, status
    id2label=id2label,
    label2id=label2id,
    problem_type="single_label_classification",
)
model

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
"""
The batch variable is a Python dictionary containing various inputs required by the model.

The **batch syntax unpacks this dictionary, treating each key-value pair as a keyword argument to be
passed to the model's forward method (which is implicitly called when you call model(...) directly).

The double asterisk (**) in outputs = model(**batch) is the dictionary unpacking operator.
"""
# Display batch loss parameter and logits shape
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(1.0098, grad_fn=<NllLossBackward0>) torch.Size([16, 3])


In [None]:
# Display the batch keys (these are the keyword arguments passed by model(**batch))
print(list(batch.keys()))
# Display all batch data: the label ids plus the zero-padded input tensors
batch

['labels', 'input_ids', 'token_type_ids', 'attention_mask']


{'labels': tensor([0, 1, 2, 2, 1, 2, 0, 1, 0, 0, 2, 2, 2, 0, 0, 0]), 'input_ids': tensor([[  101,  4638,  2065,  1045,  2064,  4875,  2188,  2279,  2733, 20967,
          1011, 28492,  2006,  7397,   102,     0],
        [  101,  2054,  2723,  2003,  1996,  4524, 27628,  2012,  1029,   102,
             0,     0,     0,     0,     0,     0],
        [  101,  2003,  1058,  2595, 28906,  2509,  2397,  1029,   102,     0,
             0,     0,     0,     0,     0,     0],
        [  101,  2038, 15797,  3462,  2620, 27531,  2620,  5407,  1999,   102,
             0,     0,     0,     0,     0,     0],
        [  101,  2129,  2521,  2003,  1996,  2149,  4796,   102,     0,     0,
             0,     0,     0,     0,     0,     0],
        [  101,  2106,  2035, 13910,  2937,  2102,  2250,  5818, 17465,  2272,
          1999,   102,     0,     0,     0,     0],
        [  101,  2424,  1037, 13118,  1011,  1052,  5638,  4440,  1019,  1011,
          1017,  1052,  1012,  1049,  1012,   102],
 

In [None]:
# Display the full model output for the batch: the loss and the per-class logits
# (one row of 3 scores per example)
outputs

SequenceClassifierOutput(loss=tensor(1.0098, grad_fn=<NllLossBackward0>), logits=tensor([[ 2.5380e-01,  9.4444e-05,  1.6534e-01],
        [ 2.8868e-01,  3.1435e-02,  1.7462e-01],
        [ 2.7544e-01, -3.6894e-02,  2.9336e-01],
        [ 1.2432e-01,  2.1422e-01,  9.9742e-02],
        [ 5.6704e-02,  2.6866e-01,  9.2670e-03],
        [ 3.3641e-01,  2.9974e-02,  1.7907e-01],
        [ 3.3005e-01, -8.3412e-02,  2.6386e-01],
        [-4.1243e-02,  3.0942e-01,  1.3065e-02],
        [ 3.4482e-01, -1.2397e-01,  3.2932e-01],
        [ 4.0391e-01, -1.5650e-02,  1.5558e-01],
        [ 1.5903e-01,  8.1589e-02,  2.0849e-01],
        [ 1.3801e-01,  2.1372e-01,  6.8610e-03],
        [ 2.3050e-01, -1.8385e-02,  2.4648e-01],
        [ 4.3531e-01, -3.2701e-02,  2.7018e-01],
        [ 3.7225e-01, -2.1150e-02,  2.3161e-01],
        [ 3.8672e-01, -6.6477e-02,  2.6344e-01]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

### Train model with HuggingFace Trainer

In [None]:
import os

!pip install TensorBoard

os.environ['WANDB_PROJECT'] = 'airline-chatbot'  # my W&B project name
os.environ["WANDB_LOG_MODEL"] = "checkpoint"     # log all model checkpoints

path_var = os.environ.get('WANDB_PROJECT')
print(path_var)
print(os.environ['WANDB_LOG_MODEL'])

airline-chatbot
checkpoint


In [None]:
""" Set up training arguments """
import numpy as np
from evaluate import load
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score

# 1. Load accuracy and f1 metrics
acc_metric = load("accuracy")
f1_metric = load("f1")

# 2. Define a compute_metrics function
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: a (logits, labels) pair; the predicted class of each row is
            the index of its largest logit along the last axis.

    Returns:
        A dictionary with "accuracy" and "f1" entries, the form the Trainer expects.
    """
    logits, labels = eval_pred
    predicted_ids = logits.argmax(axis=-1)
    scores = {}
    scores["accuracy"] = acc_metric.compute(predictions=predicted_ids, references=labels)["accuracy"]
    # Weighted average: per-class F1 weighted by the number of true rows per class
    scores["f1"] = f1_metric.compute(predictions=predicted_ids, references=labels, average="weighted")["f1"]
    return scores

# Instantiate TrainingArguments with its defaults to decide which values to override
training_args = TrainingArguments()

# Total training steps =
#   dataset size / (per_device_train_batch_size * gradient_accumulation_steps) * num_train_epochs
effective_batch_size = (
    training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps
)
total_training_steps = (
    len(tokenized_datasets["train"]) / effective_batch_size * training_args.num_train_epochs
)

# Show the dataset size, the default values, and the resulting step count
[
    len(tokenized_datasets["train"]),
    training_args.per_device_train_batch_size,
    training_args.gradient_accumulation_steps,
    training_args.num_train_epochs,
    total_training_steps,
    training_args.learning_rate,
    training_args.weight_decay,
    training_args.warmup_ratio,
]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

[600, 8, 1, 3.0, 225.0, 5e-05, 0.0, 0.0]

In [None]:
# Estimate the number of steps per epoch. When the dataset size is not a multiple
# of the batch size, the final partial batch still counts as a step, so round UP:
# round() would under-count whenever the remainder is less than half a batch.
# (Exact for gradient_accumulation_steps == 1, which is the value used here.)
import math

effective_batch_size = (
    training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps
)
steps_per_epoch = math.ceil(len(tokenized_datasets["train"]) / effective_batch_size)
num_epochs = training_args.num_train_epochs
total_training_steps = steps_per_epoch * num_epochs

print(f"Steps per epoch (rounded): {steps_per_epoch} * {num_epochs} train epochs = {total_training_steps} training steps")

Steps per epoch (rounded): 75.0 * 3.0 train epochs = 225.0 training steps


In [None]:
# 3. Define training arguments
training_args = TrainingArguments(
    # --- output and logging ---
    output_dir="./results",
    run_name='bert-model',
    report_to=["tensorboard", "wandb"],  # Send logs to Weights & Biases and /runs folder for TensorBoard
    logging_steps=5,    # Log metrics every 5 steps
    # --- optimization ---
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    # Weight decay is a regularization technique that penalizes large weights,
    # encouraging the model to learn smaller, more generalized weights.
    weight_decay=0.01,
    # Warmup gradually raises the learning rate from a very small value to the
    # configured learning rate over the first part of training (here the first
    # 10% of steps). It stabilizes training and reduces early overfitting.
    warmup_ratio=0.1,
    # --- evaluation and checkpointing ---
    eval_strategy="steps",
    eval_steps=15,
    # Save a model checkpoint every 75 steps. 75 is a multiple of eval_steps,
    # which load_best_model_at_end requires.
    save_steps=75,
    load_best_model_at_end=True,
)

training_args

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=15,
eval_strategy=IntervalStrategy.STEPS,
eval_use_gather_object=False,


In [None]:
# 4. Define a trainer
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,        # pads each batch
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,    # accuracy and weighted F1 at each evaluation
)

trainer

<transformers.trainer.Trainer at 0x7e7a51f2b010>

In [None]:
# 5. Train the model
# Runs fine-tuning with the arguments configured above; the returned TrainOutput
# (global step, average training loss, runtime metrics) is shown as the cell result.
trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mlanehale1[0m ([33mlanehale1-ai[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy,F1
15,0.9941,0.947295,0.691667,0.686621
30,0.8296,0.714938,0.908333,0.906733
45,0.5168,0.439073,0.975,0.97508
60,0.3013,0.24451,1.0,1.0
75,0.2291,0.130755,1.0,1.0
90,0.0866,0.063672,1.0,1.0
105,0.0442,0.03412,1.0,1.0
120,0.0289,0.019173,1.0,1.0
135,0.0174,0.013619,1.0,1.0
150,0.0148,0.010596,1.0,1.0


[34m[1mwandb[0m: Adding directory to artifact (./results/checkpoint-75)... Done. 28.0s
[34m[1mwandb[0m: Adding directory to artifact (./results/checkpoint-150)... Done. 6.9s
[34m[1mwandb[0m: Adding directory to artifact (./results/checkpoint-225)... Done. 18.4s


TrainOutput(global_step=225, training_loss=0.22804273613625103, metrics={'train_runtime': 136.3828, 'train_samples_per_second': 13.198, 'train_steps_per_second': 1.65, 'total_flos': 14027234082624.0, 'train_loss': 0.22804273613625103, 'epoch': 3.0})

In [None]:
# List the Trainer output directory: the saved checkpoints and the TensorBoard 'runs' folder
!ls -lh results

total 16K
drwxr-xr-x 2 root root 4.0K Jul 15 22:18 checkpoint-150
drwxr-xr-x 2 root root 4.0K Jul 15 22:19 checkpoint-225
drwxr-xr-x 2 root root 4.0K Jul 15 22:17 checkpoint-75
drwxr-xr-x 3 root root 4.0K Jul 15 22:17 runs


In [None]:
# List the TensorBoard log directories written during training
!ls -lh results/runs

total 4.0K
drwxr-xr-x 2 root root 4.0K Jul 15 22:17 Jul15_22-16-55_82b171da9a61


In [None]:
# Save the best checkpoint
# save_model writes the model currently held by the trainer (plus tokenizer files
# and training_args.bin) to the given folder. Because load_best_model_at_end=True
# was set, this is presumably the best checkpoint rather than the last training state.
trainer.save_model("best_model_bert")
!ls best_model_bert

config.json	   special_tokens_map.json  tokenizer.json     vocab.txt
model.safetensors  tokenizer_config.json    training_args.bin


In [None]:
# Save the model locally to my machine
from google.colab import files

!zip -r best_model_bert.zip best_model_bert
!zip -r bert_chkpt-225.zip results/checkpoint-225
files.download('best_model_bert.zip')
files.download('bert_chkpt-225.zip')

  adding: best_model_bert/ (stored 0%)
  adding: best_model_bert/training_args.bin (deflated 52%)
  adding: best_model_bert/tokenizer_config.json (deflated 75%)
  adding: best_model_bert/tokenizer.json (deflated 71%)
  adding: best_model_bert/model.safetensors (deflated 7%)
  adding: best_model_bert/config.json (deflated 51%)
  adding: best_model_bert/vocab.txt (deflated 53%)
  adding: best_model_bert/special_tokens_map.json (deflated 42%)
  adding: results/checkpoint-225/ (stored 0%)
  adding: results/checkpoint-225/training_args.bin (deflated 52%)
  adding: results/checkpoint-225/rng_state.pth (deflated 25%)
  adding: results/checkpoint-225/trainer_state.json (deflated 80%)
  adding: results/checkpoint-225/tokenizer_config.json (deflated 75%)
  adding: results/checkpoint-225/tokenizer.json (deflated 71%)
  adding: results/checkpoint-225/optimizer.pt (deflated 28%)
  adding: results/checkpoint-225/scheduler.pt (deflated 56%)
  adding: results/checkpoint-225/model.safetensors (deflated

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# List the local Weights & Biases directory (debug logs and per-run folders)
!ls wandb

debug-internal.log  debug.log  latest-run  run-20250715_221736-ohfrqgoc


In [None]:
# Save files to wandb
# Training metrics are already streamed to W&B by the Trainer (report_to
# includes "wandb"); this cell additionally uploads the local W&B debug logs.
import wandb

# Reuse the run opened by the Trainer's W&B integration. Calling wandb.init()
# unconditionally would close that run and start a new, empty one.
if wandb.run is None:
    wandb.init(project="airline-chatbot")

# The local run directory is "wandb"; the earlier ".wanddb/*" glob matched no
# files, so nothing was uploaded. policy="now" uploads each file once.
wandb.save("wandb/*.log", policy="now")

0,1
eval/accuracy,▁▆▇████████████
eval/f1,▁▆▇████████████
eval/loss,█▆▄▃▂▁▁▁▁▁▁▁▁▁▁
eval/runtime,▁▁▁▁▁▁▂▂█▄▁▁▁▁▁
eval/samples_per_second,██▇███▄▅▁▂▇▇▇▇▇
eval/steps_per_second,██▇███▄▅▁▂▇▇▇▇▇
train/epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇█████
train/grad_norm,▃▄▃▃▄▅▄█▃▃▃▂▂▇▂▁▂▁▁▃▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,▂▄▅▇███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁

0,1
eval/accuracy,1.0
eval/f1,1.0
eval/loss,0.00696
eval/runtime,0.2408
eval/samples_per_second,498.435
eval/steps_per_second,62.304
total_flos,14027234082624.0
train/epoch,3.0
train/global_step,225.0
train/grad_norm,0.13818




[]

In [None]:
# Upload the local W&B debug logs once. The run directory is "wandb"; a
# ".wanddb/*" glob matches no files, so wandb.save returns []. The training
# metrics do not depend on this call: wandb already saves the data under
# project-name/Runs/bert_model.
wandb.save("wandb/*.log", policy="now")



[]

In [None]:
# Evaluate on the held-out TEST split (not the validation split used during
# training) and display the resulting metrics
results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
results

{'eval_loss': 0.008250893093645573,
 'eval_accuracy': 1.0,
 'eval_f1': 1.0,
 'eval_runtime': 0.3313,
 'eval_samples_per_second': 365.248,
 'eval_steps_per_second': 48.297,
 'epoch': 3.0}

In [None]:
# Run prediction on the test split, then display predictions, labels, and metrics
import numpy as np

predictions = trainer.predict(tokenized_datasets["test"])
ground_truth_labels = predictions.label_ids
metrics = predictions.metrics
# Predicted class id = index of the largest logit in each row
predicted_labels = np.argmax(predictions.predictions, axis=-1)

print(predictions.predictions.shape, predictions.label_ids.shape)
for caption, value in (
    ("Predicted labels:", predicted_labels),
    ("Ground truth labels:", ground_truth_labels),
    ("Metrics:", metrics),
):
    print(caption, value)

(121, 3) (121,)
Predicted labels: [2 0 2 1 0 0 1 0 0 1 2 0 0 2 2 2 0 0 1 1 1 0 1 1 2 0 2 1 0 2 0 0 1 0 1 0 1
 2 2 2 0 0 1 1 2 0 0 0 0 1 1 1 2 2 2 2 2 2 2 1 2 0 0 0 0 1 0 1 0 0 0 0 2 0
 2 1 1 2 1 0 0 0 2 1 0 2 2 1 1 0 2 0 1 0 2 2 0 2 1 0 2 2 2 2 0 1 0 2 1 1 0
 1 0 2 2 2 0 1 1 0 2]
Ground truth labels: [2 0 2 1 0 0 1 0 0 1 2 0 0 2 2 2 0 0 1 1 1 0 1 1 2 0 2 1 0 2 0 0 1 0 1 0 1
 2 2 2 0 0 1 1 2 0 0 0 0 1 1 1 2 2 2 2 2 2 2 1 2 0 0 0 0 1 0 1 0 0 0 0 2 0
 2 1 1 2 1 0 0 0 2 1 0 2 2 1 1 0 2 0 1 0 2 2 0 2 1 0 2 2 2 2 0 1 0 2 1 1 0
 1 0 2 2 2 0 1 1 0 2]
Metrics: {'test_loss': 0.008250893093645573, 'test_accuracy': 1.0, 'test_f1': 1.0, 'test_runtime': 0.3728, 'test_samples_per_second': 324.548, 'test_steps_per_second': 42.915}


In [None]:
# View TensorBoard
%load_ext tensorboard
%tensorboard --logdir results/runs
"""removed output from this cell, tensorboard makes notebook too big for github and it doesn't display in github anyway"""

### Test model with classification pipeline

In [None]:
# Create a classification pipeline from the saved model
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# Define ID to label mapping. It must match the ClassLabel ids assigned before
# training (0=booking, 1=general, 2=status).
id2label_mapping = {0: 'booking', 1: 'general', 2:'status'}
# Keep the reverse mapping in sync; overriding only id2label would leave the
# config's label2id inconsistent with it.
label2id_mapping = {label: idx for idx, label in id2label_mapping.items()}

#model_path = "./results/checkpoint-225/"
model_path = "best_model_bert"

model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    id2label=id2label_mapping,
    label2id=label2id_mapping,
)
tokenizer = AutoTokenizer.from_pretrained(model_path)

question_classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Try the pipeline on a first sample question
question = "How can I book a flight?"
question_classifier(question)

Device set to use cuda:0


[{'label': 'general', 'score': 0.9892827272415161}]

In [None]:
# Classify a question about a specific flight's arrival
question = "When does UA 504 arrive?"
question_classifier(question)

[{'label': 'status', 'score': 0.9935223460197449}]

In [None]:
# Classify an explicit booking request with a route given as airport codes
question_classifier("Book me a flight for tomorrow night LAX-SEA")

[{'label': 'booking', 'score': 0.9965169429779053}]

In [None]:
# Classify a question unrelated to flights to see which intent the model falls back to
question_classifier("what is my name")

[{'label': 'general', 'score': 0.988341748714447}]

In [None]:
# Classify a terse, lowercase booking request that uses city names
question_classifier("reserve dallas chicago next week")

[{'label': 'booking', 'score': 0.9963086247444153}]

In [None]:
# Known misclassification: using best_model_bert classifies 'how late...' wrong
# (as 'general'). The author observed that checkpoint-225 classifies it
# correctly as 'status'; switch model_path in the pipeline cell to compare.
question_classifier("how late is united 2")

[{'label': 'general', 'score': 0.9837003350257874}]