In [1]:
# Install HuggingFace libraries
!pip install pyarrow
!pip install transformers datasets evaluate
!pip install -U datasets  # this prevents local cache errors with datasets

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6
Collecting datasets
  Downloading datasets-4.2.0-py3-none-any.whl.metadata (18 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-21.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Downloading datasets-4.2.0-py3-none-any.whl (506 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m506.3/506.3 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-21.0.0-cp312-cp312-manylinux_2_28_x86_64.whl (42.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarrow, datasets
  Attempting uninstall

### Load dataset

In [2]:
# Load the full dataset from HuggingFace. load_dataset combines all files in the
# /train directory (without their headers) into one dataset with just one header.
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("lanehale1/airline-queries", data_dir='train', cache_dir=None)
raw_datasets

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


query_intent_booking_altered.csv: 0.00B [00:00, ?B/s]

query_intent_general_altered.csv: 0.00B [00:00, ?B/s]

query_intent_status_altered.csv: 0.00B [00:00, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['query', 'intent'],
        num_rows: 1431
    })
})

In [3]:
# Display the different classes ('intents')
raw_datasets['train'][509:512], raw_datasets['train'][910:914]

({'query': ['Could you book next month on the 21st St. Augustine to ALO',
   'what luggage can I keep with me',
   'Paducah flights are so cheap right now.'],
  'intent': ['booking', 'general', 'general']},
 {'query': ['When will the YV agents arrive?',
   'where do I pick up my stroller when getting off?',
   'give the status of mq8139',
   "where's the terminal for 9E 9985?"],
  'intent': ['general', 'status', 'status', 'status']})

In [None]:
raw_datasets = load_dataset("lanehale1/airline-queries", data_dir='train', cache_dir=None)

In [4]:
# Change 'intent' labels to ClassLabel data type for the datasets library,
# and split the dataset by class
from datasets import ClassLabel, Value

# Get the unique intents from the dataset
unique_intents = raw_datasets['train'].unique('intent')

# Define custom class names in the order corresponding to the integer labels
custom_class_names = ['booking', 'general', 'status']

# Cast the 'intent' column to ClassLabel with custom names
raw_datasets['train'] = raw_datasets['train'].cast_column('intent', ClassLabel(names=custom_class_names))

# Get the class labels
class_label_names = raw_datasets['train'].features['intent'].names

booking_idx = class_label_names.index('booking')
general_idx = class_label_names.index('general')
status_idx = class_label_names.index('status')

print(booking_idx, general_idx, status_idx)
# Create empty lists to store indices for each class
indices_booking = []
indices_general = []
indices_status = []

# Iterate through the dataset to collect indices for each class
for i, example in enumerate(raw_datasets['train']):
    if example['intent'] == booking_idx:
        indices_booking.append(i)
    elif example['intent'] == general_idx:
        indices_general.append(i)
    elif example['intent'] == status_idx:
        indices_status.append(i)

print(len(indices_booking), len(indices_general), len(indices_status))

# Randomly select the desired number of indices for each class (try 70/15/15 splits)
import random
random.seed(13)
indices_booking = random.sample(indices_booking, 356)
indices_general = random.sample(indices_general, 281)
indices_status = random.sample(indices_status, 364)

# Combine the selected indices for the new training set
train_indices = indices_booking + indices_general + indices_status

# Create the new training dataset
train_dataset = raw_datasets['train'].select(train_indices)

# Get the remaining indices for the test set
all_indices = list(range(len(raw_datasets['train'])))
test_indices = list(set(all_indices) - set(train_indices))

# Create the new test dataset
temp_dataset = raw_datasets['train'].select(test_indices)

train_dataset, temp_dataset

Casting the dataset:   0%|          | 0/1431 [00:00<?, ? examples/s]

0 1 2
510 401 520


(Dataset({
     features: ['query', 'intent'],
     num_rows: 1001
 }),
 Dataset({
     features: ['query', 'intent'],
     num_rows: 430
 }))

In [5]:
temp_indices_booking = []
temp_indices_general = []
temp_indices_status = []

for i, example in enumerate(temp_dataset):
    if example['intent'] == booking_idx:
        temp_indices_booking.append(i)
    elif example['intent'] == general_idx:
        temp_indices_general.append(i)
    elif example['intent'] == status_idx:
        temp_indices_status.append(i)

# Randomly select the desired number of indices for each class
import random
random.seed(13)
temp_indices_booking = random.sample(temp_indices_booking, 154)
temp_indices_general = random.sample(temp_indices_general, 120)
temp_indices_status = random.sample(temp_indices_status, 156)

eval_indices_booking = temp_indices_booking[:77]
eval_indices_general = temp_indices_general[:60]
eval_indices_status = temp_indices_status[:78]

test_indices_booking = temp_indices_booking[77:]
test_indices_general = temp_indices_general[60:]
test_indices_status = temp_indices_status[78:]

print(len(eval_indices_booking), len(eval_indices_general), len(eval_indices_status))
print(len(test_indices_booking), len(test_indices_general), len(test_indices_status))
# Combine the selected indices for the validation set
eval_indices = eval_indices_booking + eval_indices_general + eval_indices_status

# Create the validation dataset
eval_dataset = raw_datasets['train'].select(eval_indices)

# Combine the selected indices for the test set
test_indices = test_indices_booking + test_indices_general + test_indices_status

# Create the test dataset
test_dataset = raw_datasets['train'].select(test_indices)

eval_dataset, test_dataset

77 60 78
77 60 78


(Dataset({
     features: ['query', 'intent'],
     num_rows: 215
 }),
 Dataset({
     features: ['query', 'intent'],
     num_rows: 215
 }))

In [6]:
# Save train, validation, and test datasets in raw_datasets
raw_datasets['train'] = train_dataset
raw_datasets['validation'] = eval_dataset
raw_datasets['test'] = test_dataset
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['query', 'intent'],
        num_rows: 1001
    })
    validation: Dataset({
        features: ['query', 'intent'],
        num_rows: 215
    })
    test: Dataset({
        features: ['query', 'intent'],
        num_rows: 215
    })
})

In [7]:
# Define checkpoint and tokenizer
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    return tokenizer(example["query"], truncation=True)

# Create tokenized_datasets and data_collator
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/1001 [00:00<?, ? examples/s]

Map:   0%|          | 0/215 [00:00<?, ? examples/s]

Map:   0%|          | 0/215 [00:00<?, ? examples/s]

In [8]:
# Display tokenized_datasets
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['query', 'intent', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1001
    })
    validation: Dataset({
        features: ['query', 'intent', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 215
    })
    test: Dataset({
        features: ['query', 'intent', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 215
    })
})

In [9]:
# Display a sample from train dataset
raw_datasets["train"][1], tokenized_datasets["train"][1]

({'query': 'Book anything PIE - JST on 12/26/25', 'intent': 0},
 {'query': 'Book anything PIE - JST on 12/26/25',
  'intent': 0,
  'input_ids': [101,
   2338,
   2505,
   11345,
   1011,
   1046,
   3367,
   2006,
   2260,
   1013,
   2656,
   1013,
   2423,
   102],
  'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]})

In [10]:
# Display a sample from validation dataset
raw_datasets["validation"][1], tokenized_datasets["validation"][1]

({'query': 'Find a late flight Butte Garden City the 13th of next month',
  'intent': 0},
 {'query': 'Find a late flight Butte Garden City the 13th of next month',
  'intent': 0,
  'input_ids': [101,
   2424,
   1037,
   2397,
   3462,
   25024,
   3871,
   2103,
   1996,
   6122,
   1997,
   2279,
   3204,
   102],
  'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]})

In [11]:
# Display a sample from test dataset
raw_datasets['test'][0], tokenized_datasets['test'][0]

({'query': 'Find a OTH-MIA flight 7-26 p.m.', 'intent': 0},
 {'query': 'Find a OTH-MIA flight 7-26 p.m.',
  'intent': 0,
  'input_ids': [101,
   2424,
   1037,
   27178,
   2232,
   1011,
   8764,
   3462,
   1021,
   1011,
   2656,
   1052,
   1012,
   1049,
   1012,
   102],
  'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]})

In [12]:
# Remove unnecessary columns
tokenized_datasets = tokenized_datasets.remove_columns(["query"])
# Rename ClassLabel column to 'labels'
tokenized_datasets = tokenized_datasets.rename_column("intent", "labels")
# Set output type to 'torch'
tokenized_datasets.set_format("torch")
# Display modified datasets
tokenized_datasets["train"].column_names, tokenized_datasets["validation"].column_names, tokenized_datasets["test"].column_names

(['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
 ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
 ['labels', 'input_ids', 'token_type_ids', 'attention_mask'])

In [13]:
# Create train and eval dataloaders
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=16, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=16, collate_fn=data_collator
)
len(train_dataloader), len(eval_dataloader)

(63, 14)

In [14]:
# Display the shape of a training batch
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}

{'labels': torch.Size([16]),
 'input_ids': torch.Size([16, 15]),
 'token_type_ids': torch.Size([16, 15]),
 'attention_mask': torch.Size([16, 15])}

In [15]:
# Display the shape of a validation batch
for batch in eval_dataloader:
  break
{k: v.shape for k, v in batch.items()}

{'labels': torch.Size([16]),
 'input_ids': torch.Size([16, 17]),
 'token_type_ids': torch.Size([16, 17]),
 'attention_mask': torch.Size([16, 17])}

In [16]:
# Define a model
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3, problem_type="single_label_classification")  # 3 intents
model

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [17]:
"""
The batch variable is a Python dictionary containing various inputs required by the model.

The **batch syntax unpacks this dictionary, treating each key-value pair as a keyword argument to be
passed to the model's forward method (which is implicitly called when you call model(...) directly).

The double asterisk (**) in outputs = model(**batch) is the dictionary unpacking operator.
"""
# Display batch loss parameter and logits shape
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(1.1445, grad_fn=<NllLossBackward0>) torch.Size([16, 3])


In [18]:
# Display the batch keys
print(list(batch.keys()))
# Display all batch data
batch

['labels', 'input_ids', 'token_type_ids', 'attention_mask']


{'labels': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'input_ids': tensor([[  101,  2424,  1037,  1048,  3366,  1011,  1061,  3070,  4440,  2005,
          1018,  1011,  2570,  2397,   102,     0,     0],
        [  101,  2424,  1037,  2397,  3462, 25024,  3871,  2103,  1996,  6122,
          1997,  2279,  3204,   102,     0,     0,     0],
        [  101,  2424,  1037,  4540,  1011,  2065,  2361,  3462,  1019,  1011,
          2861,  7610,   102,     0,     0,     0,     0],
        [  101,  2106,  2017,  2424,  2505,  2279,  3204,  2006,  1996,  5940,
          5578,  2000,  2522,  2015,  1029,   102,     0],
        [  101,  2338, 24829,  2050,  2000, 21469,  2232,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [  101,  6134,  2250,  6494,  2078,  2358,  2140,  1011,  2030,  2232,
           102,     0,     0,     0,     0,     0,     0],
        [  101,  3914,  2142,  9587,  4179,  1011, 11577,  2005,  9317,   102,
           

In [19]:
# Display all of unpacked batch 'outputs'
outputs

SequenceClassifierOutput(loss=tensor(1.1445, grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0573,  0.0647,  0.0512],
        [ 0.0570,  0.0413,  0.1305],
        [ 0.0790,  0.0366,  0.0765],
        [ 0.0869,  0.0883,  0.0924],
        [-0.2381,  0.1290,  0.0513],
        [ 0.0740,  0.1719,  0.0610],
        [ 0.0281,  0.0353, -0.0110],
        [-0.0549,  0.1097,  0.1494],
        [ 0.1174,  0.1425,  0.0107],
        [-0.1447,  0.1397,  0.1470],
        [ 0.0990,  0.0369,  0.0905],
        [-0.0825,  0.1221,  0.1455],
        [ 0.1808,  0.1549,  0.0481],
        [ 0.0853,  0.1020,  0.0541],
        [ 0.1453,  0.1946,  0.0780],
        [-0.0797,  0.0452,  0.1111]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

### Use HuggingFace Trainer

In [20]:
""" Set up training arguments """
import numpy as np
from evaluate import load
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score

# 1. Load accuracy and f1 metrics
acc_metric = load("accuracy")
f1_metric = load("f1")

# 2. Define a compute_metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    accuracy = acc_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")
    return {"accuracy": accuracy["accuracy"], "f1": f1["f1"]}  # Return a dictionary as expected by Trainer

# Get default training arguments to decide what to use
training_args = TrainingArguments()

# Total Training Steps = (Dataset Size / (per_device_train_batch_size * gradient_accumulation_steps)) * num_train_epoch
total_training_steps = (
    len(tokenized_datasets["train"]) /
    (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps)
    * training_args.num_train_epochs
)

[len(tokenized_datasets["train"]),
 training_args.per_device_train_batch_size,
 training_args.gradient_accumulation_steps,
 training_args.num_train_epochs,
 total_training_steps,
 training_args.learning_rate,
 training_args.weight_decay,
 training_args.warmup_ratio,
]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

[1001, 8, 1, 3.0, 375.375, 5e-05, 0.0, 0.0]

In [21]:
# Round steps per epoch in case division by batch size is fractional
steps_per_epoch = round(len(tokenized_datasets["train"]) / (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps), 0)
num_epochs = training_args.num_train_epochs
total_training_steps = steps_per_epoch * num_epochs

print(f"Steps per epoch (rounded): {steps_per_epoch} * {num_epochs} train epochs = {total_training_steps} training steps")

Steps per epoch (rounded): 125.0 * 3.0 train epochs = 375.0 training steps


In [22]:
# 3. Define training arguments
training_args = TrainingArguments(
    output_dir="./model_results",
    run_name='airline-chatbot-model',
    eval_strategy="steps",
    eval_steps=5,
    save_steps=5,       # Save a model checkpoint every 5 steps
    logging_steps=5,    # Log metrics every 5 steps
    learning_rate=2e-5,
    weight_decay=0.01,  # Weight decay (also known as L2 regularization) is a regularization technique that penalizes large weights in the model. It essentially adds a term to the loss function that is proportional to the square of the weights, encouraging the model to learn smaller, more generalized weights.
    warmup_ratio=0.1,   # Warmup refers to a strategy where the learning rate gradually increases from a very small value (often close to zero) to the initial learning rate over a specified number of training steps. Stabilizes training, helps escape poor initializations, reduces early overfitting.
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    report_to="wandb",  # Send logs to Weights & Biases
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Keep only the last and best checkpoint
    save_total_limit=2
)

training_args

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=True,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=5,
eval_strategy=IntervalStrategy.STEPS,
eval_use_gather_object=False,
fp

In [23]:
from transformers import EarlyStoppingCallback

early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

In [24]:
# 4. Define a trainer
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

trainer

<transformers.trainer.Trainer at 0x7906700b5d60>

In [25]:
# 5. Train the model
trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mlanehale1[0m ([33mlanehale1-ai[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy,F1
5,1.0959,1.109364,0.35814,0.527397
10,1.0911,1.044319,0.72093,0.837838
15,1.0649,0.95471,0.962791,0.981043
20,1.0513,0.874191,0.981395,0.99061
25,1.0047,0.757064,0.981395,0.99061
30,0.9898,0.734912,0.902326,0.948655
35,0.9028,0.702198,0.790698,0.883117
40,0.8191,0.391456,0.981395,0.99061
45,0.8033,0.322122,0.986047,0.992974
50,0.7371,0.3241,0.976744,0.988235


TrainOutput(global_step=145, training_loss=0.5934398351044491, metrics={'train_runtime': 498.3199, 'train_samples_per_second': 6.026, 'train_steps_per_second': 0.759, 'total_flos': 9507541682394.0, 'train_loss': 0.5934398351044491, 'epoch': 1.1507936507936507})

In [26]:
trainer.save_model("./best_model_found")

In [27]:
!ls

best_model_found  model_results  sample_data  wandb


In [28]:
!ls -lh ./model_results

total 8.0K
drwxr-xr-x 2 root root 4.0K Oct 16 01:56 checkpoint-130
drwxr-xr-x 2 root root 4.0K Oct 16 01:56 checkpoint-145


In [29]:
!ls ./best_model_found

config.json	   special_tokens_map.json  tokenizer.json     vocab.txt
model.safetensors  tokenizer_config.json    training_args.bin


In [30]:
# Save the model locally to my machine
from google.colab import files

!zip -r best_model_found.zip ./best_model_found
files.download('best_model_found.zip')

  adding: best_model_found/ (stored 0%)
  adding: best_model_found/vocab.txt (deflated 53%)
  adding: best_model_found/model.safetensors (deflated 7%)
  adding: best_model_found/training_args.bin (deflated 53%)
  adding: best_model_found/config.json (deflated 51%)
  adding: best_model_found/tokenizer.json (deflated 71%)
  adding: best_model_found/tokenizer_config.json (deflated 75%)
  adding: best_model_found/special_tokens_map.json (deflated 42%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [31]:
# Save metrics to wandb
import wandb

wandb.init(project="huggingface")
wandb.save(".wanddb/*")

0,1
eval/accuracy,▁▅███▇▆███▇██████████████████
eval/f1,▁▆███▇▆██████████████████████
eval/loss,██▇▆▆▅▅▃▃▃▃▃▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▂▂█▂▃▂▃▁▂▃██▂▂▂▂▂▄▃▃▄▁▂▂▃▂▃▃▄
eval/samples_per_second,▇▇▁▇▆▆▅▇▆▆▁▁▇▆▇▇▇▅▅▅▅█▆▇▅▆▅▅▄
eval/steps_per_second,▇▇▁▇▆▆▅▇▆▆▁▁▇▆▇▇▇▅▅▅▅█▆▇▅▆▅▅▄
train/epoch,▁▁▁▁▁▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇██
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇███
train/grad_norm,▂▄▄▄▃▄▃▄▇▃▄▄▃▅▃▃█▄▃▆▂▂▇▃▁█▃▂▁
train/learning_rate,▁▂▃▄▅▆▇█████▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆

0,1
eval/accuracy,0.96279
eval/f1,0.98104
eval/loss,0.12676
eval/runtime,0.4189
eval/samples_per_second,513.26
eval/steps_per_second,64.456
total_flos,9507541682394.0
train/epoch,1.15079
train/global_step,145
train/grad_norm,1.03234




[]

In [32]:
results = trainer.evaluate(tokenized_datasets["test"])
results

{'eval_loss': 0.04417383670806885,
 'eval_accuracy': 0.9953488372093023,
 'eval_f1': 0.9976689976689976,
 'eval_runtime': 0.4534,
 'eval_samples_per_second': 474.218,
 'eval_steps_per_second': 59.553,
 'epoch': 1.1507936507936507}

In [33]:
# Specify the path to the checkpoint you want to evaluate
checkpoint_path = "./best_model_found"  # Replace with the actual checkpoint path

# Load the model from the checkpoint
model_from_checkpoint = AutoModelForSequenceClassification.from_pretrained(
    checkpoint_path,
    num_labels=3,
    problem_type="single_label_classification"
)

# Create a new Trainer instance with the loaded model
trainer_from_checkpoint = Trainer(
    model_from_checkpoint,
    training_args,  # You can reuse the existing training_args
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Evaluate the model from the checkpoint
results_from_checkpoint = trainer_from_checkpoint.evaluate(tokenized_datasets["test"])
results_from_checkpoint

{'eval_loss': 0.04417383670806885,
 'eval_model_preparation_time': 0.0026,
 'eval_accuracy': 0.9953488372093023,
 'eval_f1': 0.9976689976689976,
 'eval_runtime': 0.4449,
 'eval_samples_per_second': 483.292,
 'eval_steps_per_second': 60.693}

In [34]:
# Make predictions
predictions = trainer.predict(tokenized_datasets["test"])
print(predictions.predictions.shape, predictions.label_ids.shape)

(215, 3) (215,)


In [36]:
# See predictions, labels, and metrics
import numpy as np

predicted_labels = np.argmax(predictions.predictions, axis=-1)
ground_truth_labels = predictions.label_ids
metrics = predictions.metrics

print("Predicted labels:", predicted_labels)
print("Ground truth labels:", ground_truth_labels)
print("Metrics:")
metrics

Predicted labels: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Ground truth labels: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Metrics:


{'test_loss': 0.04417383670806885,
 'test_accuracy': 0.9953488372093023,
 'test_f1': 0.9976689976689976,
 'test_runtime': 0.4536,
 'test_samples_per_second': 473.954,
 'test_steps_per_second': 59.52}

### Test model with classification pipeline

In [37]:
# Create a classification pipeline
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# Define ID to label mapping
id2label_mapping = {0: 'booking', 1: 'general', 2:'status'}

model_path = "./best_model_found"

model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    id2label=id2label_mapping
)
tokenizer = AutoTokenizer.from_pretrained(model_path)

question_classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

question = "How can I book a flight?"
question_classifier(question)

Device set to use cuda:0


[{'label': 'general', 'score': 0.821153461933136}]

In [38]:
question = "When does UA 504 arrive?"
question_classifier(question)

[{'label': 'status', 'score': 0.9525623321533203}]

In [39]:
question_classifier("Book me a flight for tomorrow night LAX-SEA")

[{'label': 'booking', 'score': 0.9740308523178101}]

In [40]:
question_classifier("what is my name")

[{'label': 'general', 'score': 0.4874076843261719}]

In [41]:
question_classifier("reserve dallas chicago next week")

[{'label': 'booking', 'score': 0.9545001983642578}]

In [42]:
question_classifier("how late is united 2")

[{'label': 'status', 'score': 0.9309108257293701}]

In [43]:
question_classifier("where is my gate?")

[{'label': 'status', 'score': 0.8358936905860901}]