In [None]:
# Install HuggingFace libraries
!pip install transformers datasets evaluate
!pip install -U datasets  # this prevents local cache errors with datasets

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5
Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-4.0.0-py3-none-any.whl (494 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting unin

### Load dataset

In [None]:
# Load the full dataset from HuggingFace. load_dataset combines all files in the
# /train directory (without their headers) into one dataset with just one header.
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("lanehale1/airline-queries", data_dir='train', cache_dir=None)
raw_datasets

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


query_intent_booking.csv: 0.00B [00:00, ?B/s]

query_intent_general.csv: 0.00B [00:00, ?B/s]

query_intent_status.csv: 0.00B [00:00, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['query', 'intent'],
        num_rows: 1001
    })
})

In [None]:
# Display the different classes ('intents')
raw_datasets['train'][0], raw_datasets['train'][389:391], raw_datasets['train'][673:675], raw_datasets['train'][1000]

({'query': 'Find ORD-ORH seats for 11-14', 'intent': 'booking'},
 {'query': ['Yakima-BIH book the 2nd of Dec flight',
   'What luggage is allowed on my flight?'],
  'intent': ['booking', 'general']},
 {'query': ['when does boarding start?', 'has f9 flight 8170 gotten in'],
  'intent': ['general', 'status']},
 {'query': 'give the status of mq1569', 'intent': 'status'})

In [None]:
# Turn the string 'intent' column into a ClassLabel feature (needed by the datasets
# library for stratified splitting), then keep 60% of the rows for training:
# 600 training rows, or 200 per class.
from datasets import ClassLabel, Value

# The distinct intent strings found in the column become the ClassLabel names
raw_datasets['train'] = raw_datasets['train'].cast_column(
    'intent',
    ClassLabel(names=raw_datasets['train'].unique('intent')),
)

# Stratifying on 'intent' tries to keep the split even across the classes
raw_datasets = raw_datasets['train'].train_test_split(
    test_size=0.4,
    seed=42,
    shuffle=True,
    stratify_by_column='intent',
)
raw_datasets

Casting the dataset:   0%|          | 0/1001 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['query', 'intent'],
        num_rows: 600
    })
    test: Dataset({
        features: ['query', 'intent'],
        num_rows: 401
    })
})

In [None]:
# Split the held-out 'test' portion 40/60 (stratified by intent). The 60% side, stored
# under the 'test' key (about 240 rows), is the pool that the next cell divides into
# validation and test sets. The 40% side, stored under the 'train' key, is not used by
# the cells that follow in this part of the notebook.
eval_dataset = raw_datasets['test'].train_test_split(test_size=0.6, seed=42, shuffle=True, stratify_by_column='intent')
eval_dataset

DatasetDict({
    train: Dataset({
        features: ['query', 'intent'],
        num_rows: 160
    })
    test: Dataset({
        features: ['query', 'intent'],
        num_rows: 241
    })
})

In [None]:
# Split the test dataset 50/50 for 120 validation rows and 120 test rows (or 40 each class)
eval_dataset = eval_dataset['test'].train_test_split(test_size=0.5, seed=42, shuffle=True, stratify_by_column='intent')
eval_dataset

DatasetDict({
    train: Dataset({
        features: ['query', 'intent'],
        num_rows: 120
    })
    test: Dataset({
        features: ['query', 'intent'],
        num_rows: 121
    })
})

In [None]:
# Save the validation and test datasets in raw_datasets
raw_datasets['validation'] = eval_dataset['train']
raw_datasets['test'] = eval_dataset['test']
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['query', 'intent'],
        num_rows: 600
    })
    test: Dataset({
        features: ['query', 'intent'],
        num_rows: 121
    })
    validation: Dataset({
        features: ['query', 'intent'],
        num_rows: 120
    })
})

In [None]:
# Define checkpoint and tokenizer, create tokenized_datasets and a data_collator
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    """Tokenize the 'query' field of a dataset example (or batch of examples).

    Over-long inputs are truncated by the tokenizer; no padding is applied here.
    Returns whatever the module-level `tokenizer` returns for those queries.
    """
    queries = example["query"]
    return tokenizer(queries, truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Map:   0%|          | 0/121 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

In [None]:
# Display tokenized_datasets
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['query', 'intent', 'input_ids', 'attention_mask'],
        num_rows: 600
    })
    test: Dataset({
        features: ['query', 'intent', 'input_ids', 'attention_mask'],
        num_rows: 121
    })
    validation: Dataset({
        features: ['query', 'intent', 'input_ids', 'attention_mask'],
        num_rows: 120
    })
})

In [None]:
# Display a sample from train dataset
raw_datasets["train"][1], tokenized_datasets["train"][1]

({'query': 'when does VX7375 arrive', 'intent': 2},
 {'query': 'when does VX7375 arrive',
  'intent': 2,
  'input_ids': [101, 2043, 2515, 1058, 2595, 2581, 24434, 2629, 7180, 102],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]})

In [None]:
# Display a sample from validation dataset
raw_datasets["validation"][1], tokenized_datasets["validation"][1]

({'query': 'What floor is the bag carousel at?', 'intent': 1},
 {'query': 'What floor is the bag carousel at?',
  'intent': 1,
  'input_ids': [101, 2054, 2723, 2003, 1996, 4524, 27628, 2012, 1029, 102],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]})

In [None]:
# Display a sample from test dataset
raw_datasets['test'][0], tokenized_datasets['test'][0]

({'query': 'give the status of ha 4929', 'intent': 2},
 {'query': 'give the status of ha 4929',
  'intent': 2,
  'input_ids': [101, 2507, 1996, 3570, 1997, 5292, 4749, 24594, 102],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]})

In [None]:
# Remove unnecessary columns
tokenized_datasets = tokenized_datasets.remove_columns(["query"])
# Rename ClassLabel column to 'labels'
tokenized_datasets = tokenized_datasets.rename_column("intent", "labels")
# Set output type to 'torch'
tokenized_datasets.set_format("torch")
# Display modified datasets
tokenized_datasets["train"].column_names, tokenized_datasets["validation"].column_names, tokenized_datasets["test"].column_names

(['labels', 'input_ids', 'attention_mask'],
 ['labels', 'input_ids', 'attention_mask'],
 ['labels', 'input_ids', 'attention_mask'])

In [None]:
# Create train and eval dataloaders
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=16, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=16, collate_fn=data_collator
)
len(train_dataloader), len(eval_dataloader)

(38, 8)

In [None]:
# Pull the first batch out of the training dataloader and show the shape of each tensor in it
batch = next(iter(train_dataloader))
{field: tensor.shape for field, tensor in batch.items()}

{'labels': torch.Size([16]),
 'input_ids': torch.Size([16, 18]),
 'attention_mask': torch.Size([16, 18])}

In [None]:
# Pull the first batch out of the validation dataloader and show the shape of each tensor in it
batch = next(iter(eval_dataloader))
{field: tensor.shape for field, tensor in batch.items()}

{'labels': torch.Size([16]),
 'input_ids': torch.Size([16, 16]),
 'attention_mask': torch.Size([16, 16])}

In [None]:
# Define a model
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3, problem_type="single_label_classification")  # 3 intents
model

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
"""
The batch variable is a Python dictionary containing various inputs required by the model.

The **batch syntax unpacks this dictionary, treating each key-value pair as a keyword argument to be
passed to the model's forward method (which is implicitly called when you call model(...) directly).

The double asterisk (**) in outputs = model(**batch) is the dictionary unpacking operator.
"""
# Display batch loss parameter and logits shape
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(1.1481, grad_fn=<NllLossBackward0>) torch.Size([16, 3])


In [None]:
# Display the batch keys
print(list(batch.keys()))
# Display all of batch data
batch

['labels', 'input_ids', 'attention_mask']


{'labels': tensor([0, 1, 2, 2, 1, 2, 0, 1, 0, 0, 2, 2, 2, 0, 0, 0]), 'input_ids': tensor([[  101,  4638,  2065,  1045,  2064,  4875,  2188,  2279,  2733, 20967,
          1011, 28492,  2006,  7397,   102,     0],
        [  101,  2054,  2723,  2003,  1996,  4524, 27628,  2012,  1029,   102,
             0,     0,     0,     0,     0,     0],
        [  101,  2003,  1058,  2595, 28906,  2509,  2397,  1029,   102,     0,
             0,     0,     0,     0,     0,     0],
        [  101,  2038, 15797,  3462,  2620, 27531,  2620,  5407,  1999,   102,
             0,     0,     0,     0,     0,     0],
        [  101,  2129,  2521,  2003,  1996,  2149,  4796,   102,     0,     0,
             0,     0,     0,     0,     0,     0],
        [  101,  2106,  2035, 13910,  2937,  2102,  2250,  5818, 17465,  2272,
          1999,   102,     0,     0,     0,     0],
        [  101,  2424,  1037, 13118,  1011,  1052,  5638,  4440,  1019,  1011,
          1017,  1052,  1012,  1049,  1012,   102],
 

In [None]:
# Display all of unpacked batch 'outputs'
outputs

SequenceClassifierOutput(loss=tensor(1.1481, grad_fn=<NllLossBackward0>), logits=tensor([[-0.1909,  0.0617, -0.0034],
        [-0.2141,  0.1450,  0.0431],
        [-0.1919,  0.1042,  0.0024],
        [-0.1712,  0.0800, -0.0525],
        [-0.2171,  0.1143,  0.0049],
        [-0.2299,  0.0949, -0.0466],
        [-0.2617,  0.1243,  0.0182],
        [-0.2186,  0.0948,  0.0073],
        [-0.2663,  0.0847, -0.0178],
        [-0.2363,  0.0980, -0.0448],
        [-0.2176,  0.0990, -0.0321],
        [-0.2445,  0.0647, -0.0229],
        [-0.2552,  0.1039, -0.0034],
        [-0.1799,  0.0612, -0.0495],
        [-0.2326,  0.0982, -0.0242],
        [-0.2102,  0.1215,  0.0257]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

### Train model with HuggingFace Trainer

In [None]:
import os

!pip install TensorBoard

os.environ['WANDB_PROJECT'] = 'airline-chatbot'  # my W&B project name
os.environ["WANDB_LOG_MODEL"] = "checkpoint"     # log all model checkpoints

path_var = os.environ.get('WANDB_PROJECT')
print(path_var)
print(os.environ['WANDB_LOG_MODEL'])

airline-chatbot
checkpoint


In [None]:
""" Set up training arguments """
import numpy as np
from evaluate import load
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score

# 1. Load accuracy and f1 metrics
acc_metric = load("accuracy")
f1_metric = load("f1")

# 2. Define a compute_metrics function
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into (logits, labels).

    Returns:
        A dictionary with the keys "accuracy" and "f1", as expected by Trainer.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis
    predicted_classes = logits.argmax(axis=-1)
    accuracy_result = acc_metric.compute(predictions=predicted_classes, references=labels)
    f1_result = f1_metric.compute(
        predictions=predicted_classes,
        references=labels,
        average="weighted",
    )
    return dict(accuracy=accuracy_result["accuracy"], f1=f1_result["f1"])

# Instantiate TrainingArguments with no overrides so the library defaults can be
# inspected before deciding which ones to change
training_args = TrainingArguments()

# Estimated total training steps =
#   (dataset size / (per_device_train_batch_size * gradient_accumulation_steps)) * num_train_epochs
# This is a plain true division with no rounding, so the estimate is a float and is
# fractional whenever the dataset size is not a multiple of the effective batch size.
total_training_steps = (
    len(tokenized_datasets["train"]) /
    (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps)
    * training_args.num_train_epochs
)

[len(tokenized_datasets["train"]),
 training_args.per_device_train_batch_size,
 training_args.gradient_accumulation_steps,
 training_args.num_train_epochs,
 total_training_steps,
 training_args.learning_rate,
 training_args.weight_decay,
 training_args.warmup_ratio,
]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

[600, 8, 1, 3.0, 225.0, 5e-05, 0.0, 0.0]

In [None]:
# Steps per epoch must be rounded UP, not to the nearest integer: the dataloader keeps
# the final, smaller batch (dataloader_drop_last=False in the arguments printed above),
# so a fractional quotient still costs one extra step. For example, 601 rows at batch
# size 8 is 76 steps, whereas round() would report 75.
# NOTE(review): this uses the per-device batch size, i.e. it assumes a single device.
import math

effective_batch_size = training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps
steps_per_epoch = math.ceil(len(tokenized_datasets["train"]) / effective_batch_size)
num_epochs = training_args.num_train_epochs
total_training_steps = steps_per_epoch * num_epochs

print(f"Steps per epoch (rounded): {steps_per_epoch} * {num_epochs} train epochs = {total_training_steps} training steps")

Steps per epoch (rounded): 75.0 * 3.0 train epochs = 225.0 training steps


In [None]:
# 3. Define training arguments for the first run (library defaults for the optimizer
#    settings; evaluation, checkpointing and logging are all driven by step counts)
training_args = TrainingArguments(
    output_dir="./results_1",
    run_name="by_steps_1",  # Name for wandb
    eval_strategy="steps",
    eval_steps=25,    # Validate every 25 steps
    # Save a model checkpoint every 75 steps. NOTE(review): per the Transformers docs,
    # load_best_model_at_end requires save_steps to be a round multiple of eval_steps
    # (75 = 3 * 25 here) — keep that in mind when changing either value.
    save_steps=75,
    logging_steps=5,  # Log metrics every 5 steps
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Reload the best checkpoint once training finishes
    load_best_model_at_end=True,
    # Left disabled: uncomment both lines to pick the best model by accuracy instead
    #metric_for_best_model="accuracy",   # default is evaluation loss
    #greater_is_better=True,             # default is False
    report_to=["tensorboard", "wandb"],  # Send logs to Weights & Biases and /runs folder for TensorBoard
)

# Echo the full argument set, including every default that was not overridden
training_args

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=25,
eval_strategy=IntervalStrategy.STEPS,
eval_use_gather_object=False,


In [None]:
# 4. Define a trainer
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer

<transformers.trainer.Trainer at 0x7fa3d06dba10>

In [None]:
# 5. Train the model
trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mlanehale1[0m ([33mlanehale1-ai[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy,F1
25,0.3784,0.311708,0.991667,0.991673
50,0.0441,0.021214,1.0,1.0
75,0.0147,0.005517,1.0,1.0
100,0.0053,0.003437,1.0,1.0
125,0.0031,0.002422,1.0,1.0
150,0.0025,0.00196,1.0,1.0
175,0.0021,0.001718,1.0,1.0
200,0.0019,0.001593,1.0,1.0
225,0.0019,0.001552,1.0,1.0


[34m[1mwandb[0m: Adding directory to artifact (./results_1/checkpoint-75)... Done. 11.7s
[34m[1mwandb[0m: Adding directory to artifact (./results_1/checkpoint-150)... Done. 19.5s
[34m[1mwandb[0m: Adding directory to artifact (./results_1/checkpoint-225)... Done. 10.7s


TrainOutput(global_step=225, training_loss=0.10803024930258592, metrics={'train_runtime': 262.9696, 'train_samples_per_second': 6.845, 'train_steps_per_second': 0.856, 'total_flos': 7062294134592.0, 'train_loss': 0.10803024930258592, 'epoch': 3.0})

In [None]:
!ls -lh results_1

total 16K
drwxr-xr-x 2 root root 4.0K Jul 15 23:34 checkpoint-150
drwxr-xr-x 2 root root 4.0K Jul 15 23:34 checkpoint-225
drwxr-xr-x 2 root root 4.0K Jul 15 23:33 checkpoint-75
drwxr-xr-x 3 root root 4.0K Jul 15 23:30 runs


In [None]:
!ls -lh results_1/runs

total 4.0K
drwxr-xr-x 2 root root 4.0K Jul 15 23:30 Jul15_23-30-43_80c0173f2f14


In [None]:
# Save the best checkpoint
trainer.save_model("best_model_1")
!ls best_model_1

config.json	   special_tokens_map.json  tokenizer.json     vocab.txt
model.safetensors  tokenizer_config.json    training_args.bin


In [None]:
# View TensorBoard
%load_ext tensorboard
%tensorboard --logdir results_1/runs
"""removed output from this cell, tensorboard makes notebook too big for github and it doesn't display in github anyway"""

### Train model with more training args

In [None]:
# Second run: same schedule as before, plus learning_rate, weight_decay and warmup_ratio
training_args = TrainingArguments(
    output_dir="./results_2",
    run_name="by_steps_2",  # Name for wandb
    eval_strategy="steps",
    eval_steps=25,
    save_steps=75,
    logging_steps=5,    # Log metrics every 5 steps
    learning_rate=2e-5,
    # Weight decay is a regularization technique that shrinks the weights slightly at
    # every optimizer step, encouraging smaller, more generalized weights.
    # NOTE(review): the Trainer's default optimizer is AdamW, which applies the decay
    # directly to the weights (decoupled) rather than as an L2 term added to the loss —
    # confirm against the optimizer actually configured for this run.
    weight_decay=0.01,
    # Warmup: the learning rate ramps up from a very small value (often close to zero)
    # to learning_rate over the first warmup_ratio share of the training steps
    # (0.1 = the first 10%). This stabilizes the early part of training.
    warmup_ratio=0.1,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    load_best_model_at_end=True,
    report_to=["tensorboard", "wandb"],  # Send logs to Weights & Biases and /runs folder for TensorBoard
)

# Echo the full argument set, including every default that was not overridden
training_args

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=25,
eval_strategy=IntervalStrategy.STEPS,
eval_use_gather_object=False,


In [None]:
# Define a new model
model_2 = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3, problem_type="single_label_classification")  # 3 intents

# Update the trainer
from transformers import Trainer

trainer = Trainer(
    model_2,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<transformers.trainer.Trainer at 0x7fa3b92f6550>

In [None]:
# Train the model
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1
25,0.9697,0.931168,0.675,0.566299
50,0.5004,0.396813,0.966667,0.966494
75,0.1406,0.100926,1.0,1.0
100,0.0337,0.029105,1.0,1.0
125,0.0379,0.017106,1.0,1.0
150,0.0159,0.012849,1.0,1.0
175,0.0113,0.010749,1.0,1.0
200,0.0117,0.009722,1.0,1.0
225,0.01,0.009409,1.0,1.0


[34m[1mwandb[0m: Adding directory to artifact (./results_2/checkpoint-75)... Done. 12.9s
[34m[1mwandb[0m: Adding directory to artifact (./results_2/checkpoint-150)... Done. 19.1s
[34m[1mwandb[0m: Adding directory to artifact (./results_2/checkpoint-225)... Done. 25.0s


TrainOutput(global_step=225, training_loss=0.2423896636731095, metrics={'train_runtime': 91.6515, 'train_samples_per_second': 19.64, 'train_steps_per_second': 2.455, 'total_flos': 7062294134592.0, 'train_loss': 0.2423896636731095, 'epoch': 3.0})

In [None]:
# Evaluate the trained model on the held-out test split (not the validation split)
# and display the resulting metrics
results = trainer.evaluate(tokenized_datasets["test"])
results

{'eval_loss': 0.009830350056290627,
 'eval_accuracy': 1.0,
 'eval_f1': 1.0,
 'eval_runtime': 0.2475,
 'eval_samples_per_second': 488.982,
 'eval_steps_per_second': 64.659,
 'epoch': 3.0}

In [None]:
# Display predictions, labels, and metrics
import numpy as np

predictions = trainer.predict(tokenized_datasets["test"])
predicted_labels = np.argmax(predictions.predictions, axis=-1)
ground_truth_labels = predictions.label_ids
metrics = predictions.metrics

print("Predicted labels:", predicted_labels)
print("Ground truth labels:", ground_truth_labels)
print("Metrics:", metrics)

Predicted labels: [2 0 2 1 0 0 1 0 0 1 2 0 0 2 2 2 0 0 1 1 1 0 1 1 2 0 2 1 0 2 0 0 1 0 1 0 1
 2 2 2 0 0 1 1 2 0 0 0 0 1 1 1 2 2 2 2 2 2 2 1 2 0 0 0 0 1 0 1 0 0 0 0 2 0
 2 1 1 2 1 0 0 0 2 1 0 2 2 1 1 0 2 0 1 0 2 2 0 2 1 0 2 2 2 2 0 1 0 2 1 1 0
 1 0 2 2 2 0 1 1 0 2]
Ground truth labels: [2 0 2 1 0 0 1 0 0 1 2 0 0 2 2 2 0 0 1 1 1 0 1 1 2 0 2 1 0 2 0 0 1 0 1 0 1
 2 2 2 0 0 1 1 2 0 0 0 0 1 1 1 2 2 2 2 2 2 2 1 2 0 0 0 0 1 0 1 0 0 0 0 2 0
 2 1 1 2 1 0 0 0 2 1 0 2 2 1 1 0 2 0 1 0 2 2 0 2 1 0 2 2 2 2 0 1 0 2 1 1 0
 1 0 2 2 2 0 1 1 0 2]
Metrics: {'test_loss': 0.009830350056290627, 'test_accuracy': 1.0, 'test_f1': 1.0, 'test_runtime': 0.1946, 'test_samples_per_second': 621.713, 'test_steps_per_second': 82.21}


In [None]:
!ls -lh results_2

total 16K
drwxr-xr-x 2 root root 4.0K Jul 15 23:36 checkpoint-150
drwxr-xr-x 2 root root 4.0K Jul 15 23:36 checkpoint-225
drwxr-xr-x 2 root root 4.0K Jul 15 23:35 checkpoint-75
drwxr-xr-x 3 root root 4.0K Jul 15 23:35 runs


In [None]:
!ls -lh results_2/runs

total 4.0K
drwxr-xr-x 2 root root 4.0K Jul 15 23:37 Jul15_23-35-33_80c0173f2f14


In [None]:
# Save the best checkpoint
trainer.save_model("best_model_2")
!ls best_model_2

config.json	   special_tokens_map.json  tokenizer.json     vocab.txt
model.safetensors  tokenizer_config.json    training_args.bin


In [None]:
# View TensorBoard
%tensorboard --logdir results_2/runs
"""removed output from this cell, tensorboard makes notebook too big for github and it doesn't display in github anyway"""

### Train model with more validation

In [None]:
# Third run: identical to the second run except that eval_steps is lowered so the
# model is validated more often
training_args = TrainingArguments(
    output_dir="./results_3",
    run_name="by_steps_3",  # Name for wandb
    eval_strategy="steps",
    eval_steps=15,      # Validate every 15 steps
    save_steps=75,      # Save a model checkpoint every 75 steps
    logging_steps=5,    # Log metrics every 5 steps
    learning_rate=2e-5,
    # Weight decay is a regularization technique that shrinks the weights slightly at
    # every optimizer step, encouraging smaller, more generalized weights.
    # NOTE(review): the Trainer's default optimizer is AdamW, which applies the decay
    # directly to the weights (decoupled) rather than as an L2 term added to the loss —
    # confirm against the optimizer actually configured for this run.
    weight_decay=0.01,
    # Warmup: the learning rate ramps up from a very small value (often close to zero)
    # to learning_rate over the first warmup_ratio share of the training steps
    # (0.1 = the first 10%). This stabilizes the early part of training.
    warmup_ratio=0.1,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    load_best_model_at_end=True,
    report_to=["tensorboard", "wandb"],  # Send logs to Weights & Biases and /runs folder for TensorBoard
)

# Echo the full argument set, including every default that was not overridden
training_args

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=15,
eval_strategy=IntervalStrategy.STEPS,
eval_use_gather_object=False,


In [None]:
# Define a new model
model_3 = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3, problem_type="single_label_classification")  # 3 intents

# Update the trainer
from transformers import Trainer

trainer = Trainer(
    model_3,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<transformers.trainer.Trainer at 0x7fa3bca554d0>

In [None]:
# Train the model
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1
15,1.0827,1.066954,0.425,0.295767
30,1.0165,0.887416,0.741667,0.67991
45,0.6468,0.593085,0.941667,0.940747
60,0.497,0.385771,0.975,0.974882
75,0.339,0.261615,0.991667,0.991656
90,0.2155,0.131506,1.0,1.0
105,0.1334,0.078985,0.983333,0.983218
120,0.0483,0.036293,1.0,1.0
135,0.0337,0.028468,0.991667,0.991673
150,0.0228,0.027871,0.991667,0.991673


[34m[1mwandb[0m: Adding directory to artifact (./results_3/checkpoint-75)... Done. 16.8s
[34m[1mwandb[0m: Adding directory to artifact (./results_3/checkpoint-150)... Done. 11.7s
[34m[1mwandb[0m: Adding directory to artifact (./results_3/checkpoint-225)... Done. 11.6s


TrainOutput(global_step=225, training_loss=0.2892832096748882, metrics={'train_runtime': 111.1256, 'train_samples_per_second': 16.198, 'train_steps_per_second': 2.025, 'total_flos': 7062294134592.0, 'train_loss': 0.2892832096748882, 'epoch': 3.0})

In [None]:
# Evaluate the trained model on the held-out test split (not the validation split)
# and display the resulting metrics
results_3 = trainer.evaluate(tokenized_datasets["test"])
results_3

{'eval_loss': 0.017555104568600655,
 'eval_accuracy': 1.0,
 'eval_f1': 1.0,
 'eval_runtime': 0.223,
 'eval_samples_per_second': 542.603,
 'eval_steps_per_second': 71.749,
 'epoch': 3.0}

In [None]:
# Run inference on the test split, then show predictions, true labels, and metrics
import numpy as np

predictions_3 = trainer.predict(tokenized_datasets["test"])

# Reduce the model's per-class outputs to a single class index per example
predicted_labels = np.argmax(predictions_3.predictions, axis=-1)
ground_truth_labels = predictions_3.label_ids
metrics = predictions_3.metrics

# Print each result with its caption, one per line
for caption, values in (
    ("Predicted labels:", predicted_labels),
    ("Ground truth labels:", ground_truth_labels),
    ("Metrics:", metrics),
):
    print(caption, values)

Predicted labels: [2 0 2 1 0 0 1 0 0 1 2 0 0 2 2 2 0 0 1 1 1 0 1 1 2 0 2 1 0 2 0 0 1 0 1 0 1
 2 2 2 0 0 1 1 2 0 0 0 0 1 1 1 2 2 2 2 2 2 2 1 2 0 0 0 0 1 0 1 0 0 0 0 2 0
 2 1 1 2 1 0 0 0 2 1 0 2 2 1 1 0 2 0 1 0 2 2 0 2 1 0 2 2 2 2 0 1 0 2 1 1 0
 1 0 2 2 2 0 1 1 0 2]
Ground truth labels: [2 0 2 1 0 0 1 0 0 1 2 0 0 2 2 2 0 0 1 1 1 0 1 1 2 0 2 1 0 2 0 0 1 0 1 0 1
 2 2 2 0 0 1 1 2 0 0 0 0 1 1 1 2 2 2 2 2 2 2 1 2 0 0 0 0 1 0 1 0 0 0 0 2 0
 2 1 1 2 1 0 0 0 2 1 0 2 2 1 1 0 2 0 1 0 2 2 0 2 1 0 2 2 2 2 0 1 0 2 1 1 0
 1 0 2 2 2 0 1 1 0 2]
Metrics: {'test_loss': 0.017555104568600655, 'test_accuracy': 1.0, 'test_f1': 1.0, 'test_runtime': 0.2742, 'test_samples_per_second': 441.286, 'test_steps_per_second': 58.352}


In [None]:
# Inspect the training output directory (saved checkpoints and the runs/ log folder)
!ls -lh results_3

total 16K
drwxr-xr-x 2 root root 4.0K Jul 15 23:38 checkpoint-150
drwxr-xr-x 2 root root 4.0K Jul 15 23:39 checkpoint-225
drwxr-xr-x 2 root root 4.0K Jul 15 23:37 checkpoint-75
drwxr-xr-x 3 root root 4.0K Jul 15 23:37 runs


In [None]:
# List the log directory that TensorBoard is pointed at for this run
!ls -lh results_3/runs

total 4.0K
drwxr-xr-x 2 root root 4.0K Jul 15 23:39 Jul15_23-37-32_80c0173f2f14


In [None]:
# Save the model currently held by the trainer to best_model_3.
# NOTE(review): this is the *best* checkpoint only if training_args sets
# load_best_model_at_end=True; that definition is outside this section — confirm.
trainer.save_model("best_model_3")
# Confirm the model, tokenizer, and training-args files were written
!ls best_model_3

config.json	   special_tokens_map.json  tokenizer.json     vocab.txt
model.safetensors  tokenizer_config.json    training_args.bin


In [None]:
# View TensorBoard
%tensorboard --logdir results_3/runs
"""removed output from this cell, tensorboard makes notebook too big for github and it doesn't display in github anyway"""

### Train the model, evaluating once per epoch

In [None]:
# Alter training arguments to eval_strategy by epoch:
# this configuration evaluates and saves a checkpoint once per epoch.
training_args_by_epoch = TrainingArguments(
    output_dir="./results_4",
    run_name="by_epoch",    # Name for wandb
    eval_strategy="epoch",
    save_strategy="epoch",  # Match save_strategy to eval_strategy when using load_best_model_at_end
    logging_steps=15,       # Log metrics every 15 steps
    learning_rate=2e-5,
    # Weight decay is a regularization technique that penalizes large weights,
    # encouraging the model to learn smaller, more generalized weights.
    # NOTE(review): the earlier note described this as L2 regularization (a squared-weight
    # term added to the loss). With an AdamW-style optimizer the decay is applied to the
    # weights directly rather than through the loss — confirm which optimizer this run uses.
    weight_decay=0.01,
    # Warmup gradually increases the learning rate from a very small value (often close to
    # zero) up to the configured learning rate over the first part of training; 0.1 here is
    # the fraction of training used for warmup. It stabilizes training, helps escape poor
    # initializations, and reduces early overfitting.
    warmup_ratio=0.1,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Reload the best checkpoint when training finishes; with the two options below left
    # commented out, "best" is judged by evaluation loss (lower is better).
    load_best_model_at_end=True,
    #metric_for_best_model="accuracy",   # default is evaluation loss
    #greater_is_better=True,             # default is False
    report_to=["tensorboard", "wandb"],  # Send logs to Weights & Biases and /runs folder for TensorBoard
)

# Display the full resolved argument set as the cell output
training_args_by_epoch

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.EPOCH,
eval_use_gather_object=False

In [None]:
from transformers import Trainer

# Define a new model: a fresh copy of the pretrained checkpoint with a
# classification head for the 3 intents
model_by_epoch = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=3,
    problem_type="single_label_classification",
)

# Update the trainer: rebind `trainer` to one that pairs the new model with the
# per-epoch training arguments (data, collator, tokenizer, and metrics unchanged)
trainer = Trainer(
    model=model_by_epoch,
    args=training_args_by_epoch,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Echo the Trainer so the cell output confirms it was created
trainer

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<transformers.trainer.Trainer at 0x7fa3d164c6d0>

In [None]:
# Train the model (fine-tunes model_by_epoch using training_args_by_epoch).
# train() is the last expression, so its return value is shown as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3717,0.261615,0.991667,0.991656
2,0.0248,0.027871,0.991667,0.991673
3,0.0138,0.020459,0.991667,0.991673


[34m[1mwandb[0m: Adding directory to artifact (./results_4/checkpoint-75)... Done. 15.1s
[34m[1mwandb[0m: Adding directory to artifact (./results_4/checkpoint-150)... Done. 22.3s
[34m[1mwandb[0m: Adding directory to artifact (./results_4/checkpoint-225)... Done. 18.9s


TrainOutput(global_step=225, training_loss=0.2892832057343589, metrics={'train_runtime': 112.8586, 'train_samples_per_second': 15.949, 'train_steps_per_second': 1.994, 'total_flos': 7062294134592.0, 'train_loss': 0.2892832057343589, 'epoch': 3.0})

In [None]:
# Inspect the training output directory (per-epoch checkpoints and the runs/ log folder)
!ls -lh results_4

total 16K
drwxr-xr-x 2 root root 4.0K Jul 15 23:40 checkpoint-150
drwxr-xr-x 2 root root 4.0K Jul 15 23:41 checkpoint-225
drwxr-xr-x 2 root root 4.0K Jul 15 23:39 checkpoint-75
drwxr-xr-x 3 root root 4.0K Jul 15 23:39 runs


In [None]:
# List the log directory that TensorBoard is pointed at for this run
!ls -lh results_4/runs

total 4.0K
drwxr-xr-x 2 root root 4.0K Jul 15 23:39 Jul15_23-39-49_80c0173f2f14


In [None]:
# Save the by-epoch trainer's model to best_model_4. training_args_by_epoch sets
# load_best_model_at_end=True, so after training the trainer should be holding the
# checkpoint with the lowest validation loss (the default selection metric).
trainer.save_model("best_model_4")
# Confirm the model, tokenizer, and training-args files were written
!ls best_model_4

config.json	   special_tokens_map.json  tokenizer.json     vocab.txt
model.safetensors  tokenizer_config.json    training_args.bin


In [None]:
# View TensorBoard
%tensorboard --logdir results_4/runs
"""removed output from this cell, tensorboard makes notebook too big for github and it doesn't display in github anyway"""

### Display all runs in TensorBoard together

In [None]:
!mkdir all_runs
!cp -r results_1/runs/* all_runs
!cp -r results_2/runs/* all_runs
!cp -r results_3/runs/* all_runs
!cp -r results_4/runs/* all_runs
!ls all_runs

Jul15_23-30-43_80c0173f2f14  Jul15_23-37-32_80c0173f2f14
Jul15_23-35-33_80c0173f2f14  Jul15_23-39-49_80c0173f2f14


In [None]:
# View TensorBoard
%tensorboard --logdir all_runs
"""removed output from this cell, tensorboard makes notebook too big for github and it doesn't display in github anyway"""

In [None]:
# Compress the best model files: recursively zip the best_model_3 directory
# into best_model.zip so it can be downloaded as a single file.
!zip -r best_model.zip best_model_3
# List the working directory to confirm the archive was created
!ls

  adding: best_model_3/ (stored 0%)
  adding: best_model_3/training_args.bin (deflated 51%)
  adding: best_model_3/tokenizer_config.json (deflated 75%)
  adding: best_model_3/tokenizer.json (deflated 71%)
  adding: best_model_3/model.safetensors (deflated 8%)
  adding: best_model_3/config.json (deflated 48%)
  adding: best_model_3/vocab.txt (deflated 53%)
  adding: best_model_3/special_tokens_map.json (deflated 42%)
all_runs      best_model_2  best_model_4    results_1  results_3  sample_data
best_model_1  best_model_3  best_model.zip  results_2  results_4  wandb


In [None]:
# Save the model locally to my machine.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime,
# so this cell is expected to fail elsewhere — confirm before running outside Colab.
from google.colab import files

# Hand the zipped model to Colab's download helper
files.download("best_model.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>