In [68]:
from transformers import pipeline, DistilBertTokenizer, DistilBertForSequenceClassification, TrainingArguments, Trainer, DistilBertConfig, EarlyStoppingCallback
from datasets import load_dataset
import os
import torch

In [69]:
# Load the classifier with ready-to-use model
classifier = pipeline('text-classification', model='distilbert-base-uncased-finetuned-sst-2-english')

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [70]:
text = "Apple unveils new products at its annual technology conference."
result = classifier(text)
print(result)

[{'label': 'POSITIVE', 'score': 0.9976915121078491}]


In [71]:
dataset = load_dataset("ag_news")

In [72]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [73]:
# Print the first 5 samples
for i in range(5):
    print(dataset["train"][i])

{'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", 'label': 2}
{'text': 'Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.', 'label': 2}
{'text': "Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\\about the economy and the outlook for earnings are expected to\\hang over the stock market next week during the depth of the\\summer doldrums.", 'label': 2}
{'text': 'Iraq Halts Oil Exports from Main Southern Pipeline (Reuters) Reuters - Authorities have halted oil export\\flows from the main pipeline in southern Iraq after\\intelligence showed a rebel militia could strike\\infrastructure, an oil official said on Saturday.', 'lab

In [74]:
# Asign label names to the labels for the first 5 samples in the training set
for i in range(5):
    print(dataset["train"][i]["label"], dataset["train"].features["label"].int2str(dataset["train"][i]["label"]))

2 Business
2 Business
2 Business
2 Business
2 Business


In [75]:
# Print label names
print(dataset["train"].features["label"])

ClassLabel(names=['World', 'Sports', 'Business', 'Sci/Tech'], id=None)


In [76]:
# Print length of the training set and the test set
print(len(dataset["train"]), len(dataset["test"]))

120000 7600


In [77]:
# Adjust loaded data to the format required by the distilbert model

# Initialize the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize the function
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Tokenize the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [15]:
# Set the amount of categories for the classification
num_labels = 4

# Load the model for classification
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_labels)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir="/content/output",               # Folder for the results
    eval_strategy="epoch",          # Evaluate after each epoch
    save_strategy="epoch",          # Save after each epoch
    save_steps=500,                 # Save every 500 steps
    learning_rate=2e-5,                   # Learning rate
    per_device_train_batch_size=16,        # Size of the batch for training
    per_device_eval_batch_size=8,         # Size of the batch for evaluation
    num_train_epochs=3,                   # Number of epochs
    weight_decay=0.01,              # Regularization parameter
    load_best_model_at_end=True,   # Load best model at the end
    save_total_limit=2,   # Set limit to 2 to save max amount of checkpoints
)

In [25]:
trainer = Trainer(
    model=model,                           # DistilBert model
    args=training_args,                    # Training arguments
    train_dataset=tokenized_datasets["train"],  # Training dataset
    eval_dataset=tokenized_datasets["test"]     # Evaluation dataset
)

In [26]:
trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,0.2031,0.177845
2,0.1349,0.179587
3,0.086,0.210289


TrainOutput(global_step=22500, training_loss=0.15498081732855903, metrics={'train_runtime': 4482.511, 'train_samples_per_second': 80.312, 'train_steps_per_second': 5.02, 'total_flos': 4.768996442112e+16, 'train_loss': 0.15498081732855903, 'epoch': 3.0})

In [27]:
results = trainer.evaluate()

# Print results
print("Evaluation results:", results)

Evaluation results: {'eval_loss': 0.17784465849399567, 'eval_runtime': 33.2247, 'eval_samples_per_second': 228.746, 'eval_steps_per_second': 28.593, 'epoch': 3.0}


In [32]:
print(dataset['train'].features['label'].names)

['World', 'Sports', 'Business', 'Sci/Tech']


In [33]:
print(model.config)

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.44.2",
  "vocab_size": 30522
}



In [34]:
id2label = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}
label2id = {"World": 0, "Sports": 1, "Business": 2, "Sci/Tech": 3}

num_labels = len(id2label)
print(num_labels)

4


In [36]:
config = DistilBertConfig.from_pretrained("distilbert-base-uncased",
                                          id2label=id2label,
                                          label2id=label2id)

In [37]:
model_v2 = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", config=config)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [78]:
from torch.utils.data import DataLoader

train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["test"]

In [45]:
training_args = TrainingArguments(
    output_dir="/content/output/v2",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_steps=500,
    save_total_limit=2,
    disable_tqdm=False,
    save_strategy="epoch",
)



In [46]:
trainer = Trainer(
    model=model_v2,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [47]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.2013,0.183875
2,0.1362,0.185619
3,0.0859,0.213689


TrainOutput(global_step=22500, training_loss=0.1543848668416341, metrics={'train_runtime': 4455.957, 'train_samples_per_second': 80.791, 'train_steps_per_second': 5.049, 'total_flos': 4.768996442112e+16, 'train_loss': 0.1543848668416341, 'epoch': 3.0})

In [51]:
training_args_v3 = TrainingArguments(
    output_dir="/content/output/v3",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_steps=500,
    save_total_limit=2,
    disable_tqdm=False,
    save_strategy="epoch",
)



In [52]:
model_v3 = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", config=config)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [54]:
trainer_v3 = Trainer(
    model=model_v3,
    args=training_args_v3,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [56]:
print(len(dataset["train"]), len(dataset["test"]))

120000 7600


In [83]:
# Adjust loaded data to the format required by the distilbert model

# Initialize the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize the function
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Tokenize the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)



In [84]:
model_v4 = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", config=config)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [85]:
training_args_v4 = TrainingArguments(
    output_dir="/content/output/v4",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_steps=500,
    save_total_limit=2,
    disable_tqdm=False,
    save_strategy="epoch",
)



In [86]:
trainer_v4 = Trainer(
    model=model_v4,
    args=training_args_v4,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

In [87]:
trainer_v4.train()

Epoch,Training Loss,Validation Loss
1,0.1963,0.184391
2,0.1593,0.176199
3,0.1228,0.178125


TrainOutput(global_step=11250, training_loss=0.1769073432074653, metrics={'train_runtime': 4203.7054, 'train_samples_per_second': 85.639, 'train_steps_per_second': 2.676, 'total_flos': 4.768996442112e+16, 'train_loss': 0.1769073432074653, 'epoch': 3.0})

In [89]:
from transformers import AutoModelForSequenceClassification

best_model = AutoModelForSequenceClassification.from_pretrained("/content/output/v4/checkpoint-11250")

In [90]:
test_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])

print("Test results:", test_results)

Test results: {'eval_loss': 0.1838751882314682, 'eval_runtime': 31.8407, 'eval_samples_per_second': 238.688, 'eval_steps_per_second': 14.918, 'epoch': 3.0}


In [92]:
output_dir = "/content/output/final_model"

# Save the model
model.save_pretrained(output_dir)

# Save the tokenizer
tokenizer.save_pretrained(output_dir)

('/content/output/final_model/tokenizer_config.json',
 '/content/output/final_model/special_tokens_map.json',
 '/content/output/final_model/vocab.txt',
 '/content/output/final_model/added_tokens.json')

In [93]:
!zip -r /content/output.zip /content/output/

  adding: content/output/ (stored 0%)
  adding: content/output/final_model/ (stored 0%)
  adding: content/output/final_model/config.json (deflated 50%)
  adding: content/output/final_model/model.safetensors (deflated 8%)
  adding: content/output/final_model/special_tokens_map.json (deflated 42%)
  adding: content/output/final_model/tokenizer_config.json (deflated 75%)
  adding: content/output/final_model/vocab.txt (deflated 53%)
  adding: content/output/checkpoint-22500/ (stored 0%)
  adding: content/output/checkpoint-22500/trainer_state.json (deflated 80%)
  adding: content/output/checkpoint-22500/config.json (deflated 50%)
  adding: content/output/checkpoint-22500/optimizer.pt (deflated 16%)
  adding: content/output/checkpoint-22500/training_args.bin (deflated 51%)
  adding: content/output/checkpoint-22500/model.safetensors (deflated 8%)
  adding: content/output/checkpoint-22500/scheduler.pt (deflated 56%)
  adding: content/output/checkpoint-22500/rng_state.pth (deflated 25%)
  addin

In [None]:
from google.colab import files
files.download("/content/output.zip")

In [97]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_v4.to(device)

inputs = tokenizer("""My favourite sport is football. I love playing football as well as watching it. I think this is a great sport, because it is a team game, but there also is a place for great individualities in it. The players must get on well with one another and understand one another very well. They have to be a real team to achieve a success. They should not be egoists, craving for personal success, but they should all work for the victory. When, for example, somebody is near to the opposing team's goal and he should pass the ball to another player, but instead of that he shoots and misses, this is not a good example of "team spirit".""",
                    return_tensors="pt").to(device)

with torch.no_grad():
    outputs = model_v4(**inputs)
    logits = outputs.logits

predicted_class = torch.argmax(logits, dim=-1).item()
print("Predicted class:", predicted_class)

Predicted class: 1


In [98]:
id2label = model_v4.config.id2label

predicted_class_name = id2label[predicted_class]

print("Predicted class name:", predicted_class_name)

Predicted class name: Sports
