In [6]:
import numpy as np
import torch as nn
import pandas as pd
from sklearn.feature_extraction import text
from datasets import load_dataset, DatasetDict

# Load the dataset    
train_test_dataset = (load_dataset("sms_spam", split = 'train')).train_test_split(test_size=0.3)
test_valid_dataset = train_test_dataset['test'].train_test_split(test_size=0.6)

# Split the dataset into train, test and validation sets
dataset = DatasetDict({
    'train': train_test_dataset['train'],
    'test': test_valid_dataset['test'],
    'valid': test_valid_dataset['train']})
print(dataset)
print(dataset['train'][0])


DatasetDict({
    train: Dataset({
        features: ['sms', 'label'],
        num_rows: 3901
    })
    test: Dataset({
        features: ['sms', 'label'],
        num_rows: 1004
    })
    valid: Dataset({
        features: ['sms', 'label'],
        num_rows: 669
    })
})
{'sms': 'if you text on your way to cup stop that should work. And that should be BUS\n', 'label': 0}


In [7]:
import transformers
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments

tokenizer = BertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenize the dataset
def preprocess_function(examples):
    return tokenizer(examples["sms"], truncation=True, padding=True)

# Tokenize all the examples
tokenized_dataset = dataset.map(preprocess_function, batched=True)

print(tokenized_dataset)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizerFast'.
Map:   0%|          | 0/3901 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 3901/3901 [00:00<00:00, 18352.39 examples/s]
Map: 100%|██████████| 1004/1004 [00:00<00:00, 19374.74 examples/s]
Map: 100%|██████████| 669/669 [00:00<00:00, 25704.82 examples/s]

DatasetDict({
    train: Dataset({
        features: ['sms', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3901
    })
    test: Dataset({
        features: ['sms', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1004
    })
    valid: Dataset({
        features: ['sms', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 669
    })
})





In [8]:
import evaluate

model = BertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels = 2)

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=2,              
    per_device_train_batch_size=16,   
    per_device_eval_batch_size=16,   
    warmup_steps=500,                
    weight_decay=0.01,               
    logging_dir='./logs',            
    evaluation_strategy="epoch",    
)

# Define the trainer
trainer = Trainer(
    model=model,                   
    args=training_args,                 
    train_dataset=tokenized_dataset['train'],      
    eval_dataset=tokenized_dataset['valid'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics           
)

# Train the model
trainer.train()

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'enc

{'eval_loss': 0.12883734703063965, 'eval_accuracy': 0.9656203288490284, 'eval_runtime': 5.5965, 'eval_samples_per_second': 119.54, 'eval_steps_per_second': 7.505, 'epoch': 1.0}


                                                 
100%|██████████| 488/488 [06:42<00:00,  1.21it/s]

{'eval_loss': 0.0907294973731041, 'eval_accuracy': 0.9865470852017937, 'eval_runtime': 5.3485, 'eval_samples_per_second': 125.081, 'eval_steps_per_second': 7.853, 'epoch': 2.0}
{'train_runtime': 402.3661, 'train_samples_per_second': 19.39, 'train_steps_per_second': 1.213, 'train_loss': 0.18576306202372567, 'epoch': 2.0}





TrainOutput(global_step=488, training_loss=0.18576306202372567, metrics={'train_runtime': 402.3661, 'train_samples_per_second': 19.39, 'train_steps_per_second': 1.213, 'train_loss': 0.18576306202372567, 'epoch': 2.0})

In [9]:
# Evaluate the model on the validation set
trainer.evaluate(tokenized_dataset['valid'])

# Evaluate the model on the test set
trainer.evaluate(tokenized_dataset['test'])

100%|██████████| 42/42 [00:05<00:00,  8.14it/s]
100%|██████████| 63/63 [00:12<00:00,  5.04it/s]


{'eval_loss': 0.14651179313659668,
 'eval_accuracy': 0.9701195219123506,
 'eval_runtime': 12.7718,
 'eval_samples_per_second': 78.611,
 'eval_steps_per_second': 4.933,
 'epoch': 2.0}