In [25]:
import numpy as np
import torch as nn
import pandas as pd
from sklearn.feature_extraction import text
from datasets import load_dataset, DatasetDict

# Load the dataset and carve reproducible train / test / valid splits out of it.
# Only the 'train' split is requested from the hub, so the held-out sets are
# produced locally by two successive random splits.
SPLIT_SEED = 42  # fixed so the same rows land in the same split on every run

# First split: 70% stays as training data, 30% is held out.
train_test_dataset = load_dataset("sms_spam", split='train').train_test_split(
    test_size=0.3, seed=SPLIT_SEED
)
# Second split of the held-out 30%: 60% of it becomes 'test', the remaining
# 40% becomes 'valid' (note the 'train' side of this split is the validation set).
test_valid_dataset = train_test_dataset['test'].train_test_split(
    test_size=0.6, seed=SPLIT_SEED
)
dataset = DatasetDict({
    'train': train_test_dataset['train'],
    'test': test_valid_dataset['test'],
    'valid': test_valid_dataset['train']})
print(dataset)
print(dataset['train'][0])


DatasetDict({
    train: Dataset({
        features: ['sms', 'label'],
        num_rows: 3901
    })
    test: Dataset({
        features: ['sms', 'label'],
        num_rows: 1004
    })
    valid: Dataset({
        features: ['sms', 'label'],
        num_rows: 669
    })
})
{'sms': 'You will recieve your tone within the next 24hrs. For Terms and conditions please see Channel U Teletext Pg 750\n', 'label': 1}


In [28]:
import transformers
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments

# Load the tokenizer from the same checkpoint as the model that is fine-tuned
# later in this notebook ("bert-large-uncased"), so the vocabulary and special
# tokens are guaranteed to match the model's embedding table.
tokenizer = BertTokenizerFast.from_pretrained('bert-large-uncased')

def preprocess_function(examples):
    """Tokenize the "sms" text of a batch of examples.

    Args:
        examples: Mapping with an "sms" entry holding the message text(s);
            with ``Dataset.map(..., batched=True)`` this is a list of strings.

    Returns:
        The result of calling the module-level ``tokenizer`` on those texts,
        with over-long messages truncated.
    """
    # No padding here on purpose: the Trainer is constructed with this
    # tokenizer, so it pads each mini-batch to its own longest sequence.
    # Padding inside map() would instead pad every row to the longest message
    # of its whole map chunk and store those pad tokens in the dataset.
    return tokenizer(examples["sms"], truncation=True)

# Run the tokenizer over every split of the DatasetDict; batched mode passes
# preprocess_function many rows per call instead of one at a time.
tokenized_dataset = dataset.map(
    function=preprocess_function,
    batched=True,
)
print(tokenized_dataset)

Map: 100%|██████████| 3901/3901 [00:00<00:00, 19020.56 examples/s]
Map: 100%|██████████| 1004/1004 [00:00<00:00, 18337.19 examples/s]
Map: 100%|██████████| 669/669 [00:00<00:00, 5177.69 examples/s]

DatasetDict({
    train: Dataset({
        features: ['sms', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3901
    })
    test: Dataset({
        features: ['sms', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1004
    })
    valid: Dataset({
        features: ['sms', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 669
    })
})





In [29]:
def compute_metrics(eval_pred):
    """Turn raw evaluation outputs into classification metrics.

    Args:
        eval_pred: The (logits, labels) pair the Trainer passes after an
            evaluation loop.

    Returns:
        Dict with "accuracy", "precision", "recall" and "f1". Label 1 is
        treated as the positive class (presumably spam -- confirm against the
        dataset card). Ratios with an empty denominator are reported as 0.0.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    true_pos = int(np.sum((predictions == 1) & (labels == 1)))
    false_pos = int(np.sum((predictions == 1) & (labels != 1)))
    false_neg = int(np.sum((predictions != 1) & (labels == 1)))
    precision = true_pos / (true_pos + false_pos) if (true_pos + false_pos) else 0.0
    recall = true_pos / (true_pos + false_neg) if (true_pos + false_neg) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return {
        "accuracy": float(np.mean(predictions == labels)),
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }


# Binary classifier head (num_labels = 2) on top of the pretrained encoder; the
# head weights are freshly initialised and learned during fine-tuning.
model = BertForSequenceClassification.from_pretrained("bert-large-uncased", num_labels = 2)

# NOTE(review): with 3901 training rows and batch size 16 on a single device,
# 3 epochs is roughly 730 optimizer steps, so warmup_steps=500 covers most of
# training -- confirm this is intended.
training_args = TrainingArguments(
    output_dir='./results',          # checkpoints are written here
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="epoch",     # evaluate on eval_dataset after every epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['valid'],
    tokenizer=tokenizer,             # also makes the Trainer pad batches with this tokenizer
    compute_metrics=compute_metrics,
)

trainer.train()

# Keep and print both results: a notebook cell only displays its last
# expression, so a bare trainer.evaluate() in the middle would be lost.
valid_metrics = trainer.evaluate()
# A distinct key prefix keeps the test metrics separate from the "eval_" ones.
test_metrics = trainer.evaluate(tokenized_dataset['test'], metric_key_prefix="test")
print("valid:", valid_metrics)
print("test:", test_metrics)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  0%|          | 0/1115 [2:29:49<?, ?it/s]
  2%|▏         | 26/1220 [07:09<4:00:43, 12.10s/it]