In [17]:
%pip install sentencepiece protobuf

Collecting protobuf
  Downloading protobuf-5.28.0-cp38-abi3-macosx_10_9_universal2.whl.metadata (592 bytes)
Downloading protobuf-5.28.0-cp38-abi3-macosx_10_9_universal2.whl (414 kB)
Installing collected packages: protobuf
Successfully installed protobuf-5.28.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
from datasets import DatasetDict

dataset = DatasetDict.load_from_disk("./article-titles.hf")
dataset



DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 2434
    })
    validation: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 215
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 215
    })
})

In [3]:
model_name = "albert/albert-base-v2"
your_path = 'classify-articles'

In [4]:
from collections import Counter

train_label_distribution = Counter(dataset['train']['label'])
test_label_distribution = Counter(dataset['test']['label'])

print("Training Label Distribution:", train_label_distribution)
print("Test Label Distribution:", test_label_distribution)

Training Label Distribution: Counter({'science': 497, 'sports': 493, 'economy': 489, 'politics': 480, 'technology': 475})
Test Label Distribution: Counter({'sports': 50, 'science': 47, 'politics': 47, 'technology': 36, 'economy': 35})


In [5]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

label_encoder.fit(dataset['train']['label'])

def encode_labels(example):
    return {'encoded_label': label_encoder.transform([example['label']])[0]}

for split in dataset:
    dataset[split] = dataset[split].map(encode_labels, batched=False)

In [6]:
from transformers import AutoConfig

unique_labels = sorted(list(set(dataset['train']['label'])))
id2label = {i: label for i, label in enumerate(unique_labels)}
label2id = {label: i for i, label in enumerate(unique_labels)}

config = AutoConfig.from_pretrained(model_name)
config.id2label = id2label
config.label2id = label2id

# Verify the correct labels
print("ID to Label Mapping:", config.id2label)
print("Label to ID Mapping:", config.label2id)

ID to Label Mapping: {0: 'economy', 1: 'politics', 2: 'science', 3: 'sports', 4: 'technology'}
Label to ID Mapping: {'economy': 0, 'politics': 1, 'science': 2, 'sports': 3, 'technology': 4}


In [7]:
from transformers import AlbertForSequenceClassification, AlbertTokenizer

tokenizer = AlbertTokenizer.from_pretrained(model_name)
model = AlbertForSequenceClassification.from_pretrained(model_name, config=config)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def filter_invalid_content(example):
    return isinstance(example['text'], str)

dataset = dataset.filter(filter_invalid_content, batched=False)

def encode_data(batch):
    tokenized_inputs = tokenizer(batch["text"], padding=True, truncation=True, max_length=256)
    tokenized_inputs["labels"] = batch["encoded_label"]
    return tokenized_inputs

dataset_encoded = dataset.map(encode_data, batched=True)
dataset_encoded

Filter:   0%|          | 0/2434 [00:00<?, ? examples/s]

Filter:   0%|          | 0/215 [00:00<?, ? examples/s]

Filter:   0%|          | 0/215 [00:00<?, ? examples/s]

Map:   0%|          | 0/2434 [00:00<?, ? examples/s]

Map:   0%|          | 0/215 [00:00<?, ? examples/s]

Map:   0%|          | 0/215 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__', 'encoded_label', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2434
    })
    validation: Dataset({
        features: ['text', 'label', '__index_level_0__', 'encoded_label', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 215
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__', 'encoded_label', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 215
    })
})

In [9]:
dataset_encoded.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


In [10]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer)

In [11]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
import numpy as np

label_encoder = LabelEncoder()
label_encoder.fit(unique_labels)

def per_label_accuracy(y_true, y_pred, labels):
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    correct_predictions = cm.diagonal()
    label_totals = cm.sum(axis=1)
    per_label_acc = np.divide(correct_predictions, label_totals, out=np.zeros_like(correct_predictions, dtype=float), where=label_totals != 0)
    return dict(zip(labels, per_label_acc))

In [12]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    decoded_labels = label_encoder.inverse_transform(labels)
    decoded_preds = label_encoder.inverse_transform(preds)

    precision = precision_score(decoded_labels, decoded_preds, average='weighted')
    recall = recall_score(decoded_labels, decoded_preds, average='weighted')
    f1 = f1_score(decoded_labels, decoded_preds, average='weighted')
    acc = accuracy_score(decoded_labels, decoded_preds)

    labels_list = list(label_encoder.classes_)
    per_label_acc = per_label_accuracy(decoded_labels, decoded_preds, labels_list)

    per_label_acc_metrics = {}
    for label, accuracy in per_label_acc.items():
        label_key = f"accuracy_label_{label}"
        per_label_acc_metrics[label_key] = accuracy

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        **per_label_acc_metrics
    }

In [13]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir=your_path,
    num_train_epochs=3,
    warmup_steps=500,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=100,
    learning_rate=2e-5,
    save_steps=1000,
    gradient_accumulation_steps=2
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_encoded['train'],
    eval_dataset=dataset_encoded['test'],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()



  0%|          | 0/228 [00:00<?, ?it/s]

{'loss': 1.6592, 'grad_norm': 17.214561462402344, 'learning_rate': 4.0000000000000003e-07, 'epoch': 0.13}
{'loss': 1.6322, 'grad_norm': 12.780923843383789, 'learning_rate': 8.000000000000001e-07, 'epoch': 0.26}
{'loss': 1.5926, 'grad_norm': 30.824169158935547, 'learning_rate': 1.2000000000000002e-06, 'epoch': 0.39}
{'loss': 1.5943, 'grad_norm': 17.90578842163086, 'learning_rate': 1.6000000000000001e-06, 'epoch': 0.52}
{'loss': 1.5476, 'grad_norm': 17.254802703857422, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.65}
{'loss': 1.5345, 'grad_norm': 23.29121971130371, 'learning_rate': 2.4000000000000003e-06, 'epoch': 0.78}
{'loss': 1.5421, 'grad_norm': 21.84389305114746, 'learning_rate': 2.8000000000000003e-06, 'epoch': 0.92}
{'loss': 1.4884, 'grad_norm': 24.690324783325195, 'learning_rate': 3.2000000000000003e-06, 'epoch': 1.05}
{'loss': 1.4151, 'grad_norm': 25.043909072875977, 'learning_rate': 3.6000000000000003e-06, 'epoch': 1.18}
{'loss': 1.3703, 'grad_norm': 25.61355209350586, '

  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 1.377465844154358, 'eval_accuracy': 0.4930232558139535, 'eval_f1': 0.42383332698769244, 'eval_precision': 0.6099528142905292, 'eval_recall': 0.4930232558139535, 'eval_accuracy_label_economy': 0.8, 'eval_accuracy_label_politics': 0.02127659574468085, 'eval_accuracy_label_science': 0.7021276595744681, 'eval_accuracy_label_sports': 0.72, 'eval_accuracy_label_technology': 0.2222222222222222, 'eval_runtime': 0.9308, 'eval_samples_per_second': 230.989, 'eval_steps_per_second': 15.041, 'epoch': 1.31}
{'loss': 1.3219, 'grad_norm': 25.733768463134766, 'learning_rate': 4.4e-06, 'epoch': 1.44}
{'loss': 1.2374, 'grad_norm': 19.24130630493164, 'learning_rate': 4.800000000000001e-06, 'epoch': 1.57}
{'loss': 1.1398, 'grad_norm': 20.945722579956055, 'learning_rate': 5.2e-06, 'epoch': 1.7}
{'loss': 1.049, 'grad_norm': 18.362749099731445, 'learning_rate': 5.600000000000001e-06, 'epoch': 1.83}
{'loss': 0.9089, 'grad_norm': 21.393177032470703, 'learning_rate': 6e-06, 'epoch': 1.96}
{'loss': 

  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.4494597017765045, 'eval_accuracy': 0.8976744186046511, 'eval_f1': 0.9004308622967181, 'eval_precision': 0.9133808113740947, 'eval_recall': 0.8976744186046511, 'eval_accuracy_label_economy': 0.9428571428571428, 'eval_accuracy_label_politics': 0.8936170212765957, 'eval_accuracy_label_science': 0.9148936170212766, 'eval_accuracy_label_sports': 0.96, 'eval_accuracy_label_technology': 0.75, 'eval_runtime': 0.5691, 'eval_samples_per_second': 377.764, 'eval_steps_per_second': 24.599, 'epoch': 2.61}
{'loss': 0.3908, 'grad_norm': 16.43819808959961, 'learning_rate': 8.400000000000001e-06, 'epoch': 2.75}
{'loss': 0.3735, 'grad_norm': 8.563356399536133, 'learning_rate': 8.8e-06, 'epoch': 2.88}
{'train_runtime': 56.2949, 'train_samples_per_second': 129.71, 'train_steps_per_second': 4.05, 'train_loss': 1.0989631905890347, 'epoch': 2.98}


TrainOutput(global_step=228, training_loss=1.0989631905890347, metrics={'train_runtime': 56.2949, 'train_samples_per_second': 129.71, 'train_steps_per_second': 4.05, 'total_flos': 9501472156644.0, 'train_loss': 1.0989631905890347, 'epoch': 2.980392156862745})

In [23]:
trainer.evaluate()


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_loss': 0.38190317153930664,
 'eval_accuracy': 0.9069767441860465,
 'eval_f1': 0.9060619059788495,
 'eval_precision': 0.9125925762408804,
 'eval_recall': 0.9069767441860465,
 'eval_accuracy_label_economy': 0.9428571428571428,
 'eval_accuracy_label_politics': 0.9574468085106383,
 'eval_accuracy_label_science': 0.9361702127659575,
 'eval_accuracy_label_sports': 0.96,
 'eval_accuracy_label_technology': 0.6944444444444444,
 'eval_runtime': 0.6159,
 'eval_samples_per_second': 349.062,
 'eval_steps_per_second': 22.73,
 'epoch': 2.980392156862745}

In [22]:
trainer.save_model(your_path)
trainer.save_state()

In [18]:

from transformers import pipeline
pipe = pipeline('text-classification', model='classify-articles')

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [20]:
example_titles = [
    "Los Angeles area high school student dies after injury at football game",
    "Jannik Sinner bests American Taylor Fritz for U.S. Open men's title",
    "30 people injured in alleged drunk driving incident after man drives into building",
    "Steve Kornacki: How Harris and Trump are polling in states that could decide the election",
    "Manhunt underway for suspect in Kentucky mass shooting near highway",
    "Liz Cheney says it's 'not enough' for anti-Trump Republicans to vote for someone other than Harris",
    "AR-15 recovered near I-75 shooting scene in Kentucky as manhunt continues",
    "‘Beetlejuice Beetlejuice’ jolts box office with $110 million opening weekend",
    "Maya Rudolph nabs sixth Emmy and Angela Basset wins her first at Creative Arts Emmys",
    "Kendrick Lamar will headline Super Bowl LIX halftime show",
]

for title in example_titles:
    result = pipe(title)
    print(f"Title: {title}")
    print(f"Output: {result[0]['label']}")

Title: Los Angeles area high school student dies after injury at football game
Output: sports
Title: Jannik Sinner bests American Taylor Fritz for U.S. Open men's title
Output: sports
Title: 30 people injured in alleged drunk driving incident after man drives into building
Output: technology
Title: Steve Kornacki: How Harris and Trump are polling in states that could decide the election
Output: politics
Title: Manhunt underway for suspect in Kentucky mass shooting near highway
Output: science
Title: Liz Cheney says it's 'not enough' for anti-Trump Republicans to vote for someone other than Harris
Output: politics
Title: AR-15 recovered near I-75 shooting scene in Kentucky as manhunt continues
Output: sports
Title: ‘Beetlejuice Beetlejuice’ jolts box office with $110 million opening weekend
Output: technology
Title: Maya Rudolph nabs sixth Emmy and Angela Basset wins her first at Creative Arts Emmys
Output: sports
Title: Kendrick Lamar will headline Super Bowl LIX halftime show
Output: 

In [21]:
tokenizer.push_to_hub("jamesbaskerville/classify-article-titles")
trainer.push_to_hub("jamesbaskerville/classify-article-titles")

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/46.8M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/jamesbaskerville/classify-articles/commit/0a830b49b5e61d9d6a591ed5e2994316da5fbd0c', commit_message='jamesbaskerville/classify-article-titles', commit_description='', oid='0a830b49b5e61d9d6a591ed5e2994316da5fbd0c', pr_url=None, pr_revision=None, pr_num=None)