# **News Topic Classifier Using BERT**

In [12]:
pip install transformers datasets torch scikit-learn streamlit




In [13]:
from datasets import load_dataset
dataset = load_dataset("HuyAugie/Smaller_AG_News_Dataset")

In [14]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 5900
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 4000
    })
})


In [15]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128
    )

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset.set_format("torch")


In [16]:
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5900
    })
    test: Dataset({
        features: ['label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4000
    })
})


In [17]:
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)

    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="weighted")

    return {
        "accuracy": accuracy,
        "f1_score": f1
    }


In [8]:
from transformers import Trainer, TrainingArguments
import os
os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1 Score
1,No log,0.283441,0.90975,0.90941
2,0.350500,0.283843,0.9185,0.918531


TrainOutput(global_step=738, training_loss=0.27723411880534515, metrics={'train_runtime': 321.0206, 'train_samples_per_second': 36.758, 'train_steps_per_second': 2.299, 'total_flos': 776191551283200.0, 'train_loss': 0.27723411880534515, 'epoch': 2.0})

In [9]:
results = trainer.evaluate()
print(results)

{'eval_loss': 0.28344130516052246, 'eval_accuracy': 0.90975, 'eval_f1_score': 0.9094103268248525, 'eval_runtime': 26.8412, 'eval_samples_per_second': 149.025, 'eval_steps_per_second': 9.314, 'epoch': 2.0}


In [10]:
model.save_pretrained("news_bert_model")
tokenizer.save_pretrained("news_bert_model")

('news_bert_model/tokenizer_config.json',
 'news_bert_model/special_tokens_map.json',
 'news_bert_model/vocab.txt',
 'news_bert_model/added_tokens.json')

In [11]:
import streamlit as st
import torch
from transformers import BertTokenizer, BertForSequenceClassification

labels = ["World", "Sports", "Business", "Sci/Tech"]

tokenizer = BertTokenizer.from_pretrained("news_bert_model")
model = BertForSequenceClassification.from_pretrained("news_bert_model")
model.eval()

st.title("ðŸ“° News Topic Classifier (BERT)")

text = st.text_area("Enter News Headline")

if st.button("Predict"):
    if text.strip() == "":
        st.warning("Please enter text")
    else:
        inputs = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=True
        )
        with torch.no_grad():
            outputs = model(**inputs)
        prediction = torch.argmax(outputs.logits, dim=1).item()
        st.success(f"Predicted Category: **{labels[prediction]}**")


2026-01-04 14:42:11.877 
  command:

    streamlit run /usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2026-01-04 14:42:11.887 Session state does not function when running a script without `streamlit run`
