<a href="https://colab.research.google.com/github/jahanvi513/MAT496-LLM/blob/main/Project2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets -q

In [None]:
!pip install datasets --upgrade -q

In [None]:
!pip install evaluate -q

In [None]:
import torch
# Show whether PyTorch can see a CUDA device in this runtime.
torch.cuda.is_available()

True

In [None]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import get_scheduler
from evaluate import load
from tqdm import tqdm

In [None]:
# Load the "ag_news" dataset through the Hugging Face `datasets` loader.
dataset = load_dataset("ag_news")

# Peek at the first training record to see which fields each example carries.
first_train_example = dataset["train"][0]
print("Sample:", first_train_example)

Sample: {'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", 'label': 2}


In [None]:
# Checkpoint name shared by the tokenizer and the classifier below.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
# num_labels=4 sizes the classification head for four output classes.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=4)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def tokenize_function(example):
    """Tokenize the "text" field of a batch of examples.

    Sequences are truncated but deliberately left unpadded: padding is done
    per batch by the data collator passed to the DataLoaders, so each batch is
    only as long as its longest member instead of always being padded out to
    the tokenizer's maximum length.

    Args:
        example: Mapping with a "text" entry (a list of strings when invoked
            through ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer's output for the given text.
    """
    return tokenizer(example["text"], truncation=True)


# Tokenize every split; batched=True hands the tokenizer many rows per call.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
# NOTE: training currently uses a small subset of the full dataset because of
# system constraints; raise the subset sizes below to use more data.
TRAIN_SUBSET_SIZE = 1000
EVAL_SUBSET_SIZE = 200
BATCH_SIZE = 16
SUBSET_SEED = 42

# Build the model-ready view under a new name instead of reassigning
# `tokenized_datasets`, so re-running this cell does not fail on an
# already-removed "text" column.
model_ready_datasets = (
    tokenized_datasets
    .remove_columns(["text"])           # raw strings are not model inputs
    .rename_column("label", "labels")   # the model's forward() expects "labels"
)
model_ready_datasets.set_format("torch")

# The collator pads each batch when it is assembled by the DataLoader.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Seeded shuffle before select() so the subsets are the same on every run.
train_dataset = (
    model_ready_datasets["train"].shuffle(seed=SUBSET_SEED).select(range(TRAIN_SUBSET_SIZE))
)
test_dataset = (
    model_ready_datasets["test"].shuffle(seed=SUBSET_SEED).select(range(EVAL_SUBSET_SIZE))
)

train_dataloader = DataLoader(
    train_dataset, shuffle=True, batch_size=BATCH_SIZE, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, collate_fn=data_collator
)

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)

# Step budget for the schedule: one optimizer step per batch, 10 passes
# over the training loader.
steps_per_epoch = len(train_dataloader)
num_training_steps = steps_per_epoch * 10

# Linear schedule with no warmup, spanning the whole step budget.
scheduler_kwargs = dict(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
lr_scheduler = get_scheduler("linear", **scheduler_kwargs)

In [None]:
# Derive the epoch count from the scheduler's step budget so the loop length
# and the learning-rate schedule cannot drift apart if one of them is edited.
num_epochs = num_training_steps // len(train_dataloader)

model.train()  # put the model in training mode
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}")
    progress_bar = tqdm(train_dataloader)
    for batch in progress_bar:
        # Move every tensor of the batch onto the same device as the model.
        batch = {k: v.to(device) for k, v in batch.items()}
        # The loss is read straight from the model output.
        loss = model(**batch).loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        # Clear gradients so they do not accumulate into the next step.
        optimizer.zero_grad()
        # Show the loss of the most recent batch on the progress bar.
        progress_bar.set_description(f"Loss: {loss.item():.4f}")


Epoch 1


Loss: 0.4279: 100%|██████████| 63/63 [01:27<00:00,  1.39s/it]



Epoch 2


Loss: 0.1077: 100%|██████████| 63/63 [01:30<00:00,  1.44s/it]



Epoch 3


Loss: 0.0313: 100%|██████████| 63/63 [01:30<00:00,  1.44s/it]



Epoch 4


Loss: 0.0201: 100%|██████████| 63/63 [01:30<00:00,  1.44s/it]



Epoch 5


Loss: 0.0077: 100%|██████████| 63/63 [01:30<00:00,  1.44s/it]



Epoch 6


Loss: 0.0062: 100%|██████████| 63/63 [01:30<00:00,  1.43s/it]



Epoch 7


Loss: 0.0068: 100%|██████████| 63/63 [01:30<00:00,  1.43s/it]



Epoch 8


Loss: 0.0032: 100%|██████████| 63/63 [01:30<00:00,  1.44s/it]



Epoch 9


Loss: 0.0045: 100%|██████████| 63/63 [01:30<00:00,  1.43s/it]



Epoch 10


Loss: 0.0034: 100%|██████████| 63/63 [01:30<00:00,  1.43s/it]


In [None]:
# Persist only the model's state dict to a local file named 'news-classification'.
torch.save(model.state_dict(), 'news-classification')

In [None]:
# Resolve the target device first so the checkpoint can be mapped onto it.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Rebuild the architecture, then overwrite its weights with the saved ones.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=4)

# map_location lets a checkpoint written from GPU tensors load on a CPU-only
# runtime; weights_only=True restricts unpickling to tensor data.
saved_state = torch.load('news-classification', map_location=device, weights_only=True)
model.load_state_dict(saved_state)

# The trailing semicolon keeps the very long module repr out of the cell output.
model.to(device);

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# Accuracy metric from the `evaluate` package; batches are accumulated below.
metric = load("accuracy")
model.eval()

# No gradients are needed while scoring the held-out subset.
with torch.no_grad():
    for raw_batch in eval_dataloader:
        on_device = {name: tensor.to(device) for name, tensor in raw_batch.items()}
        # Highest-scoring class along the last (class) dimension of the logits.
        predicted_ids = model(**on_device).logits.argmax(dim=-1)
        metric.add_batch(predictions=predicted_ids, references=on_device["labels"])

final_score = metric.compute()
print("Evaluation Accuracy:", final_score["accuracy"])

Evaluation Accuracy: 0.83


In [None]:
def classify_news(text):
    """Predict the news category for one headline or a list of headlines.

    Args:
        text: A single string, or a list/tuple of strings.

    Returns:
        The category name (str) when ``text`` is a single string; otherwise a
        list of category names, one per input and in the same order. An empty
        list/tuple yields an empty list.
    """
    label_map = {
        0: "World",
        1: "Sports",
        2: "Business",
        3: "Sci/Tech"
    }

    is_single = isinstance(text, str)
    headlines = [text] if is_single else list(text)
    if not headlines:
        return []

    # Make sure the model is in evaluation mode, so predictions do not depend
    # on whichever mode an earlier cell left it in.
    model.eval()

    inputs = tokenizer(
        headlines, return_tensors="pt", truncation=True, padding=True
    ).to(model.device)
    with torch.no_grad():
        logits = model(**inputs).logits

    # Take the argmax per row (dim=-1). Without `dim`, argmax flattens the
    # whole tensor, which is only correct when there is exactly one row.
    predicted_ids = torch.argmax(logits, dim=-1).tolist()
    categories = [label_map[class_id] for class_id in predicted_ids]

    return categories[0] if is_single else categories

In [None]:
# Hand-picked headlines used as a quick sanity check of the classifier.
example_1 = "IPL 2025 Updated Schedule: Top Foreign Players From RCB, MI, PBKS Who Will Miss Playoffs Due To National Duties - In Pics"
example_2 = "Stock market today: Trade setup for Nifty 50 to India-Pakistan news"
example_3 = "ICC Announce Venues For Women’s T20 World Cup 2026 In England"
example_4 = "U.S. Backstabs India, Shields Pakistan? JD Vance's Shock Message To Modi On Pahalgam Revenge"

# Print one "Example N: <category>" line per headline, numbered from 1.
for example_number, headline in enumerate(
    (example_1, example_2, example_3, example_4), start=1
):
    print(f"Example {example_number}:", classify_news(headline))

Example 1: Sports
Example 2: Business
Example 3: Sports
Example 4: World
