# UAS DL TASK 1 (AGNews)

## SETUP

In [1]:
!pip install transformers datasets evaluate accelerate scikit-learn -U

Collecting datasets
  Downloading datasets-4.4.2-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (11 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading datasets-4.4.2-py3-none-any.whl (512 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.3/512.3 kB[0m [31m36.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (8.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m133.2 MB/s[0m eta [36m0:00:00[

## Import

In [2]:
import os
os.environ["WANDB_DISABLED"] = "true"

import torch
import numpy as np
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset

MODEL_CKPT = "distilbert-base-uncased"
BATCH_SIZE = 64
EPOCHS = 3

print(f"GPU Available: {torch.cuda.is_available()}")
print(f"Device Name: {torch.cuda.get_device_name(0)}")

GPU Available: True
Device Name: NVIDIA L4


## Load Data

In [3]:
# 1. LOAD DATASET
dataset = load_dataset("sh0416/ag_news")

# Kita cek dulu label uniknya apa saja
unique_labels = set(dataset['train']['label'])
print(f"Label Asli di Dataset: {unique_labels}")

# Fungsi untuk memaksa label jadi 0-3
def fix_labels(example):

    # Jika label sudah 0, 1, 2, 3 -> Biarkan
    if max(unique_labels) > 3:
        example['label'] = example['label'] - 1
    return example

# Terapkan perbaikan
dataset = dataset.map(fix_labels)
print(f"Label Setelah Diperbaiki: {set(dataset['train']['label'])}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.jsonl:   0%|          | 0.00/33.7M [00:00<?, ?B/s]

test.jsonl: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Label Asli di Dataset: {1, 2, 3, 4}


Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Label Setelah Diperbaiki: {0, 1, 2, 3}


## Tokenizer

In [4]:
MODEL_CKPT = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CKPT)

def preprocess(examples):
    # Gabung Title + Description agar lebih akurat
    inputs = [f"{str(t)} {str(d)}" for t, d in zip(examples["title"], examples["description"])]
    return tokenizer(inputs, truncation=True, padding=True, max_length=512)

tokenized_ds = dataset.map(preprocess, batched=True)

# Verifikasi
print("Sukses!")
print("Shape Input IDs:", len(tokenized_ds['train'][0]['input_ids']))

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Sukses!
Shape Input IDs: 342


## Model Setup & Training

In [5]:
# Mapping Label
id2label = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}
label2id = {v: k for k, v in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CKPT,
    num_labels=4,
    id2label=id2label,
    label2id=label2id
)

# Metric
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    preds, labels = eval_pred
    preds = np.argmax(preds, axis=1)
    return accuracy.compute(predictions=preds, references=labels)

# Training Config
args = TrainingArguments(
    output_dir="./ag_news_model",
    learning_rate=2e-5,
    per_device_train_batch_size=64, # L4 Optimized
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,

    eval_strategy="epoch",
    save_strategy="epoch",
    report_to="none", # Double check matikan wandb

    load_best_model_at_end=True,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

print("Starting Training...")
trainer.train()


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Trainer(


Starting Training...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1924,0.178151,0.939605
2,0.1362,0.166062,0.945263
3,0.103,0.168312,0.948026


TrainOutput(global_step=5625, training_loss=0.16070423092312283, metrics={'train_runtime': 1816.8466, 'train_samples_per_second': 198.146, 'train_steps_per_second': 3.096, 'total_flos': 3.438184802319053e+16, 'train_loss': 0.16070423092312283, 'epoch': 3.0})

## Save & Test

In [7]:
trainer.save_model("./final_agnews")
print("Model AG News berhasil disimpan.")

text = "Oil prices dropped significantly today as the stock market crashed."
inputs = tokenizer(text, return_tensors="pt").to("cuda")
logits = model(**inputs).logits
pred = torch.argmax(logits).item()
print(f"Prediksi: {id2label[pred]}")



Model AG News berhasil disimpan.
Prediksi: Business
