

# **TASK 2 - SQUAD INTERACTIVE MODEL**
---

# **IMPORT DEPENDENCIES AND CHECKING**

In [1]:
# ------------------------------------------------------------------
# BAGIAN 1: INSTALL & SETUP
# ------------------------------------------------------------------
!pip install transformers datasets evaluate accelerate -q

import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM, # Perhatikan: Seq2SeqLM (bukan SequenceClassification)
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
import evaluate

# Cek GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Config
MODEL_CHECKPOINT = "t5-base"
BATCH_SIZE = 8 # Sesuaikan VRAM (T5-base agak berat, kalau OOM turunin jadi 4)
MAX_INPUT_LENGTH = 512
MAX_TARGET_LENGTH = 32 # Panjang jawaban maksimal
SAMPLE_SIZE = 2000 # Ambil sebagian biar cepat trainingnya (Hapus kalau mau full)


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hUsing device: cuda


# **LOAD DATASET**


In [2]:
# ------------------------------------------------------------------
# BAGIAN 2: LOAD DATASET (SQuAD)
# ------------------------------------------------------------------
print("\n--- Loading SQuAD Dataset ---")
dataset = load_dataset("squad")

# Ambil subset agar cepat (Demo)
train_dataset = dataset["train"].shuffle(seed=42).select(range(SAMPLE_SIZE))
eval_dataset = dataset["validation"].shuffle(seed=42).select(range(SAMPLE_SIZE // 5))

print(f"Train samples: {len(train_dataset)}")
print(f"Contoh Data Asli:\n{train_dataset[0]}")



--- Loading SQuAD Dataset ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

plain_text/validation-00000-of-00001.par(…):   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Train samples: 2000
Contoh Data Asli:
{'id': '573173d8497a881900248f0c', 'title': 'Egypt', 'context': 'The Pew Forum on Religion & Public Life ranks Egypt as the fifth worst country in the world for religious freedom. The United States Commission on International Religious Freedom, a bipartisan independent agency of the US government, has placed Egypt on its watch list of countries that require close monitoring due to the nature and extent of violations of religious freedom engaged in or tolerated by the government. According to a 2010 Pew Global Attitudes survey, 84% of Egyptians polled supported the death penalty for those who leave Islam; 77% supported whippings and cutting off of hands for theft and robbery; and 82% support stoning a person who commits adultery.', 'question': 'What percentage of Egyptians polled support death penalty for those leaving Islam?', 'answers': {'text': ['84%'], 'answer_start': [468]}}


# **TOKENISASI DAN PREPROCESSING**

In [3]:
# ------------------------------------------------------------------
# BAGIAN 3: PREPROCESSING (FORMAT KHUSUS T5)
# ------------------------------------------------------------------
# T5 butuh prefix. Format input harus: "question: ... context: ..."
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

def preprocess_function(examples):
    inputs = []
    targets = []

    for i in range(len(examples["question"])):
        # Format Input T5: "question: Q context: C"
        question = examples["question"][i]
        context = examples["context"][i]
        input_text = f"question: {question} context: {context}"

        # Format Target (Jawaban)
        # SQuAD punya jawaban di dalam list 'text', ambil yang pertama
        answer = examples["answers"][i]["text"][0]

        inputs.append(input_text)
        targets.append(answer)

    # Tokenisasi Input
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding="max_length"
    )

    # Tokenisasi Target (Jawaban)
    labels = tokenizer(
        targets,
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding="max_length"
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

print("\n--- Tokenizing ---")
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_eval = eval_dataset.map(preprocess_function, batched=True)

# Hapus kolom lama
cols_to_remove = dataset["train"].column_names
tokenized_train = tokenized_train.remove_columns(cols_to_remove)
tokenized_eval = tokenized_eval.remove_columns(cols_to_remove)


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]


--- Tokenizing ---


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

# **LOAD PRE-TRAINED MODEL**

In [4]:
# ------------------------------------------------------------------
# BAGIAN 4: LOAD MODEL (Seq2Seq)
# ------------------------------------------------------------------
print("\n--- Loading Model T5 ---")
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)
model.to(device)

# Collator khusus Seq2Seq (penting buat T5)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


--- Loading Model T5 ---


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

# **TRAINING CONFIGURATION**

In [5]:
# ------------------------------------------------------------------
# BAGIAN 5: TRAINING CONFIG
# ------------------------------------------------------------------
training_args = Seq2SeqTrainingArguments(
    output_dir="./finetuning-t5-qa", # Sesuaikan nama repo tugas
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    predict_with_generate=True, # Wajib True buat generative model
    fp16=True, # Gunakan mixed precision biar hemat memori GPU
    logging_steps=50,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
    tokenizer=tokenizer,
)


  trainer = Seq2SeqTrainer(


# **TRAINING**

In [6]:
# ------------------------------------------------------------------
# BAGIAN 6: TRAINING
# ------------------------------------------------------------------
print("\n--- Starting Training ---")
trainer.train()

# Simpan Model
save_path = "./finetuning-t5-qa/final_model"
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)
print(f"Model saved to {save_path}")


--- Starting Training ---


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


Epoch,Training Loss,Validation Loss
1,0.0622,0.072256
2,0.0535,0.068898
3,0.0437,0.069536


Model saved to ./finetuning-t5-qa/final_model


# **FINE TUNED INFERENCED METHOD**

In [7]:
# ------------------------------------------------------------------
# BAGIAN 7: INFERENCE (INTERACTIVE)
# ------------------------------------------------------------------
print("\n--- Testing Model Manual ---")

def ask_t5(question, context, model_path):
    # Load model yang sudah dilatih
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Format input ala T5
    input_text = f"question: {question} context: {context}"

    # Tokenisasi
    inputs = tokenizer(input_text, return_tensors="pt").to(device)

    # Generate Jawaban
    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            max_length=32, # Batasi panjang jawaban
            num_beams=4,    # Beam search biar jawaban lebih bagus
            early_stopping=True
        )

    # Decode (Angka -> Teks)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer




--- Testing Model Manual ---
Context: 
Super Mario Bros. is a platform game developed and published by Nintendo. 
The successor to the 1983 arcade game Mario Bros., and the first in the Super Mario series, 
it was released in 1985 for the Famicom in Japan.

Question: Who developed Super Mario Bros?

Answer: Nintendo


# **TESTING MODEL**

In [9]:
# --- CONTOH TEST ---
my_context = """
Super Mario Bros. is a platform game developed and published by Nintendo.
The successor to the 1983 arcade game Mario Bros., and the first in the Super Mario series,
it was released in 1985 for the Famicom in Japan.
"""
my_question = "Who developed Super Mario Bros?"

print(f"Context: {my_context}")
print(f"Question: {my_question}")

predicted_answer = ask_t5(my_question, my_context, save_path)
print(f"\nAnswer: {predicted_answer}")

Context: 
Super Mario Bros. is a platform game developed and published by Nintendo. 
The successor to the 1983 arcade game Mario Bros., and the first in the Super Mario series, 
it was released in 1985 for the Famicom in Japan.

Question: Who developed Super Mario Bros?

Answer: Nintendo


In [11]:
my_question = "What time Super Mario Bros released?"
print(f"Question: {my_question}")
predicted_answer = ask_t5(my_question, my_context, save_path)
print(f"\nAnswer: {predicted_answer}")

Question: What time Super Mario Bros released?

Answer: 1985


# **SAVE MODEL**

In [8]:
save_path = "./finetuning-t5-qa/final_model"
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)
print(f"Model saved to {save_path}")

Model saved to ./finetuning-t5-qa/final_model
