In [1]:
import json
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset

In [2]:
# Load model
tokenizer = AutoTokenizer.from_pretrained("Rifky/Indobert-QA")
model = AutoModelForQuestionAnswering.from_pretrained("Rifky/Indobert-QA")


# Load dataset
with open('data.json', 'r') as f:
    data = json.load(f)

In [4]:
def create_qa_pairs(data):
    qa_pairs = []
    for item in data:
        context = item['judul'] + " " + " ".join(item['keywords']) + f" (ID: {item['infografis_id']})"
        # Question about the event
        questions = [
            f"Apa yang terjadi pada tahun {item['tahun']}?",
            f"Ada berapa infografis yang berkaitan dengan{item['judul']}?",
            f"Infografis mana saja yang membahas tentang {item['keywords'][0]} di Jepara?",
            f"Topik apa saja yang dibahas selain {item['keywords'][0]} dalam infografis dengan ID {item['id']}?",
            f"Apa yang menjadi fokus utama infografis dengan ID {item['infografis_id']}?",
            f"Apakah ada infografis lain yang berkaitan dengan topik yang sama dengan infografis ID {item['infografis_id']}?"
            f"Carikan infografis tentang {item['keywords']}"
        ]
        answers = [context] * len(questions) # Create a list of answers, one for each question

        # Add each question-answer pair separately to qa_pairs
        for question, answer in zip(questions, answers):
            qa_pairs.append({'question': question, 'context': context, 'answer': answer})
    return qa_pairs

qa_pairs = create_qa_pairs(data)

In [5]:
class QADataset(Dataset):
    def __init__(self, encodings, answers):  # Add answers to the constructor
        self.encodings = encodings
        self.answers = answers

    def __getitem__(self, idx):
        import torch
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Add answer information to the returned dictionary
        item['start_positions'] = torch.tensor(self.answers[idx]['start_positions'])
        item['end_positions'] = torch.tensor(self.answers[idx]['end_positions'])
        return item

    def __len__(self):
        return len(self.encodings.input_ids)

def find_answer_start_end(context, answer):
    start_char = context.find(answer)
    if start_char == -1:  # Answer not found in context
        return 0, 0  # Handle cases where answer is not found
    end_char = start_char + len(answer) - 1
    return start_char, end_char

def add_token_positions(encodings, answers, contexts):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_char, end_char = find_answer_start_end(contexts[i], answers[i])

        # Convert char positions to token positions
        start_token = encodings.char_to_token(i, start_char)
        end_token = encodings.char_to_token(i, end_char)

        # Handle cases where answer is not found within the context
        if start_token is None:
            start_token = 0  # CLS token
        if end_token is None:
            end_token = 0  # CLS token

        start_positions.append(start_token)
        end_positions.append(end_token)

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# Split data into train and evaluation sets (adjust split ratio as needed)
split_ratio = 0.8  # Use 80% for training, 20% for evaluation
split_index = int(len(qa_pairs) * split_ratio)
train_qa_pairs = qa_pairs[:split_index]
eval_qa_pairs = qa_pairs[split_index:]

# Tokenize the train and evaluation datasets separately
train_encodings = tokenizer(
    [pair['question'] for pair in train_qa_pairs],
    [pair['context'] for pair in train_qa_pairs],
    truncation=True,
    padding='max_length',
    max_length=384,
    return_tensors='pt'
)
eval_encodings = tokenizer(
    [pair['question'] for pair in eval_qa_pairs],
    [pair['context'] for pair in eval_qa_pairs],
    truncation=True,
    padding='max_length',
    max_length=384,
    return_tensors='pt'
)

# Add token positions for train and evaluation datasets
add_token_positions(train_encodings, [pair['answer'] for pair in train_qa_pairs], [pair['context'] for pair in train_qa_pairs])
add_token_positions(eval_encodings, [pair['answer'] for pair in eval_qa_pairs], [pair['context'] for pair in eval_qa_pairs])

# Create train and evaluation datasets
train_dataset = QADataset(train_encodings, [{'start_positions': train_encodings['start_positions'][i], 'end_positions': train_encodings['end_positions'][i]} for i in range(len(train_encodings['start_positions']))])
eval_dataset = QADataset(eval_encodings, [{'start_positions': eval_encodings['start_positions'][i], 'end_positions': eval_encodings['end_positions'][i]} for i in range(len(eval_encodings['start_positions']))])

In [5]:
from torch.utils.data import Dataset

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=100,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    save_steps=1000,
    save_total_limit=2,
    load_best_model_at_end=True,
    dataloader_num_workers=3,
)

# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset, # Use the dataset objects
    eval_dataset=eval_dataset,   # Use the dataset objects
)

# Train the model
trainer.train()



  0%|          | 0/159 [00:00<?, ?it/s]