In [2]:
import json
from transformers import AutoTokenizer, AutoModelForMultipleChoice, Trainer, TrainingArguments, TrainerCallback
from datasets import Dataset
import pandas as pd
import evaluate
import numpy as np
import warnings
import logging

# Hide FutureWarning / UserWarning noise so the notebook output stays readable.
for _ignored_category in (FutureWarning, UserWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)

# Resolve every logger name currently registered with the logging manager,
# then raise the threshold to ERROR for those whose name mentions "transformers".
loggers = list(map(logging.getLogger, logging.root.manager.loggerDict))
for logger in loggers:
    if "transformers" not in logger.name.lower():
        continue
    logger.setLevel(logging.ERROR)

# Load data
def load_data(file_path):
    """Read a UTF-8 encoded JSON file and return the parsed content.

    Args:
        file_path: Path of the JSON file to open.

    Returns:
        Whatever ``json.load`` produces for the file (dict, list, ...).
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

def add_labels_to_data(data):
    """Attach a ``'label'`` key to every entry, in place.

    The label is the position of ``entry['relevant']`` inside
    ``entry['paragraphs']``.

    Args:
        data: Iterable of dicts, each with ``'relevant'`` and ``'paragraphs'`` keys.

    Returns:
        The same ``data`` object that was passed in (entries are mutated).

    Raises:
        ValueError: If ``entry['relevant']`` is not found in ``entry['paragraphs']``
            (propagated from ``list.index``).
    """
    for record in data:
        target = record['relevant']
        candidates = record['paragraphs']
        record['label'] = candidates.index(target)
    return data

# Read the paragraph collection plus the train / validation splits from disk.
paragraphs, train_data, val_data = (
    load_data(json_path)
    for json_path in ('./context.json', './train.json', './valid.json')
)

In [3]:
def prepare_data_for_mc(data, paragraphs):
    """Convert question entries into multiple-choice records.

    Each output record uses the ``sent1`` / ``sent2`` / ``ending0``..``ending3`` /
    ``label`` layout: ``sent1`` is always the empty string, ``sent2`` is the
    question, the four endings are the candidate paragraph texts, and ``label``
    is the position of the relevant paragraph among the candidates.

    Args:
        data: Iterable of dicts with ``'question'``, ``'relevant'`` and
            ``'paragraphs'`` keys; ``'paragraphs'`` holds the candidate ids and
            ``'relevant'`` is the id of the correct one.
        paragraphs: Indexable collection mapping a paragraph id to its content
            (converted with ``str``).

    Returns:
        A list with one record (dict) per entry in ``data``.

    Raises:
        ValueError: If an entry does not list exactly four candidates, or if
            its relevant id is not among them.
    """
    num_choices = 4  # the downstream records expose exactly ending0..ending3
    mc_data = []
    for position, entry in enumerate(data):
        question = entry['question']
        relevant_para = entry['relevant']
        candidate_ids = entry['paragraphs']

        # Fail early with a descriptive message: with fewer candidates the
        # record cannot be built, and with more the extra ones would be
        # dropped silently and the label could point past the last ending.
        if len(candidate_ids) != num_choices:
            raise ValueError(
                f"entry {position} has {len(candidate_ids)} candidate paragraphs, "
                f"expected exactly {num_choices}"
            )
        if relevant_para not in candidate_ids:
            raise ValueError(
                f"entry {position}: relevant paragraph {relevant_para!r} "
                f"is not among its candidates {candidate_ids!r}"
            )

        # Key order is kept stable: sent1, sent2, ending0..3, label.
        record = {
            'sent1': "",        # intentionally empty
            'sent2': question,  # the question text
        }
        for slot, para_id in enumerate(candidate_ids):
            # Each candidate paragraph becomes one option, as a string.
            record[f'ending{slot}'] = str(paragraphs[para_id])
        record['label'] = candidate_ids.index(relevant_para)  # index of the correct option
        mc_data.append(record)
    return mc_data


# Prepare the train and validation data in multiple-choice form.
train_mc_data, val_mc_data = (
    prepare_data_for_mc(split_entries, paragraphs)
    for split_entries in (train_data, val_data)
)

In [4]:
def _records_to_dataset(records):
    """Wrap a list of record dicts in a `datasets.Dataset` via a pandas DataFrame."""
    frame = pd.DataFrame(records)
    return Dataset.from_pandas(frame)


train_dataset = _records_to_dataset(train_mc_data)
val_dataset = _records_to_dataset(val_mc_data)

In [5]:
# Single source of truth for the checkpoint shared by tokenizer and model.
PRETRAINED_NAME = 'hfl/chinese-roberta-wwm-ext'

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_NAME)
model = AutoModelForMultipleChoice.from_pretrained(PRETRAINED_NAME)

tokenizer_config.json:   0%|          | 0.00/19.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/412M [00:00<?, ?B/s]

In [6]:
ending_names = ["ending0", "ending1", "ending2", "ending3"]


def preprocess_function(examples):
    """Tokenize a batch of multiple-choice records.

    For every example, each ending is paired with the example's ``sent1`` as
    the first text and ``"<sent2> <ending>"`` as the second text. All pairs of
    the batch are tokenized in one call, then regrouped so that each example
    owns one list per tokenizer field containing one item per choice.

    Args:
        examples: Batched mapping (column name -> list of values) providing
            ``"sent1"``, ``"sent2"`` and every column named in ``ending_names``.

    Returns:
        A dict with the tokenizer's output keys; each value is a list with one
        element per example, itself a list of ``len(ending_names)`` encodings.
    """
    # Derive the group size from ending_names instead of repeating a literal 4.
    num_choices = len(ending_names)

    # Build the flat (example-major, choice-minor) lists directly; this avoids
    # the quadratic list copying of `sum(list_of_lists, [])`.
    first_sentences = [
        context
        for context in examples["sent1"]
        for _ in range(num_choices)
    ]
    second_sentences = [
        f"{header} {examples[end][i]}"
        for i, header in enumerate(examples["sent2"])
        for end in ending_names
    ]

    # Every pair is truncated/padded to a fixed length of 512 tokens.
    tokenized_examples = tokenizer(
        first_sentences,
        second_sentences,
        truncation=True,
        padding='max_length',
        max_length=512,
    )

    # Un-flatten: consecutive runs of `num_choices` encodings belong to one example.
    return {
        k: [v[i : i + num_choices] for i in range(0, len(v), num_choices)]
        for k, v in tokenized_examples.items()
    }

In [7]:
# Apply the batched tokenization step to both splits.
tokenized_train_dataset, tokenized_val_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset)
)

Map:   0%|          | 0/21714 [00:00<?, ? examples/s]

Map:   0%|          | 0/3009 [00:00<?, ? examples/s]

In [9]:
# Load the "accuracy" metric through the `evaluate` library; compute_metrics
# calls its .compute(...) method and reads the "accuracy" entry of the result.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Turn raw evaluation outputs into an ``exact_match`` score.

    Args:
        eval_pred: A pair ``(predictions, labels)``; the predicted choice for
            each row is the arg-max of ``predictions`` along axis 1.

    Returns:
        ``{"exact_match": <accuracy>}`` where the value is the ``"accuracy"``
        field reported by the module-level ``accuracy`` metric.
    """
    scores, gold = eval_pred
    chosen = np.argmax(scores, axis=1)
    result = accuracy.compute(predictions=chosen, references=gold)
    return {"exact_match": result["accuracy"]}

training_args = TrainingArguments(
    # --- output & checkpointing ---
    output_dir='./results',
    save_strategy="steps",
    save_steps=0.2,            # float < 1: fraction of total training steps
    save_total_limit=1,
    save_safetensors=False,
    # --- evaluation schedule ---
    evaluation_strategy="steps",    # evaluate on a step-based schedule
    eval_steps=0.2,            # same fraction as save_steps (the recorded run
                               # evaluated at steps 543, 1086, 1629 and 2172)
    per_device_eval_batch_size=8,
    # --- best-checkpoint selection ---
    load_best_model_at_end=True,
    metric_for_best_model="exact_match",
    greater_is_better=True,
    # --- optimisation ---
    num_train_epochs=2,
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    fp16=True,                 # enable mixed-precision training
    # --- progress display ---
    disable_tqdm=False,
)

# No explicit data_collator is supplied: the DataCollatorForMultipleChoice
# option is left disabled, and preprocessing already pads every choice to a
# fixed length.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
)

# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()

Step,Training Loss,Validation Loss,Exact Match
543,0.0553,0.207884,0.952476
1086,0.0178,0.170598,0.959787
1629,0.0789,0.175465,0.957793
2172,0.0409,0.191806,0.958126


TrainOutput(global_step=2714, training_loss=0.04615435533516532, metrics={'train_runtime': 2727.3127, 'train_samples_per_second': 15.923, 'train_steps_per_second': 0.995, 'total_flos': 4.569461292930662e+16, 'train_loss': 0.04615435533516532, 'epoch': 1.9992633517495397})

In [10]:
# Directory receiving both the fine-tuned weights and the tokenizer files.
FINETUNED_DIR = "./finetuned_ps"

# Replace the storage of any non-contiguous parameter with a contiguous copy
# before serialising the model.
non_contiguous_params = [p for p in model.parameters() if not p.is_contiguous()]
for p in non_contiguous_params:
    p.data = p.contiguous()

model.save_pretrained(FINETUNED_DIR)
# Last expression of the cell: the tuple of written tokenizer files is displayed.
tokenizer.save_pretrained(FINETUNED_DIR)

('./finetuned_ps/tokenizer_config.json',
 './finetuned_ps/special_tokens_map.json',
 './finetuned_ps/vocab.txt',
 './finetuned_ps/added_tokens.json',
 './finetuned_ps/tokenizer.json')