In [70]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding, set_seed
import torch, json, os

In [71]:
path = "./data/training13b.json"

In [72]:
with open(path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Trích thông tin từ từng câu hỏi
samples = []
for q in data["questions"]:
    q_body = q["body"].strip()
    q_type = q["type"].strip().lower()  # yesno / factoid / list / summary
    samples.append({"question": q_body, "type": q_type})

# Tạo DataFrame
# df = pd.DataFrame(samples)
# df.to_csv('./data/ques_type.csv')

In [73]:
print(os.getcwd())

d:\Github\BioASQ


In [74]:
dir = 'data'
for i in range(1,3):
    file = f'BioASQ-task13bPhaseB-testset{i}.json'
    path = os.path.join(dir, file)

    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    for q in data.get("questions", []):
        q_body = q.get("body", "").strip()
        q_type = q.get("type", "").strip().lower()

        # Chỉ lấy 4 loại hợp lệ
        if q_type in {"yesno", "factoid", "list", "summary"} and q_body:
            samples.append({"question": q_body, "type": q_type})


In [75]:
df = pd.DataFrame(samples)

In [83]:
df.to_csv('data/train.csv')

In [76]:
le = LabelEncoder()
df["label"] = le.fit_transform(df["type"]) 

In [77]:
dataset = Dataset.from_pandas(df[["question", "label"]])

# 4. Train-test split
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_ds = dataset["train"]
val_ds = dataset["test"]




In [78]:
model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract" # Hoặc thử 

tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(example):
    return tokenizer(example["question"], truncation=True)

In [79]:
train_ds = train_ds.map(tokenize)
val_ds = val_ds.map(tokenize)

# 6. Load model
num_labels = len(le.classes_)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)


Map:   0%|          | 0/4447 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 4447/4447 [00:01<00:00, 4263.88 examples/s]
Map: 100%|██████████| 1112/1112 [00:00<00:00, 3454.51 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
num_labels = len(le.classes_)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# model = model.to(device)



training_args = TrainingArguments(
    output_dir="./question_type_classifier",          # Nơi lưu model
    evaluation_strategy="epoch",                     # Đánh giá sau mỗi epoch
    save_strategy="epoch",                           # Lưu checkpoint sau mỗi epoch
    logging_strategy="steps",                        # Ghi log mỗi X step
    logging_steps=20,                                # Ghi log mỗi 20 bước
    learning_rate=1e-5,                              # Nhỏ hơn mặc định để tránh overshooting
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,                              # Tăng nhẹ nếu không overfit
    weight_decay=0.01,                               # Regularization
    load_best_model_at_end=True,                     # Tự động dùng model tốt nhất
    metric_for_best_model="accuracy",                # Chọn model theo accuracy
    save_total_limit=2,                              # Giới hạn số checkpoint
    lr_scheduler_type="linear",                      # LR giảm dần theo bước
    warmup_ratio=0.1,                                # Warmup giúp ổn định khi train
    report_to="none",                                # Không log ra wandb nếu không cần
    seed=42,                                         # Reproducible
    push_to_hub=False                                # Không upload lên HuggingFace Hub
)


def compute_metrics(eval_pred):
    from sklearn.metrics import accuracy_score, classification_report
    logits, labels = eval_pred
    preds = torch.argmax(torch.tensor(logits), dim=-1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [81]:
from transformers import EarlyStoppingCallback


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

  trainer = Trainer(


In [82]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.435,0.405452,0.875899
2,0.3387,0.344557,0.883993
3,0.2569,0.351102,0.889388
4,0.1921,0.411003,0.88759
5,0.2405,0.483383,0.881295


TrainOutput(global_step=1390, training_loss=0.3845905935163978, metrics={'train_runtime': 201.2753, 'train_samples_per_second': 176.753, 'train_steps_per_second': 11.05, 'total_flos': 266095360746384.0, 'train_loss': 0.3845905935163978, 'epoch': 5.0})

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
y_true = val_ds["label"]
y_pred = trainer.predict(val_ds).predictions.argmax(axis=-1)
ConfusionMatrixDisplay.from_predictions(y_true, y_pred, display_labels=le.classes_)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_true, y_pred, target_names=le.classes_))
