In [1]:
!pip install datasets
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score


Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl 

In [2]:
df = pd.read_csv("label_training.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,id,title,abstract,label,aging,note
0,0,0,Hydrogel Microneedles with Programmed Mesophas...,Microneedle-based drug delivery offers an attr...,medical research,non-aging research,This is medical research on drug delivery tech...
1,1,1,Primary intestinal lymphangiectasia with lymph...,"Primary intestinal lymphangiectasia (PIL), als...",medical research,non-aging research,This is a case report of a rare disease in a y...
2,2,2,Hermansky-Pudlak Syndrome Complicated by Crohn...,ABSTRACT Hermansky-Pudlak syndrome (HPS) is a ...,medical research,non-aging research,This is a case report on rare immune disorders...
3,3,3,Performance Comparison of Liquid Chromatograph...,Kinase inhibitors are small-molecule drugs des...,medical research,non-aging research,This study evaluates drug monitoring methods i...
4,4,4,Prophylaxis vs preemptive therapy in preventio...,"Abstract Cytomegalovirus (CMV), the beta-human...",medical research,non-aging research,This is medical research on CMV prevention in ...


In [3]:
df.loc[df['label'] == 'non-medical research', 'aging'] = 'non-aging research'

In [4]:
le_label = LabelEncoder()
le_aging = LabelEncoder()

In [5]:
df['label_encoded'] = le_label.fit_transform(df['label'])    # medical vs. non-medical
df['aging_encoded'] = le_aging.fit_transform(df['aging'])    # aging vs. non-aging

In [6]:
df['target'] = df['aging_encoded']

In [7]:
df = df.rename(columns={"abstract": "text"})
df['text'] = df['text'].astype(str)
df = df[df['text'].str.strip() != ""]
df = df.dropna(subset=['text', 'target'])
df = df.rename(columns={"target": "labels"})

In [8]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/225k [00:00<?, ?B/s]

In [9]:
from torch import nn
from transformers import BertPreTrainedModel, BertModel
import transformers
from transformers.modeling_outputs import SequenceClassifierOutput
import torch.nn.functional as F

class WeightedTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None): # Added num_items_in_batch
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = F.cross_entropy(logits, labels, weight=self.class_weights)
        return (loss, outputs) if return_outputs else loss

In [10]:
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_dir="./logs",
)

In [11]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions),
        "precision": precision_score(labels, predictions),
        "recall": recall_score(labels, predictions),
    }

In [12]:
from sklearn.model_selection import train_test_split
from datasets import Dataset


from sklearn.model_selection import StratifiedKFold
from datasets import Dataset
from sklearn.metrics import classification_report
import numpy as np

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
X = df['text']
y = df['labels'].values

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=512)


all_reports = []

for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    print(f"\n🌀 Fold {fold + 1}")

    train_df = df.iloc[train_idx].reset_index(drop=True)
    test_df = df.iloc[test_idx].reset_index(drop=True)

    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)

    # Tokenize
    train_tokenized = train_dataset.map(tokenize, batched=True, remove_columns=['text'])
    test_tokenized = test_dataset.map(tokenize, batched=True, remove_columns=['text'])

    train_tokenized.set_format("torch", columns=["input_ids", "token_type_ids", "attention_mask", "labels"])
    test_tokenized.set_format("torch", columns=["input_ids", "token_type_ids", "attention_mask", "labels"])

    # Compute class weights for this fold
    class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(train_df['labels']), y=train_df['labels'])
    class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)

    # Re-init model from scratch each fold
    model = AutoModelForSequenceClassification.from_pretrained(
        "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract",
        num_labels=2
    )

    trainer = WeightedTrainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized,
        eval_dataset=test_tokenized,
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer),
        compute_metrics=compute_metrics,
        class_weights=class_weights_tensor.to("cuda" if torch.cuda.is_available() else "cpu")
    )

    trainer.train()
    preds = trainer.predict(test_tokenized)
    report = classification_report(test_tokenized['labels'], preds.predictions.argmax(axis=1), output_dict=True)
    all_reports.append(report)




🌀 Fold 1


Map:   0%|          | 0/1599 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mlauzyi29[0m ([33mlauzyi29-national-taiwan-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.055301,0.985,0.991525,0.997159,0.985955
2,No log,0.169025,0.9825,0.990155,0.991549,0.988764
3,0.248700,0.057679,0.985,0.991525,0.997159,0.985955
4,0.248700,0.052019,0.9925,0.995781,0.997183,0.994382



🌀 Fold 2


Map:   0%|          | 0/1599 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.306323,0.9575,0.975749,0.991304,0.960674
2,No log,0.464081,0.965,0.980337,0.980337,0.980337
3,0.241200,0.322253,0.9625,0.978663,0.991354,0.966292
4,0.241200,0.320668,0.9725,0.984441,0.991453,0.977528



🌀 Fold 3


Map:   0%|          | 0/1599 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.205523,0.98,0.988732,0.991525,0.985955
2,No log,0.162954,0.975,0.985876,0.991477,0.980337
3,0.268500,0.255478,0.97,0.983051,0.988636,0.977528
4,0.268500,0.242799,0.97,0.983051,0.988636,0.977528



🌀 Fold 4


Map:   0%|          | 0/1599 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.173415,0.96,0.977077,0.997076,0.957865
2,No log,0.099141,0.9775,0.987234,0.997135,0.977528
3,0.253100,0.180844,0.9825,0.990127,0.994334,0.985955
4,0.253100,0.173447,0.98,0.988701,0.994318,0.983146



🌀 Fold 5


Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/399 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.230108,0.974937,0.985876,0.988669,0.983099
2,No log,0.392211,0.969925,0.983193,0.977716,0.988732
3,0.225700,0.274846,0.974937,0.985836,0.991453,0.980282
4,0.225700,0.302045,0.974937,0.985915,0.985915,0.985915


In [13]:
# Evaluate
preds = trainer.predict(test_tokenized)
print(classification_report(test_tokenized['labels'], preds.predictions.argmax(axis=1)))

              precision    recall  f1-score   support

           0       0.87      0.91      0.89        44
           1       0.99      0.98      0.99       355

    accuracy                           0.97       399
   macro avg       0.93      0.95      0.94       399
weighted avg       0.98      0.97      0.98       399



In [14]:
avg_f1 = np.mean([r['weighted avg']['f1-score'] for r in all_reports])
avg_precision = np.mean([r['weighted avg']['precision'] for r in all_reports])
avg_recall = np.mean([r['weighted avg']['recall'] for r in all_reports])
avg_accuracy = np.mean([r['accuracy'] for r in all_reports])

print(f"\n📊 Average Cross-Validated Metrics:")
print(f"F1 Score:     {avg_f1:.4f}")
print(f"Precision:    {avg_precision:.4f}")
print(f"Recall:       {avg_recall:.4f}")
print(f"Accuracy:     {avg_accuracy:.4f}")



📊 Average Cross-Validated Metrics:
F1 Score:     0.9762
Precision:    0.9778
Recall:       0.9755
Accuracy:     0.9755


In [15]:
# Save both model and tokenizer to a folder
model.save_pretrained("pubmedbert-aging-model")
tokenizer.save_pretrained("pubmedbert-aging-model")

('pubmedbert-aging-model/tokenizer_config.json',
 'pubmedbert-aging-model/special_tokens_map.json',
 'pubmedbert-aging-model/vocab.txt',
 'pubmedbert-aging-model/added_tokens.json',
 'pubmedbert-aging-model/tokenizer.json')

In [16]:
import shutil
from google.colab import files

# Zip the entire model directory
shutil.make_archive("pubmedbert-aging-model", 'zip', "pubmedbert-aging-model")

# Download the zipped file
files.download("pubmedbert-aging-model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>