# Imports

In [1]:
!unzip src.zip

Archive:  src.zip
   creating: src/
  inflating: __MACOSX/._src          
  inflating: src/run.py              
  inflating: __MACOSX/src/._run.py   
  inflating: src/.DS_Store           
  inflating: __MACOSX/src/._.DS_Store  
  inflating: src/config.py           
  inflating: __MACOSX/src/._config.py  
   creating: src/fetchers/
  inflating: __MACOSX/src/._fetchers  
  inflating: src/fetch_data.py       
  inflating: __MACOSX/src/._fetch_data.py  
  inflating: src/__init__.py         
  inflating: __MACOSX/src/.___init__.py  
   creating: src/__pycache__/
  inflating: __MACOSX/src/.___pycache__  
  inflating: src/visualization.py    
  inflating: src/dataset_builder.py  
  inflating: __MACOSX/src/._dataset_builder.py  
  inflating: src/data_prep.py        
  inflating: __MACOSX/src/._data_prep.py  
   creating: src/deployment/
  inflating: __MACOSX/src/._deployment  
  inflating: src/data_splitter.py    
  inflating: __MACOSX/src/._data_splitter.py  
  inflating: src/model_building.p

In [None]:
import sys
sys.path.append('src') # colab testing
# sys.path.append('../src') # local testing

import pandas as pd
from transformers import Trainer
from copy import deepcopy


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
pip install Bio

Collecting Bio
  Downloading bio-1.8.0-py3-none-any.whl.metadata (5.7 kB)
Collecting biopython>=1.80 (from Bio)
  Downloading biopython-1.85-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting gprofiler-official (from Bio)
  Downloading gprofiler_official-1.0.0-py3-none-any.whl.metadata (11 kB)
Collecting mygene (from Bio)
  Downloading mygene-3.2.2-py2.py3-none-any.whl.metadata (10 kB)
Collecting biothings-client>=0.2.6 (from mygene->Bio)
  Downloading biothings_client-0.4.1-py3-none-any.whl.metadata (10 kB)
Downloading bio-1.8.0-py3-none-any.whl (321 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m321.1/321.1 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading biopython-1.85-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m99.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading gprofiler_official-1.0.0-py3-none-any.whl (9.

In [None]:
from deployment import Integrator
from deployment import Preprocessor
from deployment import train_model

# Data processing


In [5]:
raw_file_path = "amyloid-raw-13-08-2025.csv"
raw_df = pd.read_csv(raw_file_path)
raw_df

Unnamed: 0,PMID,Rejection?,If so; reason to reject?,Other? Expand,No access to full-text,Decided by what?,URL,Year,Year.1,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18
0,39441361,Rejected,(Pre)Clinical trials. No interaction or amyloi...,,,,https://pubmed.ncbi.nlm.nih.gov/39441361/,2025,,,,Open questions:,,,,,,,
1,39438925,Rejected,Not enough experimental data,,,read whole paper,https://pubmed.ncbi.nlm.nih.gov/39438925/,,,,,1) How do we consider an antibody used to dete...,,Not enough experimental data. Same for isolati...,,,,,
2,39438516,Rejected,"The interactor is not an Ab, Unknown antibody ...",,,,https://pubmed.ncbi.nlm.nih.gov/39438516/,,,,,2) What about Pre-prints?,,No to pre-prints,,,,,
3,39434125,Rejected,There are no interactions described,,,,https://pubmed.ncbi.nlm.nih.gov/39434125/,,,,,3) Non-English papers? I would trust automatic...,,No to Non-Englishs,,,,,
4,39432414,Rejected,Review article,,,Only abstract,https://pubmed.ncbi.nlm.nih.gov/39432414/,,,,,4) There are doubles. Excluding them?,,Remove them,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4018,25501811,Rejected,Not enough experimental data,,,,https://pubmed.ncbi.nlm.nih.gov/25501811/,,,25501811,FALSE,,,,,,,,
4019,25023329,Rejected,Not enough experimental data,,,Abstract and scanning through paper,https://pubmed.ncbi.nlm.nih.gov/25023329/,,,25023329,FALSE,,,,,,,,
4020,23904325,Useful,,,,Abstract and scanning through paper,https://pubmed.ncbi.nlm.nih.gov/23904325/,,,23904325,FALSE,,,,,,,,
4021,22344635,Rejected,The interactor is not an amyloid protein,,,Only abstract,https://pubmed.ncbi.nlm.nih.gov/22344635/,,,22344635,FALSE,,,,,,,,


In [6]:
integrator = Integrator(raw_path = raw_file_path, pmid_col = "PMID", email = "test@gmail.com")
# email is used for Entrez API and is not necessary

integrator.reduce_columns(keep_columns = ['PMID', "Rejection?", "If so; reason to reject?"])
integrator.fetch_pubmed()
integrator.merge()



Fetching PubMed data: 100%|██████████| 21/21 [00:42<00:00,  2.01s/batch]


In [7]:
merged = integrator.merged_df.copy()
preprocessor = Preprocessor(merged)

preprocessor.dropna(subset=["Abstract"])
preprocessor.drop_values(column="If so; reason to reject?", value="Review article")
preprocessor.map_labels(label_col="Rejection?", mapping={"Rejected": 0, "Useful": 1})
preprocessor.split_dataset(label_col="Rejection?")

Dropped 10 rows containing NaN. Dropped rows are stored in self.dropped_df.
Dropped 831 rows (condition: If so; reason to reject? == Review article). Dropped rows are stored in self.dropped_df.
Mapped labels in column 'Rejection?' using provided mapping. Value counts: {0: 1721, -1: 1311, 1: 150}. Unmapped (set to -1): 1311 rows.
Dataset split completed: 1871 rows in self.train_df, 1311 rows in self.test_df. Data stored in these attributes.


In [8]:
train = preprocessor.train_df
train.to_csv("train.csv", index=False)
test = preprocessor.test_df
test.to_csv("test.csv", index=False)

# Training

In [9]:
model_name = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract"

DEFAULT_CONFIG = {
    "seed": 1,
    "device": "cuda",  # "cuda" or "cpu"
    "wandb_project_name": "Amyloid-test",
    "data": {
        "train_file_path": "train.csv",
        "test_file_path": "test.csv",
        "target": "Rejection?",
        "test_size": 0.15,
        "model_name": model_name,
        "max_length": 380,
        "keywords": ["aggregates", "amyloid", "scfv", 'hiapp', 'mab', 'ttr', 'donanemab', 'aggregation'],
        "special_tokens": ["[KEY]", "[J_END]", "[T_END]"],
        "seed": 1,
    },
    "model": {
        "model_name": model_name,
        "num_labels": 2,
        "special_tokens": ['[KEY]', '[J_END]', '[T_END]'],
        "unfreeze_last_k_layers": 12,
        "change_classifier": False
    },
    "training": {
        "hparams": {
            "learning_rate": 3e-5,
            "num_train_epochs": 1,
            "per_device_train_batch_size": 16,
            "weight_decay": 0.1,
            "classifier_hidden_dim": 512,
            "warmup_ratio": 0.1
        }
    },
}

In [None]:
import wandb
wandb.login(key="xx")


In [11]:
artifacts, hf_test = train_model(DEFAULT_CONFIG)

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/1590 [00:00<?, ? examples/s]

Map:   0%|          | 0/281 [00:00<?, ? examples/s]

Map:   0%|          | 0/1311 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4497,0.353892,0.843416,0.322034,0.826087,0.463415


0,1
best_train_accuracy,▁
best_train_f1,▁
best_train_loss,▁
best_train_model_preparation_time,▁
best_train_precision,▁
best_train_recall,▁
best_train_runtime,▁
best_train_samples_per_second,▁
best_train_steps_per_second,▁
best_val_accuracy,▁

0,1
best_train_accuracy,0.8478
best_train_f1,0.47619
best_train_loss,0.33097
best_train_model_preparation_time,0.0025
best_train_precision,0.32836
best_train_recall,0.86614
best_train_runtime,7.4359
best_train_samples_per_second,213.827
best_train_steps_per_second,13.448
best_val_accuracy,0.84342


# Prediction

In [12]:
trainer = artifacts["trainer"]
eval_args = deepcopy(trainer.args)
eval_args.report_to = []
eval_args.eval_strategy = "no"

eval_trainer = Trainer(
    model=trainer.model,
    args=eval_args,
    compute_metrics=trainer.compute_metrics,
)

hf_test.set_format(type="torch", columns=["input_ids", "attention_mask"])
predictions_output = eval_trainer.predict(hf_test['test'])
predicted_labels = predictions_output.predictions.argmax(axis=1)

hf_test.reset_format()
pmids = hf_test['test']["PMID"]
results_df = pd.DataFrame({
    "PMID": pmids,
    "predicted_label": predicted_labels,
})


In [16]:
print(results_df['predicted_label'].value_counts())
results_df


predicted_label
0    1161
1     150
Name: count, dtype: int64


Unnamed: 0,PMID,predicted_label
0,38512666,0
1,37913984,1
2,37783559,0
3,37783554,0
4,36723107,0
...,...,...
1306,19647749,0
1307,19638957,1
1308,19636575,0
1309,19636413,0


In [15]:
positives = results_df[results_df['predicted_label']==1]
positives

Unnamed: 0,PMID,predicted_label
1,37913984,1
6,35133388,1
8,33167834,1
11,29298867,1
24,28220836,1
...,...,...
1291,19763886,1
1294,19749432,1
1297,19741145,1
1307,19638957,1


# Save

In [17]:
results_df.to_csv("results.csv", index=False) # save results
positives.to_csv("positives.csv", index=False)

