In [1]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)
from datasets import load_from_disk, load_dataset
import wandb
import pandas as pd
import os
import yaml
import argparse
from transformers.trainer_callback import EarlyStoppingCallback
import numpy as np
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from datasets import Dataset, DatasetDict
from src.utils import Config, prepare_text

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
config_type = "vet_48a"

# Import yaml file
with open("../configs/train_default.yaml") as f:
    args = yaml.safe_load(f)
# Update default args with chosen config
if config_type != "default":
    with open("../configs/train_configs.yaml") as f:
        yaml_configs = yaml.safe_load_all(f)
        yaml_args = next(conf for conf in yaml_configs if conf["config"] == config_type)
    args.update(yaml_args)
    print(f"Updating with:\n{yaml_args}\n")
print(f"\n{args}\n")

output_dir = os.path.join(args["output_root"], "testing")
# Dataset
di = Config("../configs/dataset_info.yaml")
dataset = load_dataset(
    args["ds_name"],
    download_mode="force_redownload",
)
dataset = prepare_text(
    dataset=dataset,
    di=di,
    version=args["version"],
)

# Load model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(
    f"james-burton/{config_type}",
    num_labels=2,
)
tokenizer = AutoTokenizer.from_pretrained(
    args.get("tokenizer_base", args["model_base"])
)


def encode(examples):
    return {
        "label": np.array([examples["labels"]]),
        **tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=args["max_length"],
        ),
    }


dataset = dataset.map(encode)  # , load_from_cache_file=True)
dataset = dataset.remove_columns(["labels"])

# Make output directory
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save args file
with open(os.path.join(output_dir, "args.yaml"), "w") as f:
    yaml.dump(args, f)

# Initialise training arguments and trainer
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=args["num_epochs"],
    per_device_train_batch_size=args["batch_size"],
    per_device_eval_batch_size=args["batch_size"],
    logging_steps=args["logging_steps"],
    # learning_rate=args["lr"],
    # weight_decay=args["weight_decay"],
    # gradient_accumulation_steps=args["grad_accumulation_steps"],
    # warmup_ratio=args["warmup_ratio"],
    # lr_scheduler_type=args["lr_scheduler"],
    dataloader_num_workers=args["num_workers"],
    do_train=args["do_train"],
    do_predict=args["do_predict"],
    resume_from_checkpoint=args["resume_from_checkpoint"],
    # report_to="wandb" if not args["fast_dev_run"] else "none",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=args["save_total_limit"],
    load_best_model_at_end=True,
    seed=args["seed"],
    torch_compile=args["pytorch2.0"],  # Needs to be true if PyTorch 2.0
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    callbacks=[EarlyStoppingCallback(args["early_stopping_patience"])]
    if args["early_stopping_patience"] > 0
    else [],
)

# # Train model
# if args["do_train"]:
#     print("Training...")
#     trainer.train()
#     if not args["fast_dev_run"]:
#         model.push_to_hub(config_type, private=True)
#     print("Training complete")

# Predict on the test set
if args["do_predict"]:
    print("***** Running Prediction *****")
    # Test the model
    results = trainer.evaluate(dataset["test"], metric_key_prefix="test")
    preds = trainer.predict(dataset["test"]).predictions
    labels = [lab[0] for lab in dataset["test"]["label"]]
    results["test/accuracy"] = np.mean(np.argmax(preds, axis=1) == labels)
    results["test/precision"] = precision_score(
        labels,
        np.argmax(preds, axis=1),
        labels=np.arange(2),  # num_labels
        zero_division=0,
    )
    results["test/recall"] = recall_score(
        labels,
        np.argmax(preds, axis=1),
        labels=np.arange(2),  # num_labels
        zero_division=0,
    )
    results["test/roc_auc"] = roc_auc_score(labels, preds[:, 1])
    results["test/f1"] = (
        2
        * results["test/precision"]
        * results["test/recall"]
        / (results["test/precision"] + results["test/recall"])
    )

    # # Save the predictions
    # with open(os.path.join(output_dir, "test_results.txt"), "w") as f:
    #     f.write(str(results))
    # if not args["fast_dev_run"]:
    #     wandb.log(results)

print("Predictions complete")

Updating with:
{'config': 'vet_48a', 'fast_dev_run': False, 'batch_size': 32, 'ds_name': 'james-burton/vet_month_1_all_text', 'version': 'record_only', 'model_base': 'SAVSNET/PetBERT', 'tokenizer_base': 'dmis-lab/biobert-v1.1', 'warmup_steps': 500, 'weight_decay': 0.01}


{'config': 'vet_48a', 'fast_dev_run': False, 'do_train': True, 'do_predict': True, 'tags': ['bert', '1 month'], 'batch_size': 32, 'model_base': 'SAVSNET/PetBERT', 'output_root': 'models/', 'num_epochs': 50, 'early_stopping_patience': 3, 'grad_accumulation_steps': 1, 'seed': 42, 'logging_steps': 10, 'lr_scheduler': 'linear', 'warmup_ratio': 0, 'device': 'cuda', 'num_workers': 1, 'resume_from_checkpoint': False, 'predict_batch_size': 16, 'save_total_limit': 1, 'pytorch2.0': True, 'max_length': 512, 'ds_name': 'james-burton/vet_month_1_all_text', 'version': 'record_only', 'tokenizer_base': 'dmis-lab/biobert-v1.1', 'warmup_steps': 500, 'weight_decay': 0.01}



Downloading readme: 100%|██████████| 1.95k/1.95k [00:00<00:00, 3.53MB/s]
Downloading data: 100%|██████████| 747k/747k [00:00<00:00, 2.52MB/s]
Downloading data: 100%|██████████| 2.42M/2.42M [00:00<00:00, 12.6MB/s]
Downloading data: 100%|██████████| 447k/447k [00:00<00:00, 2.94MB/s]]
Downloading data files: 100%|██████████| 3/3 [00:00<00:00,  4.60it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 3287.93it/s]
Generating test split: 100%|██████████| 2184/2184 [00:00<00:00, 256348.57 examples/s]
Generating train split: 100%|██████████| 7206/7206 [00:00<00:00, 458977.91 examples/s]
Generating validation split: 100%|██████████| 1272/1272 [00:00<00:00, 296479.84 examples/s]
Downloading (…)lve/main/config.json: 100%|██████████| 725/725 [00:00<00:00, 1.08MB/s]
Downloading pytorch_model.bin: 100%|██████████| 438M/438M [00:10<00:00, 40.9MB/s] 
Map: 100%|██████████| 2184/2184 [00:01<00:00, 1726.44 examples/s]
Map: 100%|██████████| 7206/7206 [00:04<00:00, 1778.54 examples/s]
Map: 100%|

***** Running Prediction *****


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjameswburton18[0m. Use [1m`wandb login --relogin`[0m to force relogin


early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predictions complete


In [3]:
results

{'test_loss': 0.6556823253631592,
 'test_runtime': 29.7353,
 'test_samples_per_second': 73.448,
 'test_steps_per_second': 2.32,
 'test/accuracy': 0.6016483516483516,
 'test/precision': 0.5671883432271991,
 'test/recall': 0.9392314566577301,
 'test/roc_auc': 0.7030153515672527,
 'test/f1': 0.7072678331090175}

In [4]:
import torch

sean_results = {}
sean_preds = [
    int(torch.sigmoid(torch.tensor(probs))[1].item() >= 0.5) for probs in preds
]
sean_results["test/accuracy"] = np.mean([sp == l for sp, l in zip(sean_preds, labels)])
sean_results["test/precision"] = precision_score(
    labels,
    sean_preds,
    labels=np.arange(2),  # num_labels
    zero_division=0,
)
sean_results["test/recall"] = recall_score(
    labels,
    sean_preds,
    labels=np.arange(2),  # num_labels
    zero_division=0,
)
# sean_results["test/roc_auc"] = roc_auc_score(labels, preds[:, 1])
sean_results["test/f1"] = (
    2
    * sean_results["test/precision"]
    * sean_results["test/recall"]
    / (sean_results["test/precision"] + sean_results["test/recall"])
)

In [5]:
results

{'test_loss': 0.6556823253631592,
 'test_runtime': 29.7353,
 'test_samples_per_second': 73.448,
 'test_steps_per_second': 2.32,
 'test/accuracy': 0.6016483516483516,
 'test/precision': 0.5671883432271991,
 'test/recall': 0.9392314566577301,
 'test/roc_auc': 0.7030153515672527,
 'test/f1': 0.7072678331090175}

In [6]:
sean_results

{'test/accuracy': 0.6034798534798534,
 'test/precision': 0.5681940700808625,
 'test/recall': 0.9419124218051832,
 'test/f1': 0.7088096839273706}

In [13]:
len(dataset["train"]["label"])

7206

In [22]:
7206 | 32

7206

In [32]:
import numpy as np
import random

# Assuming your list is called `label_list`
label_list = dataset["train"]["label"][: 32 * 225]

# Reshape the list into groups of 32
label_array = np.array(label_list)
random.shuffle(label_array)
label_array = label_array.reshape(-1, 32)

# Calculate the average of each group
averages = np.mean(label_array, axis=1)

# Print the averages
print(averages)

[0.3125  0.3125  0.34375 0.46875 0.46875 0.5625  0.53125 0.59375 0.34375
 0.5625  0.6875  0.34375 0.40625 0.40625 0.28125 0.5625  0.5     0.5
 0.5     0.375   0.53125 0.5625  0.5625  0.625   0.59375 0.5     0.4375
 0.59375 0.625   0.375   0.59375 0.375   0.5625  0.40625 0.59375 0.4375
 0.53125 0.34375 0.625   0.46875 0.46875 0.5625  0.40625 0.53125 0.40625
 0.40625 0.53125 0.4375  0.65625 0.625   0.59375 0.46875 0.59375 0.5
 0.53125 0.375   0.5625  0.5     0.5625  0.59375 0.5625  0.28125 0.59375
 0.625   0.46875 0.53125 0.40625 0.5     0.53125 0.59375 0.3125  0.625
 0.65625 0.59375 0.5     0.40625 0.5625  0.5     0.4375  0.59375 0.59375
 0.46875 0.5625  0.65625 0.46875 0.40625 0.65625 0.46875 0.625   0.5625
 0.59375 0.59375 0.46875 0.59375 0.53125 0.53125 0.46875 0.59375 0.4375
 0.5     0.59375 0.53125 0.5625  0.53125 0.375   0.65625 0.375   0.5625
 0.46875 0.59375 0.375   0.375   0.5     0.75    0.34375 0.5     0.5625
 0.40625 0.625   0.40625 0.53125 0.53125 0.5625  0.59375 0.4375  0.

In [29]:
7206 / 32

225.1875

In [8]:
[torch.sigmoid(torch.tensor(probs))[1].item() >= 0.5 for probs in preds]

[True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,


In [9]:
np.mean([torch.sigmoid(torch.tensor(probs))[1].item() >= 0.5 for probs in preds])

0.8493589743589743

In [10]:
dataset["train"]["text"]

['"Hx - mucky R eye for last couple of days. No known trauma, no nasal discharge, no coughing/sneezing lately although prone to this. On PE ++purulent discharge and chemosis. Fl stain negative, no FB. Treat for conjuntivitis. INI in 7 days r/v."',
 '"ongoing mild hyperpnoea, hr 140 but tachycardic runs susp of atrial fibrillation. heart now sounds clear (no muffling of heart sounds) and no adventitious lung sounds. bright, alert, reactive and Defaecating, uninating, drinking, eating normally. abdo nad, mucus membranes pink and crt 2s.  advise initially increase nelio to try to improve contractility - may require further treatment to stabilise heart rhythm. warned prognosis guarded. Next appointment in 1 week."',
 'minor pododermatitis and some pad abrasions, discuss possible allergens but also very likely from exuberant exercise. O has paw balm and I have prescribe apoquel.',
 ". Owner reports noticing some patchiness on <<identifier>>'s ears in the last week.not seen scratching but ha

In [11]:
args["version"]

'record_only'