In [2]:
import en_ner_bc5cdr_md

In [3]:
import data_utils
from data_utils import load_dataset_assertion_classification


train, val, test = load_dataset_assertion_classification(frac=0.99,train_size=0.8,test_size=0.1)

TypeError: issubclass() arg 1 must be a class

In [2]:
y_train,X_train = train
y_valid,X_valid = val
y_test,X_test = test

In [3]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

print("Encoding Labels .....")
encoder = LabelEncoder()
encoder.fit(y_train)
y_train_encode = np.asarray(encoder.transform(y_train))
y_valid_encode = np.asarray(encoder.transform(y_valid))
y_test_encode = np.asarray(encoder.transform(y_test))

Encoding Labels .....


In [4]:
import pandas as pd
from transformers import AutoTokenizer, DataCollatorWithPadding
from datasets import Dataset, DatasetDict


train_df = pd.DataFrame(X_train,columns=['sentence'])
valid_df = pd.DataFrame(X_valid,columns=['sentence'])
test_df = pd.DataFrame(X_test,columns=['sentence'])

train_df['label'] = y_train_encode.tolist()
valid_df['label'] = y_valid_encode.tolist()
test_df['label'] = y_test_encode.tolist()

print(train_df.head())

ds = DatasetDict ({
 'train': Dataset.from_pandas(train_df),
 'validation': Dataset.from_pandas(valid_df),
 'test': Dataset.from_pandas(test_df)
})

  from .autonotebook import tqdm as notebook_tqdm


                                            sentence  label
0  she was found to be [entity] febrile [entity] ...      2
1  left main coronary artery was narrowed 50 60 l...      2
2  discharge disposition expired discharge diagno...      2
3  brief hospital course a p 70 y o male with h o...      2
4                     [entity] hypertension [entity]      2


In [5]:
ds

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 4327
    })
    validation: Dataset({
        features: ['sentence', 'label'],
        num_rows: 1082
    })
    test: Dataset({
        features: ['sentence', 'label'],
        num_rows: 602
    })
})

In [6]:
import torch 
# setting device on GPU if available, else CPU
#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')

## Full and head fine tuning

In [7]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoModel 
tokenizer_clinical_bio  = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_Discharge_Summary_BERT",model_max_length=150)
model_clinical = AutoModelForSequenceClassification.from_pretrained("emilyalsentzer/Bio_Discharge_Summary_BERT", 
                                                                    num_labels=3,id2label={0: 'PRESENT', 1: 'ABSENT', 2:'POSSIBLE'})

Some weights of the model checkpoint at emilyalsentzer/Bio_Discharge_Summary_BERT were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from

In [8]:
model_clinical = model_clinical.to(device)

In [9]:
special_tokens_dict = {"additional_special_tokens": ["[entity]"]}
num_added_toks = tokenizer_clinical_bio.add_special_tokens(special_tokens_dict,False)

print("We have added", num_added_toks, "tokens")
# Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
model_clinical.resize_token_embeddings(len(tokenizer_clinical_bio))

We have added 1 tokens


Embedding(28997, 768)

In [10]:
def tokenize_function(example):
    return tokenizer_clinical_bio(example["sentence"],   padding="max_length", truncation=True)

In [11]:
tokenized_ds = ds.map(tokenize_function, batched=True)
tokenized_ds = tokenized_ds.rename_column("label", "labels")
tokenized_ds = tokenized_ds.remove_columns(["sentence"])
#tokenized_ds = tokenized_ds.remove_columns(["__index_level_0__"])
tokenized_ds.set_format("torch")

Map:   0%|          | 0/4327 [00:00<?, ? examples/s]Map: 100%|██████████| 4327/4327 [00:00<00:00, 11759.87 examples/s]
Map: 100%|██████████| 1082/1082 [00:00<00:00, 17197.52 examples/s]
Map: 100%|██████████| 602/602 [00:00<00:00, 18337.82 examples/s]


In [12]:
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4327
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1082
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 602
    })
})

In [13]:
import numpy as np
import evaluate

def compute_metrics(eval_pred):
    metric = evaluate.load("accuracy")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [14]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="clinbert_trainer", evaluation_strategy="epoch", learning_rate=1e-5, num_train_epochs=1,)

trainer = Trainer(
    model=model_clinical,
    args=training_args,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['validation'],
    compute_metrics=compute_metrics,
)

In [15]:
trainer.train()

***** Running training *****
  Num examples = 4327
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 541
  Number of trainable parameters = 108313347
 92%|█████████▏| 500/541 [10:02<00:46,  1.14s/it]Saving model checkpoint to clinbert_trainer/checkpoint-500
Configuration saved in clinbert_trainer/checkpoint-500/config.json


{'loss': 0.4864, 'learning_rate': 7.578558225508319e-07, 'epoch': 0.92}


Model weights saved in clinbert_trainer/checkpoint-500/pytorch_model.bin
100%|██████████| 541/541 [10:50<00:00,  1.14s/it]***** Running Evaluation *****
  Num examples = 1082
  Batch size = 8
                                                 
100%|██████████| 541/541 [11:31<00:00,  1.14s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 541/541 [11:31<00:00,  1.28s/it]

{'eval_loss': 0.37423527240753174, 'eval_accuracy': 0.88909426987061, 'eval_runtime': 41.1912, 'eval_samples_per_second': 26.268, 'eval_steps_per_second': 3.302, 'epoch': 1.0}
{'train_runtime': 691.5262, 'train_samples_per_second': 6.257, 'train_steps_per_second': 0.782, 'train_loss': 0.47430734934075264, 'epoch': 1.0}





TrainOutput(global_step=541, training_loss=0.47430734934075264, metrics={'train_runtime': 691.5262, 'train_samples_per_second': 6.257, 'train_steps_per_second': 0.782, 'train_loss': 0.47430734934075264, 'epoch': 1.0})

In [16]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 1082
  Batch size = 8
100%|██████████| 136/136 [00:40<00:00,  3.39it/s]


{'eval_loss': 0.37423527240753174,
 'eval_accuracy': 0.88909426987061,
 'eval_runtime': 40.4263,
 'eval_samples_per_second': 26.765,
 'eval_steps_per_second': 3.364,
 'epoch': 1.0}

## Adapter

In [17]:
from transformers import BertConfig, AutoAdapterModel,AutoTokenizer

tokenizer_clinical_bio  = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_Discharge_Summary_BERT",model_max_length=150)
model_clinical_adapter = AutoAdapterModel.from_pretrained("emilyalsentzer/Bio_Discharge_Summary_BERT", 
                                                                    num_labels=3,id2label={0: 'PRESENT', 1: 'ABSENT', 2:'POSSIBLE'})


model_clinical_adapter.add_adapter("i2b2-ast")
model_clinical_adapter.train_adapter("i2b2-ast")

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /Users/mayerantoine/.cache/huggingface/hub/models--emilyalsentzer--Bio_Discharge_Summary_BERT/snapshots/affde836a50e4d333f15dae9270f5a856d59540b/config.json
Model config BertConfig {
  "_name_or_path": "emilyalsentzer/Bio_Discharge_Summary_BERT",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading file vocab.txt from cache at /Users/mayerantoine/.cache/huggingface/hub/models--emilyal

In [18]:
model_clinical_adapter.add_classification_head("i2b2-ast", num_labels=3)
model_clinical_adapter.set_active_adapters("i2b2-ast")

Adding head 'i2b2-ast' with config {'head_type': 'classification', 'num_labels': 3, 'layers': 2, 'activation_function': 'tanh', 'label2id': {'LABEL_0': 0, 'LABEL_1': 1, 'LABEL_2': 2}, 'use_pooler': False, 'bias': True}.


In [19]:
model_clinical_adapter = model_clinical_adapter.to(device)

In [20]:
def tokenize_function(example):
    return tokenizer_clinical_bio(example["sentence"],   padding="max_length", truncation=True)

In [21]:
tokenized_ds = ds.map(tokenize_function, batched=True)
tokenized_ds = tokenized_ds.rename_column("label", "labels")
tokenized_ds = tokenized_ds.remove_columns(["sentence"])
#tokenized_ds = tokenized_ds.remove_columns(["__index_level_0__"])
tokenized_ds.set_format("torch")

Map:   0%|          | 0/4327 [00:00<?, ? examples/s]Map: 100%|██████████| 4327/4327 [00:00<00:00, 16621.79 examples/s]
Map: 100%|██████████| 1082/1082 [00:00<00:00, 17967.52 examples/s]
Map: 100%|██████████| 602/602 [00:00<00:00, 15993.28 examples/s]


In [22]:
special_tokens_dict = {"additional_special_tokens": ["[entity]"]}
num_added_toks = tokenizer_clinical_bio.add_special_tokens(special_tokens_dict,False)

print("We have added", num_added_toks, "tokens")
# Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
model_clinical_adapter.resize_token_embeddings(len(tokenizer_clinical_bio))

Assigning ['[entity]'] to the additional_special_tokens key of the tokenizer


We have added 1 tokens


Embedding(28997, 768)

In [23]:
import numpy as np
from transformers import TrainingArguments, AdapterTrainer, EvalPrediction

adaptraining_args = TrainingArguments(
    learning_rate=1e-4,
    evaluation_strategy="epoch",
    num_train_epochs=1,
    output_dir="./clinical_bert_adapter",
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
   # remove_unused_columns=False,
)


adaptrainer = AdapterTrainer(
    model=model_clinical_adapter,
    args=adaptraining_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [24]:
adaptrainer.train()

***** Running training *****
  Num examples = 4327
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 541
  Number of trainable parameters = 23757123
 92%|█████████▏| 500/541 [07:28<00:37,  1.10it/s]Saving model checkpoint to ./clinical_bert_adapter/checkpoint-500
Configuration saved in ./clinical_bert_adapter/checkpoint-500/i2b2-ast/adapter_config.json
Module weights saved in ./clinical_bert_adapter/checkpoint-500/i2b2-ast/pytorch_adapter.bin
Configuration saved in ./clinical_bert_adapter/checkpoint-500/i2b2-ast/head_config.json
Module weights saved in ./clinical_bert_adapter/checkpoint-500/i2b2-ast/pytorch_model_head.bin
Configuration saved in ./clinical_bert_adapter/checkpoint-500/i2b2-ast/head_config.json
Module weights saved in ./clinical_bert_adapter/checkpoint-500/i2b2-ast/pytorch_model_head.bin


{'loss': 0.4497, 'learning_rate': 7.578558225508318e-06, 'epoch': 0.92}


100%|██████████| 541/541 [08:06<00:00,  1.16it/s]***** Running Evaluation *****
  Num examples = 1082
  Batch size = 8
                                                 
100%|██████████| 541/541 [08:50<00:00,  1.16it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 541/541 [08:50<00:00,  1.02it/s]

{'eval_loss': 0.3326496183872223, 'eval_accuracy': 0.8937153419593346, 'eval_runtime': 44.2612, 'eval_samples_per_second': 24.446, 'eval_steps_per_second': 3.073, 'epoch': 1.0}
{'train_runtime': 530.3413, 'train_samples_per_second': 8.159, 'train_steps_per_second': 1.02, 'train_loss': 0.4394985858261475, 'epoch': 1.0}





TrainOutput(global_step=541, training_loss=0.4394985858261475, metrics={'train_runtime': 530.3413, 'train_samples_per_second': 8.159, 'train_steps_per_second': 1.02, 'train_loss': 0.4394985858261475, 'epoch': 1.0})

In [25]:
model_clinical_adapter.save_all_adapters("./clinical_bert_adapter")

Configuration saved in ./clinical_bert_adapter/i2b2-ast/adapter_config.json
Module weights saved in ./clinical_bert_adapter/i2b2-ast/pytorch_adapter.bin
Configuration saved in ./clinical_bert_adapter/i2b2-ast/head_config.json
Module weights saved in ./clinical_bert_adapter/i2b2-ast/pytorch_model_head.bin
