In [1]:
import pandas as pd
import torch
from torch import cuda
import seqeval
from seqeval.metrics import classification_report, accuracy_score
from transformers import AutoTokenizer, AutoModelForTokenClassification, AdamW, Trainer, TrainingArguments
import matplotlib.pyplot as plt
import evaluate
import numpy as np
from datasets import load_dataset
import ast

In [2]:
# 60 train, 20 dev, 20 test
# Raw per-split CSVs, keyed by split name.
raw_csv_paths = {
    'train': 'data/processed/phee/ace/train_w_test_tag_mapped.csv',
    'dev': 'data/processed/phee/ace/dev_w_test_tag_mapped.csv',
    'test': 'data/processed/phee/ace/test_w_test_tag_mapped.csv',
}
train_df, dev_df, test_df = (
    pd.read_csv(raw_csv_paths[split]) for split in ('train', 'dev', 'test')
)

In [3]:
def _prepare_split(raw_df):
    """Return a two-column frame with whitespace-tokenized sentences and tags.

    Selects ``Sentence`` and ``Med_Tag`` from ``raw_df``, renames them to
    ``sentence`` and ``tag``, and splits each string on whitespace so every
    row holds a list of tokens and a list of tag names.

    The result is a new DataFrame: ``rename`` returns a fresh object, so the
    column assignments below never write into a slice of ``raw_df`` (slicing
    and then using ``inplace=True`` / column assignment on the slice is the
    pattern pandas flags with SettingWithCopyWarning).
    """
    prepared = raw_df[['Sentence', 'Med_Tag']].rename(
        columns={"Sentence": "sentence", "Med_Tag": "tag"}
    )
    for column in ('sentence', 'tag'):
        prepared[column] = prepared[column].apply(lambda text: text.split())
    return prepared


train_df = _prepare_split(train_df)
dev_df = _prepare_split(dev_df)
test_df = _prepare_split(test_df)

# save to csv
# NOTE: the list-valued cells are written as their string form, which is why
# the later `transform` step parses them back with ast.literal_eval.
train_df.to_csv('data/processed/phee/ace/final_train.csv', index=False)
dev_df.to_csv('data/processed/phee/ace/final_dev.csv', index=False)
test_df.to_csv('data/processed/phee/ace/final_test.csv', index=False)

In [4]:
# Use the GPU when torch can see one; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [5]:
# One CSV per split; these are the files written by the preprocessing cell.
split_files = {
    'train': 'data/processed/phee/ace/final_train.csv',
    'validation': 'data/processed/phee/ace/final_dev.csv',
    'test': 'data/processed/phee/ace/final_test.csv',
}
dataset = load_dataset('csv', data_files=split_files)

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [6]:
# Distinct tag names in order of first appearance in the training split.
# dict.fromkeys de-duplicates while keeping insertion order, and avoids
# handing a plain Python list to pd.unique (a call form pandas has deprecated).
unique_tags = list(
    dict.fromkeys(tag for sublist in train_df['tag'] for tag in sublist)
)
# Create label2id dictionary
label2id = {tag: idx for idx, tag in enumerate(unique_tags)}

# Create id2label dictionary by swapping keys with values
id2label = {idx: tag for tag, idx in label2id.items()}

# Print the dictionaries
print("label2id:", label2id)
print("id2label:", id2label)

label2id: {'O': 0, 'I-Treatment': 1, 'I-Test': 2, 'I-Problem': 3, 'I-Background': 4, 'I-Other': 5}
id2label: {0: 'O', 1: 'I-Treatment', 2: 'I-Test', 3: 'I-Problem', 4: 'I-Background', 5: 'I-Other'}


In [7]:
def transform(example_batch):
    """Parse the stringified token/tag lists and encode the tags as ids.

    The 'sentence' and 'tag' fields arrive as strings holding Python list
    literals (they were round-tripped through CSV); both are parsed with
    ``ast.literal_eval``. Tag names are then replaced by their integer ids
    from ``label2id``.

    NOTE(review): despite the parameter name, this is passed to ``map``
    without ``batched=True``, so it presumably receives one example at a time.

    Returns the same mapping object, updated in place.
    """
    example_batch['sentence'] = ast.literal_eval(example_batch['sentence'])
    tag_names = ast.literal_eval(example_batch['tag'])
    example_batch['tag'] = [label2id[name] for name in tag_names]
    return example_batch

# Apply `transform` to every split. The loop variable is deliberately not
# named `type`: at notebook top level that would rebind the builtin `type`
# for every cell executed afterwards.
for split_name in ['train', 'validation', 'test']:
    dataset[split_name] = dataset[split_name].map(transform)

Map:   0%|          | 0/2793 [00:00<?, ? examples/s]

Map:   0%|          | 0/918 [00:00<?, ? examples/s]

Map:   0%|          | 0/968 [00:00<?, ? examples/s]

In [8]:
# Tokenizer belonging to the clinical-NER checkpoint that is fine-tuned below.
tokenizer_checkpoint = "samrawal/bert-base-uncased_clinical-ner"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

In [9]:
# Tokenize the first training sentence (already split into words) and show
# which word each produced token belongs to.
first_sentence = dataset['train'][0]['sentence']
inputs = tokenizer(first_sentence, is_split_into_words=True)
inputs.tokens()
inputs.word_ids()

[None,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 8,
 9,
 10,
 11,
 11,
 12,
 13,
 14,
 15,
 16,
 16,
 17,
 18,
 19,
 20,
 21,
 21,
 21,
 21,
 21,
 22,
 23,
 23,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 31,
 31,
 31,
 31,
 32,
 33,
 34,
 34,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 None]

In [10]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per token.

    Args:
        labels: Sequence of label ids, indexed by word position.
        word_ids: One entry per token: the index of the word the token came
            from, or None for tokens that belong to no word.

    Returns:
        A list as long as ``word_ids``. Tokens with no word get -100; every
        other token (first piece or continuation piece alike) gets the label
        of its word.
    """
    return [
        -100 if word_id is None else labels[word_id]
        for word_id in word_ids
    ]

In [11]:
# Sanity check on the first training example: word-level labels first, then
# the same labels expanded to token level.
word_ids = inputs.word_ids()
labels = dataset['train'][0]['tag']
print(labels)
aligned_labels = align_labels_with_tokens(labels, word_ids)
print(aligned_labels)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 3, 0, 0, 0, 0, 0, 0]
[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 2, 2, 2, 2, 0, 0, 3, 3, 3, 0, 0, 0, 0, 0, 0, -100]


In [12]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: Batch mapping with a "sentence" entry (one word list per
            example) and a "tag" entry (one label-id list per example).

    Returns:
        The tokenizer output for the batch, with an added "labels" entry
        holding, for each example, its labels aligned to the produced tokens
        via ``align_labels_with_tokens``.
    """
    tokenized_inputs = tokenizer(
        examples["sentence"],
        is_split_into_words=True,
        truncation=True,
    )
    tokenized_inputs["labels"] = [
        align_labels_with_tokens(word_labels, tokenized_inputs.word_ids(idx))
        for idx, word_labels in enumerate(examples["tag"])
    ]
    return tokenized_inputs

In [13]:
from transformers import DataCollatorForTokenClassification

# Collator that turns a list of tokenized examples into one batch; it is used
# by both the Trainer and the manual DataLoaders further down.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [14]:
# Tokenize every split in batches and drop the original columns, so only the
# tokenizer outputs and the aligned labels remain.
raw_columns = dataset['train'].column_names
tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    remove_columns=raw_columns,
    batched=True,
)

Map:   0%|          | 0/2793 [00:00<?, ? examples/s]

Map:   0%|          | 0/918 [00:00<?, ? examples/s]

Map:   0%|          | 0/968 [00:00<?, ? examples/s]

In [15]:
# Show the columns of each split after tokenization.
tokenized_datasets.column_names

{'train': ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
 'validation': ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
 'test': ['input_ids', 'token_type_ids', 'attention_mask', 'labels']}

In [16]:
# Collate the first two training examples and look at the batched labels.
first_two_examples = [tokenized_datasets['train'][idx] for idx in (0, 1)]
batch = data_collator(first_two_examples)
batch["labels"]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    1,    0,    0,    2,    2,    2,    2,    2,    0,    0,
            3,    3,    3,    0,    0,    0,    0,    0,    0, -100],
        [-100,    0,    2,    2,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    4,    4,    4,    4,    0,    0,    0,    1,    1,    1,
            1,    3,    3,    3,    0,    0,    0,    0,    0,    3,    3,    3,
            3,    3,    3,    0,    0,    0, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]])

In [17]:
# seqeval metric loaded through the `evaluate` library; compute_metrics calls
# metric.compute on it.
metric = evaluate.load("seqeval")

In [18]:
def compute_metrics(eval_preds):
    """Compute seqeval metrics for one evaluation pass and dump a text report.

    Args:
        eval_preds: A pair ``(logits, labels)``. The arg-max over the last
            axis of ``logits`` is taken as the predicted label id. Positions
            whose label is -100 are excluded from both the predictions and
            the references.

    Returns:
        dict with keys ``precision``, ``recall``, ``f1`` and ``accuracy``,
        taken from the ``overall_*`` entries of the seqeval result.

    Side effects:
        Overwrites ``analysis/reports/all_metrics.txt`` on every call with
        the predictions, the references, the raw metric dict and the
        classification report. The directory is created when missing, so
        evaluation does not fail with FileNotFoundError on a fresh checkout.
    """
    import os  # local import so this cell does not rely on a top-level `import os`

    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

    report = classification_report(y_true=true_labels, y_pred=true_predictions)

    # save all_metrics to file
    report_path = "analysis/reports/all_metrics.txt"
    os.makedirs(os.path.dirname(report_path), exist_ok=True)
    with open(report_path, "w") as f:
        # One section per line so the dump can be read back apart; previously
        # the three reprs were written back to back with no separator.
        f.write(str(true_predictions) + "\n")
        f.write(str(true_labels) + "\n")
        f.write(str(all_metrics) + "\n")
        # save report to file
        f.write(report)

    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [19]:
# Load the clinical-NER checkpoint with this notebook's label maps.
# ignore_mismatched_sizes=True lets loading proceed although the checkpoint's
# classifier head has a different number of outputs than this label set; the
# mismatched classifier weights are newly initialized (see the loader message
# printed below this cell).
model = AutoModelForTokenClassification.from_pretrained(
    'samrawal/bert-base-uncased_clinical-ner',
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at samrawal/bert-base-uncased_clinical-ner and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([7, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([7]) in the checkpoint and torch.Size([6]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# from huggingface_hub import notebook_login

# notebook_login()

In [21]:
from transformers import TrainingArguments

# args = TrainingArguments(
#     "bert-finetuned-ner",
#     overwrite_output_dir=True,
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     learning_rate=2e-5,
#     num_train_epochs=5,
#     weight_decay=0.01,
# )

In [22]:


# Training configuration for the Trainer run in the next cell.
args = TrainingArguments(
    "bert-finetuned-ner",
    overwrite_output_dir=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate, checkpoint and log once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # NOTE(review): no metric_for_best_model is given, so the library default
    # decides which checkpoint counts as "best" — confirm that is the
    # intended selection criterion.
    load_best_model_at_end=True,
    learning_rate=1e-5,
    num_train_epochs=10,
    logging_strategy="epoch",
)

In [23]:
from transformers import Trainer

# Fine-tune with the Trainer API: trains on the train split, evaluates on the
# validation split, and reports the values returned by compute_metrics.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
# Last expression of the cell, so the returned training summary is displayed.
trainer.train()



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.72,0.468367,0.481237,0.545424,0.511324,0.841077
2,0.4008,0.405371,0.529126,0.593399,0.559423,0.858426
3,0.3424,0.401999,0.541692,0.614495,0.575801,0.859112
4,0.3026,0.376611,0.562386,0.628785,0.593735,0.868918
5,0.2673,0.383503,0.565112,0.63491,0.597981,0.869467
6,0.2458,0.386023,0.562725,0.637972,0.597991,0.869673
7,0.2251,0.395532,0.556808,0.638653,0.594929,0.869124
8,0.2136,0.397743,0.559715,0.641034,0.597621,0.869364
9,0.202,0.40197,0.562743,0.639333,0.598598,0.870564
10,0.1971,0.402319,0.563025,0.638312,0.59831,0.870393


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=880, training_loss=0.31166449893604625, metrics={'train_runtime': 290.8246, 'train_samples_per_second': 96.037, 'train_steps_per_second': 3.026, 'total_flos': 1004593751221392.0, 'train_loss': 0.31166449893604625, 'epoch': 10.0})

In [24]:
# trainer.push_to_hub(commit_message="Training complete")

In [25]:
from torch.utils.data import DataLoader

# Both loaders share the collator and batch size; only the training loader
# shuffles.
loader_kwargs = dict(collate_fn=data_collator, batch_size=8)
train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, **loader_kwargs
)
eval_dataloader = DataLoader(tokenized_datasets["validation"], **loader_kwargs)

In [26]:
# Rebind `model` to a fresh copy loaded from the checkpoint for the manual
# (Accelerate) training path. After this cell the name no longer refers to
# the model object that was passed to the Trainer above.
model = AutoModelForTokenClassification.from_pretrained(
    'samrawal/bert-base-uncased_clinical-ner',
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at samrawal/bert-base-uncased_clinical-ner and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([7, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([7]) in the checkpoint and torch.Size([6]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# NOTE: this import rebinds `AdamW`, which the first cell also imports from
# `transformers`; from here on the name refers to torch.optim.AdamW.
from torch.optim import AdamW

# Optimizer over all parameters of the freshly reloaded model.
optimizer = AdamW(model.parameters(), lr=2e-5)

In [28]:
from accelerate import Accelerator

# Hand the model, optimizer and both dataloaders to Accelerate; the returned
# objects replace the originals under the same names.
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [29]:
from transformers import get_scheduler

# Total number of scheduler steps = epochs * batches per epoch.
# NOTE(review): this assumes one optimizer step per batch (no gradient
# accumulation) — confirm against the training loop.
num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# "linear" schedule with no warmup steps, spanning the whole run.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [30]:
from huggingface_hub import Repository, get_full_repo_name

# Resolve the full Hub repository name for this model.
# NOTE: the recorded output of this cell is an "Invalid user token" HTTPError,
# i.e. the call needs a valid Hugging Face login before it can succeed.
model_name = "bert-finetuned-ner-accelerate"
repo_name = get_full_repo_name(model_name)
repo_name

HTTPError: Invalid user token. If you didn't pass a user token, make sure you are properly logged in by executing `huggingface-cli login`, and if you did pass a user token, double-check it's correct.

In [None]:
# Clone the Hub repository into the local output directory.
# NOTE: depends on `repo_name` from the previous cell; the recorded output
# shows this cell failing because git-lfs is not installed.
output_dir = "bert-finetuned-ner-accelerate"
repo = Repository(output_dir, clone_from=repo_name)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


OSError: Looks like you do not have git-lfs installed, please install. You can install from https://git-lfs.github.com/. Then run `git lfs install` (you only have to do this once).

In [None]:
def postprocess(predictions, labels):
    """Convert predicted and reference label ids to label names.

    Both arguments are detached, moved to the CPU, cloned and converted to
    numpy arrays. Positions whose reference label is -100 are dropped from
    both outputs; the remaining ids are mapped through ``id2label``.

    Returns:
        ``(true_labels, true_predictions)`` — two lists of per-sequence
        label-name lists.
    """
    pred_rows = predictions.detach().cpu().clone().numpy()
    gold_rows = labels.detach().cpu().clone().numpy()

    # Remove ignored index (special tokens) and convert to labels
    true_labels = []
    for gold_row in gold_rows:
        true_labels.append([id2label[gold] for gold in gold_row if gold != -100])

    true_predictions = []
    for pred_row, gold_row in zip(pred_rows, gold_rows):
        kept = [
            id2label[pred]
            for pred, gold in zip(pred_row, gold_row)
            if gold != -100
        ]
        true_predictions.append(kept)

    return true_labels, true_predictions