In [3]:
import pandas as pd
import torch
from torch import cuda
import seqeval
from seqeval.metrics import classification_report, accuracy_score
from transformers import AutoTokenizer, AutoModelForTokenClassification, AdamW

In [4]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [59]:
# Pre-split data files; per the original note the split is
# 60% train / 20% dev / 20% test.
train_df, dev_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/processed/phee/ace/train_w_test_tag_mapped.csv',
        'data/processed/phee/ace/dev_w_test_tag_mapped.csv',
        'data/processed/phee/ace/test_w_test_tag_mapped.csv',
    )
)

In [60]:
# Keep only the text and tag columns, under the names the rest of the notebook
# uses. Renaming first and selecting afterwards makes this cell safe to re-run:
# rename() ignores labels that are no longer present, so a second execution is
# a no-op instead of a KeyError on 'Sentence'. It also avoids an in-place
# rename on a slice of the loaded frame.
train_df = train_df.rename(columns={"Sentence": "sentence", "Med_Tag": "tag"})[['sentence', 'tag']]
train_df

Unnamed: 0,sentence,tag
0,objective : to test the hypothesis that tumor ...,O O O O O O O O O O O O O O O O O O O O O O O ...
1,an evaluation of ovarian structure and functio...,O I-Test I-Test O O O O O O O O I-Background I...
2,phenobarbital hepatotoxicity in an 8 - month -...,I-Treatment I-Problem O I-Background I-Backgro...
3,the authors report a case of balint syndrome w...,O O O O O O I-Problem I-Problem I-Problem I-Pr...
4,"according to the naranjo probability scale , f...",O O O O I-Test I-Test O I-Treatment O O O O O ...
...,...,...
3001,l - t4 stimulated lymphocyte transformation in...,I-Treatment I-Treatment I-Treatment O I-Proble...
3002,a 53 - year - old man developed lower leg edem...,I-Background I-Background I-Background I-Backg...
3003,a mentally retarded 23 - year - old woman with...,I-Background I-Problem I-Problem I-Background ...
3004,"after 5 days of treatment with il - 2 , the pa...",I-Other I-Treatment I-Treatment I-Treatment I-...


In [61]:
# Keep only the text and tag columns, under the names the rest of the notebook
# uses. Renaming first and selecting afterwards makes this cell safe to re-run:
# rename() ignores labels that are no longer present, so a second execution is
# a no-op instead of a KeyError on 'Sentence'. It also avoids an in-place
# rename on a slice of the loaded frame.
dev_df = dev_df.rename(columns={"Sentence": "sentence", "Med_Tag": "tag"})[['sentence', 'tag']]
dev_df

Unnamed: 0,sentence,tag
0,physicians should be aware that plp can occur ...,O O O O O I-Problem O O O O O I-Treatment O
1,heparin - dependent antibodies and thrombosis ...,O O O O O O O I-Treatment O O I-Problem O
2,treatment of lithium tremor with metoprolol .,O O I-Treatment I-Problem O I-Treatment O
3,"a 54 - year - old man , treated with amiodaron...",I-Background I-Background I-Background I-Backg...
4,gynaecomastia is a rarely reported adverse dru...,I-Problem O O O O O O O O O I-Treatment I-Trea...
...,...,...
998,successful challenge with clozapine in a histo...,O O O I-Treatment O O O O I-Problem O
999,case summary : a 57 - year - old female with c...,O O O I-Background I-Background I-Background I...
1000,acute intravascular hemolysis developed when a...,I-Problem I-Problem I-Problem O O I-Background...
1001,intravitreal triamcinolone may have had an inf...,I-Treatment I-Treatment O O O O O O O I-Proble...


In [7]:
# Tag vocabulary taken from the training split, numbered in order of first
# appearance. id2label is built first and label2id is its inverse.
id2label = dict(enumerate(train_df['tag'].str.split().explode().unique()))
label2id = {tag_name: tag_id for tag_id, tag_name in id2label.items()}
label2id

{'O': 0,
 'I-Treatment': 1,
 'I-Test': 2,
 'I-Problem': 3,
 'I-Background': 4,
 'I-Other': 5}

In [8]:
# Hyperparameters, collected in one place.
# Token budget per sentence (process_sentence pads to this same length).
MAX_LEN = 128
# Batch sizes. NOTE(review): the loops in the cells shown feed one sentence
# per step and do not read these two values - confirm before relying on them.
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-5
# Presumably the max_norm for gradient clipping - TODO confirm where it is applied.
MAX_GRAD_NORM = 10

In [51]:
# Tokenizer and model come from the same checkpoint. The classification head
# is re-created for this notebook's label set; ignore_mismatched_sizes lets the
# load proceed even though the checkpoint's head has a different shape.
CHECKPOINT = "samrawal/bert-base-uncased_clinical-ner"
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
model = AutoModelForTokenClassification.from_pretrained(
    CHECKPOINT,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)
model.to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at samrawal/bert-base-uncased_clinical-ner and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([7, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([7]) in the checkpoint and torch.Size([6]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [10]:
# padding='max_length' pads every encoding up to max_length; truncation=True
# cuts longer sentences down to it, so every example has the same length.
def process_sentence(sentence, tag, max_length=None):
    """Encode one sentence and build a token-level label tensor aligned with it.

    Args:
        sentence: Whitespace-separated words.
        tag: Whitespace-separated labels, one per word of ``sentence``; each
            label must be a key of ``label2id``.
        max_length: Length every example is padded/truncated to. ``None``
            (the default) means ``MAX_LEN``.

    Returns:
        ``(inputs, token_labels)``: ``inputs`` is a one-element list holding
        the tokenizer output as PyTorch tensors, and ``token_labels`` is a
        ``(1, max_length)`` tensor of label ids.
    """
    if max_length is None:
        max_length = MAX_LEN

    inputs = [tokenizer(sentence, add_special_tokens=True, padding='max_length', truncation=True, max_length=max_length, return_tensors="pt", return_attention_mask=True)]

    outside_id = label2id['O']
    token_labels = []
    # zip stops at the shorter sequence; positions without a tag end up
    # labelled 'O' by the padding step below.
    for word, label in zip(sentence.split(), tag.split()):
        # Assign the same label to all subwords of a word
        token_labels.extend([label2id[label]] * len(tokenizer.tokenize(word)))

    # Mirror the tokenizer's truncation: keep room for [CLS] and [SEP] so the
    # labels never grow longer than the encoded input.
    token_labels = token_labels[:max_length - 2]
    # Using 'O' as dummy labels for [CLS] and [SEP]
    token_labels = [outside_id] + token_labels + [outside_id]
    # Using 'O' label for padding positions as well
    token_labels += [outside_id] * (max_length - len(token_labels))

    # Leading batch dimension of 1 to match the tokenizer output
    return inputs, torch.tensor([token_labels])

In [11]:
def tokenize(df):
    """Run ``process_sentence`` over every row of ``df``.

    ``df`` must provide 'sentence' and 'tag' columns. Returns two parallel
    lists: the per-sentence encodings and the per-sentence label tensors.
    """
    processed = [
        process_sentence(text, tags)
        for text, tags in zip(df['sentence'], df['tag'])
    ]
    input_list = [encoding for encoding, _ in processed]
    label_list = [label_tensor for _, label_tensor in processed]
    return input_list, label_list


In [16]:
# Sanity check of seqeval's accuracy_score on nested lists: five of the six
# items agree, so the expected value is 5/6.
results = accuracy_score(
    y_true=[[0, 1, 2], [9, 2, 0]],
    y_pred=[[0, 1, 2], [9, 2, 3]],
)
results

0.8333333333333334

In [25]:
# Encode every training sentence; `inputs` and `labels` are parallel lists
# with one entry per row of train_df.
inputs, labels = tokenize(train_df)

In [28]:
# Token ids of the first training sentence: first example -> its single
# tokenizer output -> first (only) row of the batch dimension.
inputs[0][0]['input_ids'][0]

tensor([  101,  7863,  1024,  2000,  3231,  1996, 10744,  2008, 13656, 26785,
        29166,  5387,  1006, 28286,  2546,  1007,  1011,  6541,  2089,  2865,
         2618,  1996,  3279,  1998,  1996,  2139,  4305, 12494,  4765, 18963,
         1997,  4942, 12690, 17191,  6638,  8153,  1999,  1996, 22597,  1011,
        10572,  5423, 16503, 18981,  4048,  2229,  1997,  1037, 22939, 20915,
         2594,  5776,  2040,  3591,  4866, 22520,  1012,   102,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [114]:
def evaluate(df, model, type):
    """Score ``model`` on ``df`` and print the mean loss and token accuracy.

    Args:
        df: DataFrame with 'sentence' and 'tag' columns (handed to ``tokenize``).
        model: Token-classification model, called with ``input_ids``,
            ``attention_mask`` and ``labels``.
        type: Name of the split, used only as a prefix in the printed messages
            (e.g. 'Validation'). It shadows the builtin, but the parameter name
            is kept so existing callers keep working.

    Returns:
        ``(predictions, references)``: two lists holding one list of label
        strings per sentence.
    """
    # NOTE(review): this notebook defines `evaluate` in two cells; keep them in
    # sync or delete one of them.
    all_inputs, all_labels = tokenize(df)
    total_loss = 0.0
    nb_steps = 0
    predictions = []
    references = []

    # Switch dropout off while scoring and restore the caller's mode afterwards,
    # so a training loop that calls this between epochs keeps training.
    was_training = model.training
    model.eval()
    try:
        for idx, (inputs, labels) in enumerate(zip(all_inputs, all_labels)):
            ids = inputs[0]['input_ids'].to(device)
            mask = inputs[0]['attention_mask'].to(device)
            labels = labels.to(device)
            with torch.no_grad():
                outputs = model(input_ids=ids, attention_mask=mask, labels=labels)

            preds = torch.argmax(outputs.logits, dim=2)

            # One bulk transfer per tensor instead of one .item() call per token
            predictions.append([id2label[i] for i in preds.flatten().tolist()])
            references.append([id2label[i] for i in labels.flatten().tolist()])

            # Accumulate into a separate float; the per-step loss tensor must
            # not double as the running total.
            total_loss += outputs.loss.item()
            nb_steps += 1

            if idx > 0 and (idx % 1000 == 0):
                loss_step = total_loss / nb_steps
                print(f"{type} loss per 1000 training steps: {loss_step}")
    finally:
        model.train(was_training)

    # An empty frame yields NaN metrics instead of a ZeroDivisionError
    if nb_steps:
        epoch_loss = total_loss / nb_steps
        # Accuracy is taken over every position of every sequence, including
        # the special-token and padding positions that process_sentence labels 'O'.
        accuracy = accuracy_score(y_pred=predictions, y_true=references)
    else:
        epoch_loss = float('nan')
        accuracy = float('nan')

    print(f"{type} loss: {epoch_loss}")
    print(f"{type} accuracy: {accuracy}")

    return predictions, references


In [52]:
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are pinned to the defaults transformers.AdamW used, and the
# learning rate comes from the config cell instead of a repeated literal.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-6, weight_decay=0.0)



In [115]:
def train(all_inputs, all_labels, epochs=5):
    """Fine-tune the global ``model`` one sentence per step, validating after each epoch.

    Args:
        all_inputs: Per-sentence encodings, as returned by ``tokenize``.
        all_labels: Per-sentence label tensors, as returned by ``tokenize``.
        epochs: Number of passes over the data. Defaults to 5, the value that
            used to be hard-coded in the loop.

    Prints the running loss every 1000 steps, the mean loss and token accuracy
    of each epoch, and the validation metrics reported by ``evaluate`` on
    ``dev_df``. Returns ``None``; the fine-tuned weights live in ``model``.
    """
    for epoch in range(epochs):
        # Re-enter training mode at the start of every epoch so the validation
        # pass that closes the previous epoch can never leave dropout disabled.
        model.train()
        tr_loss = 0.0
        nb_tr_steps = 0
        predictions = []
        references = []
        for idx, (inputs, labels) in enumerate(zip(all_inputs, all_labels)):
            ids = inputs[0]['input_ids'].to(device)
            mask = inputs[0]['attention_mask'].to(device)
            labels = labels.to(device)

            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss = outputs.loss
            preds = torch.argmax(outputs.logits, dim=2)

            # One bulk transfer per tensor instead of one .item() call per token
            predictions.append([id2label[i] for i in preds.flatten().tolist()])
            references.append([id2label[i] for i in labels.flatten().tolist()])

            tr_loss += loss.item()
            nb_tr_steps += 1

            if idx > 0 and (idx % 1000 == 0):
                loss_step = tr_loss / nb_tr_steps
                print(f"Training loss per 1000 training steps: {loss_step}")

            # backward pass
            optimizer.zero_grad()
            loss.backward()
            # Gradient clipping is intentionally not applied here. If it is
            # enabled later it belongs at this point, between backward() and
            # step(), e.g. torch.nn.utils.clip_grad_norm_(model.parameters(),
            # max_norm=MAX_GRAD_NORM).

            # Update weights
            optimizer.step()

        # Empty input yields NaN metrics instead of a ZeroDivisionError
        if nb_tr_steps:
            epoch_loss = tr_loss / nb_tr_steps
            tr_accuracy = accuracy_score(y_pred=predictions, y_true=references)
        else:
            epoch_loss = float('nan')
            tr_accuracy = float('nan')
        print(f"Training loss epoch: {epoch_loss}")
        print(f"Training accuracy epoch: {tr_accuracy}")

        # Validation metrics are printed by evaluate(); its return value is not needed here
        evaluate(dev_df, model, 'Validation')

In [116]:
# Encode the training split and start fine-tuning on it.
inputs, labels = tokenize(train_df)
train(inputs, labels)

Training loss per 1000 training steps: 0.022609295125710072


KeyboardInterrupt: 

In [95]:
# Encode the dev split; all_inputs and all_labels are parallel lists with one
# entry per row of dev_df.
all_inputs, all_labels = tokenize(dev_df)


In [96]:
# Number of encoded dev sentences.
len(all_inputs)

1003

In [97]:
# Number of dev label tensors; should equal the number of encodings.
len(all_labels)

1003

In [103]:

# Total number of label positions across the dev split: every element of every
# label tensor is mapped through id2label (so an unknown id would raise) and counted.
print(len([id2label[a.item()] for label in all_labels for l in label for a in l]))

# for label in all_labels:
#     for l in label:
#         print(l)
     
# #         print(l)


128384


In [112]:
def evaluate(df, model, type):
    """Score ``model`` on ``df`` and print the mean loss and token accuracy.

    Args:
        df: DataFrame with 'sentence' and 'tag' columns (handed to ``tokenize``).
        model: Token-classification model, called with ``input_ids``,
            ``attention_mask`` and ``labels``.
        type: Name of the split, used only as a prefix in the printed messages
            (e.g. 'Validation'). It shadows the builtin, but the parameter name
            is kept so existing callers keep working.

    Returns:
        ``(predictions, references)``: two lists holding one list of label
        strings per sentence.
    """
    # NOTE(review): this notebook defines `evaluate` in two cells; keep them in
    # sync or delete one of them.
    all_inputs, all_labels = tokenize(df)
    total_loss = 0.0
    nb_steps = 0
    predictions = []
    references = []

    # Switch dropout off while scoring and restore the caller's mode afterwards,
    # so a training loop that calls this between epochs keeps training.
    was_training = model.training
    model.eval()
    try:
        for idx, (inputs, labels) in enumerate(zip(all_inputs, all_labels)):
            ids = inputs[0]['input_ids'].to(device)
            mask = inputs[0]['attention_mask'].to(device)
            labels = labels.to(device)
            with torch.no_grad():
                outputs = model(input_ids=ids, attention_mask=mask, labels=labels)

            preds = torch.argmax(outputs.logits, dim=2)

            # One bulk transfer per tensor instead of one .item() call per token
            predictions.append([id2label[i] for i in preds.flatten().tolist()])
            references.append([id2label[i] for i in labels.flatten().tolist()])

            # Accumulate into a separate float; the per-step loss tensor must
            # not double as the running total.
            total_loss += outputs.loss.item()
            nb_steps += 1

            if idx > 0 and (idx % 1000 == 0):
                loss_step = total_loss / nb_steps
                print(f"{type} loss per 1000 training steps: {loss_step}")
    finally:
        model.train(was_training)

    # An empty frame yields NaN metrics instead of a ZeroDivisionError
    if nb_steps:
        epoch_loss = total_loss / nb_steps
        # Accuracy is taken over every position of every sequence, including
        # the special-token and padding positions that process_sentence labels 'O'.
        accuracy = accuracy_score(y_pred=predictions, y_true=references)
    else:
        epoch_loss = float('nan')
        accuracy = float('nan')

    print(f"{type} loss: {epoch_loss}")
    print(f"{type} accuracy: {accuracy}")

    return predictions, references


In [21]:
# NOTE(review): `a` is not defined in any cell shown here; it looks like a
# tokenizer output left over from a removed cell - confirm before re-running.
# Decode each input id on its own to show how the text was split into word
# pieces. The loop variable is not called `id`, so the builtin stays usable in
# later cells; the earlier whole-sequence decode was dropped because its result
# was never displayed or stored.
for token_id in a['input_ids'][0]:
    print(tokenizer.decode(token_id))

[CLS]
i
have
a
headache
going
fe
##rium
[SEP]
