In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, ConcatDataset
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModelForTokenClassification
from tqdm import tqdm
from sklearn.metrics import classification_report

train_chem_path = r'/kaggle/input/bc5cdr-chem/train.tsv'
test_chem_path = r'/kaggle/input/bc5cdr-chem/test.tsv'
val_chem_path = r'/kaggle/input/bc5cdr-chem/devel.tsv'

train_disease_path = r'/kaggle/input/ncbi-disease/train.tsv'
test_disease_path = r'/kaggle/input/ncbi-disease/test.tsv'
val_disease_path = r'/kaggle/input/ncbi-disease/devel.tsv'

def load_data(file_path):
    try:
        data = pd.read_csv(file_path, delimiter='\t', header=None, names=["tokens", "labels"])
        print(f"Loaded data from {file_path} with shape {data.shape}")
        return data
    except Exception as e:
        print(f"Error loading data from {file_path}: {e}")
        raise

def tokenize_and_preserve_labels(sentence, text_labels, tokenizer, label_to_id, max_len):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):
        if not isinstance(word, str) or not isinstance(label, str):
            continue  # Skip non-string entries

        tokenized_word = tokenizer.tokenize(word)
        tokenized_sentence.extend(tokenized_word)
        labels.extend([label] + [-100] * (len(tokenized_word) - 1))

    if len(tokenized_sentence) > max_len - 2:
        tokenized_sentence = tokenized_sentence[:max_len - 2]
        labels = labels[:max_len - 2]

    tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"]
    labels = [-100] + labels + [-100]
    input_ids = tokenizer.convert_tokens_to_ids(tokenized_sentence)
    attention_mask = [1] * len(input_ids)
    label_ids = [label_to_id.get(label, -100) for label in labels]

    while len(input_ids) < max_len:
        input_ids.append(0)
        attention_mask.append(0)
        label_ids.append(-100)

    return input_ids, label_ids, attention_mask

def process_dataset_with_special_tokens(dataset_path, tokenizer, label_to_id, max_length=128):
    data = load_data(dataset_path)
    input_ids = []
    tag_ids = []
    attention_masks = []

    grouped_data = data.groupby((data['tokens'] == '.').cumsum()).apply(lambda x: (x['tokens'].tolist(), x['labels'].tolist()))
    for sentence, labels in grouped_data:
        try:
            ids, label_ids, mask = tokenize_and_preserve_labels(sentence, labels, tokenizer, label_to_id, max_length)
            input_ids.append(ids)
            tag_ids.append(label_ids)
            attention_masks.append(mask)
        except Exception as e:
            print(f"Error processing sentence: {sentence}")
            print(f"Labels: {labels}")
            print(f"Exception: {e}")
            continue  # Skip problematic sentences

    return input_ids, tag_ids, attention_masks

class NERDataset(Dataset):
    def __init__(self, input_ids, tag_ids, attention_masks):
        self.input_ids = input_ids
        self.tag_ids = tag_ids
        self.attention_masks = attention_masks

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
            'tag_ids': torch.tensor(self.tag_ids[idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long)
        }

class NERModel(nn.Module):
    def __init__(self, num_tags):
        super(NERModel, self).__init__()
        self.bert = AutoModelForTokenClassification.from_pretrained("allenai/scibert_scivocab_uncased", num_labels=num_tags)

    def forward(self, input_ids, attention_mask, tags=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask, labels=tags)
        return outputs.loss if tags is not None else outputs.logits

tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

label_to_id = {'B': 0, 'I': 1, 'O': 2}



config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

In [2]:
# Process train dataset for chemicals
train_chem_input_ids, train_chem_tag_ids, train_chem_attention_masks = process_dataset_with_special_tokens(train_chem_path, tokenizer, label_to_id, max_length=128)
train_chem_dataset = NERDataset(train_chem_input_ids, train_chem_tag_ids, train_chem_attention_masks)
train_chem_dataloader = DataLoader(train_chem_dataset, batch_size=4, shuffle=True)

# Process train dataset for NCBI diseases
train_disease_input_ids, train_disease_tag_ids, train_disease_attention_masks = process_dataset_with_special_tokens(train_disease_path, tokenizer, label_to_id, max_length=128)
train_disease_dataset = NERDataset(train_disease_input_ids, train_disease_tag_ids, train_disease_attention_masks)
train_disease_dataloader = DataLoader(train_disease_dataset, batch_size=4, shuffle=True)

# Combine chemical and NCBI disease datasets for training
combined_train_dataset = ConcatDataset([train_chem_dataset, train_disease_dataset])
combined_train_dataloader = DataLoader(combined_train_dataset, batch_size=4, shuffle=True)

# Process test dataset for chemicals
test_chem_input_ids, test_chem_tag_ids, test_chem_attention_masks = process_dataset_with_special_tokens(test_chem_path, tokenizer, label_to_id, max_length=128)
test_chem_dataset = NERDataset(test_chem_input_ids, test_chem_tag_ids, test_chem_attention_masks)
test_chem_dataloader = DataLoader(test_chem_dataset, batch_size=4, shuffle=False)

# Process test dataset for NCBI diseases
test_disease_input_ids, test_disease_tag_ids, test_disease_attention_masks = process_dataset_with_special_tokens(test_disease_path, tokenizer, label_to_id, max_length=128)
test_disease_dataset = NERDataset(test_disease_input_ids, test_disease_tag_ids, test_disease_attention_masks)
test_disease_dataloader = DataLoader(test_disease_dataset, batch_size=4, shuffle=False)

# Process validation dataset for chemicals
val_chem_input_ids, val_chem_tag_ids, val_chem_attention_masks = process_dataset_with_special_tokens(val_chem_path, tokenizer, label_to_id, max_length=128)
val_chem_dataset = NERDataset(val_chem_input_ids, val_chem_tag_ids, val_chem_attention_masks)
val_chem_dataloader = DataLoader(val_chem_dataset, batch_size=4, shuffle=False)

# Process validation dataset for NCBI diseases
val_disease_input_ids, val_disease_tag_ids, val_disease_attention_masks = process_dataset_with_special_tokens(val_disease_path, tokenizer, label_to_id, max_length=128)
val_disease_dataset = NERDataset(val_disease_input_ids, val_disease_tag_ids, val_disease_attention_masks)
val_disease_dataloader = DataLoader(val_disease_dataset, batch_size=4, shuffle=False)

# Initialize the model and optimizer
model = NERModel(num_tags=len(label_to_id))
optimizer = optim.Adam(model.parameters(), lr=5e-5)

Loaded data from /kaggle/input/bc5cdr-chem/train.tsv with shape (122729, 2)
Loaded data from /kaggle/input/ncbi-disease/train.tsv with shape (135615, 2)
Loaded data from /kaggle/input/bc5cdr-chem/test.tsv with shape (124676, 2)
Loaded data from /kaggle/input/ncbi-disease/test.tsv with shape (24488, 2)
Loaded data from /kaggle/input/bc5cdr-chem/devel.tsv with shape (117391, 2)
Loaded data from /kaggle/input/ncbi-disease/devel.tsv with shape (23959, 2)


pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of BertForTokenClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
def train_model(model, train_dataloader, optimizer, num_epochs=3, device='cpu'):
    model.train()
    model.to(device)
    for epoch in range(num_epochs):
        total_loss = 0
        progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")
        for batch in progress_bar:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            tags = batch['tag_ids'].long().to(device)
            loss = model(input_ids, attention_mask, tags)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            progress_bar.set_postfix(loss=total_loss/len(train_dataloader))
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_dataloader)}")
def evaluate_model(model, dataloader, device='cpu'):
    model.eval()
    model.to(device)
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            tags = batch['tag_ids'].long().to(device)

            logits = model(input_ids, attention_mask)
            loss_fct = nn.CrossEntropyLoss()
            active_loss = attention_mask.view(-1) == 1
            active_logits = logits.view(-1, logits.shape[-1])
            active_labels = torch.where(
                active_loss, tags.view(-1), torch.tensor(loss_fct.ignore_index).type_as(tags)
            )
            loss = loss_fct(active_logits, active_labels)

            total_loss += loss.item()

            preds = torch.argmax(logits, dim=2).cpu().numpy()
            labels = tags.cpu().numpy()

            all_preds.extend(preds)
            all_labels.extend(labels)

    avg_loss = total_loss / len(dataloader)
    return avg_loss, all_preds, all_labels

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_model(model, combined_train_dataloader, optimizer, num_epochs=3, device=device)

test_loss, test_preds, test_labels = evaluate_model(model, test_chem_dataloader, device=device)
val_loss, val_preds, val_labels = evaluate_model(model, val_chem_dataloader, device=device)

print(f"Test Loss: {test_loss}")
print(f"Validation Loss: {val_loss}")

Epoch 1/3: 100%|██████████| 3025/3025 [05:36<00:00,  8.99it/s, loss=0.0697]


Epoch 1/3, Loss: 0.06967389219148279


Epoch 2/3:  30%|██▉       | 899/3025 [01:42<04:00,  8.84it/s, loss=0.00984] IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

Epoch 3/3: 100%|██████████| 3025/3025 [05:44<00:00,  8.79it/s, loss=0.0275]


Epoch 3/3, Loss: 0.027530311551511488


Evaluating: 100%|██████████| 1623/1623 [00:53<00:00, 30.42it/s]
Evaluating: 100%|██████████| 1480/1480 [00:48<00:00, 30.37it/s]

Test Loss: 0.061801336536311306
Validation Loss: 0.05586650152900138





In [5]:
def generate_classification_report(labels, preds, label_to_id):
    id_to_label = {id: label for label, id in label_to_id.items()}
    labels_flat = []
    preds_flat = []

    for label_seq, pred_seq in zip(labels, preds):
        for label, pred in zip(label_seq, pred_seq):
            if label != -100:  # Ignore padding tokens
                labels_flat.append(id_to_label[label])
                preds_flat.append(id_to_label[pred])

    report = classification_report(labels_flat, preds_flat, digits=4)
    return report

test_report = generate_classification_report(test_labels, test_preds, label_to_id)
val_report = generate_classification_report(val_labels, val_preds, label_to_id)

print("Test Classification Report:\n", test_report)
print("Validation Classification Report:\n", val_report)

Test Classification Report:
               precision    recall  f1-score   support

           B     0.8985    0.8806    0.8895      5378
           I     0.7118    0.8145    0.7597      1628
           O     0.9930    0.9919    0.9925    117654

    accuracy                         0.9848    124660
   macro avg     0.8678    0.8957    0.8805    124660
weighted avg     0.9853    0.9848    0.9850    124660

Validation Classification Report:
               precision    recall  f1-score   support

           B     0.9120    0.8978    0.9048      5342
           I     0.7753    0.8251    0.7994      1744
           O     0.9931    0.9928    0.9930    110128

    accuracy                         0.9860    117214
   macro avg     0.8935    0.9052    0.8991    117214
weighted avg     0.9862    0.9860    0.9861    117214



In [6]:
import os

save_directory = "saved_scibert_model"

if not os.path.exists(save_directory):
    os.makedirs(save_directory)

model_save_path = os.path.join(save_directory, "pytorch_model.bin")
torch.save(model.state_dict(), model_save_path)

tokenizer_save_path = os.path.join(save_directory, "tokenizer")
tokenizer.save_pretrained(tokenizer_save_path)

print(f"Model saved to {model_save_path}")
print(f"Tokenizer saved to {tokenizer_save_path}")

Model saved to saved_scibert_model/pytorch_model.bin
Tokenizer saved to saved_scibert_model/tokenizer


In [7]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, ConcatDataset
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModelForTokenClassification
from tqdm import tqdm
from sklearn.metrics import classification_report

# Define paths for the datasets
train_chem_path = r'/kaggle/input/bc5cdr-chem/train.tsv'
test_chem_path = r'/kaggle/input/bc5cdr-chem/test.tsv'
val_chem_path = r'/kaggle/input/bc5cdr-chem/devel.tsv'

train_disease_path = r'/kaggle/input/ncbi-disease/train.tsv'
test_disease_path = r'/kaggle/input/ncbi-disease/test.tsv'
val_disease_path = r'/kaggle/input/ncbi-disease/devel.tsv'

def load_data(file_path):
    try:
        data = pd.read_csv(file_path, delimiter='\t', header=None, names=["tokens", "labels"])
        print(f"Loaded data from {file_path} with shape {data.shape}")
        return data
    except Exception as e:
        print(f"Error loading data from {file_path}: {e}")
        raise

def tokenize_and_preserve_labels(sentence, text_labels, tokenizer, label_to_id, max_len):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):
        if not isinstance(word, str) or not isinstance(label, str):
            continue  # Skip non-string entries

        tokenized_word = tokenizer.tokenize(word)
        tokenized_sentence.extend(tokenized_word)
        labels.extend([label] + [-100] * (len(tokenized_word) - 1))

    if len(tokenized_sentence) > max_len - 2:
        tokenized_sentence = tokenized_sentence[:max_len - 2]
        labels = labels[:max_len - 2]

    tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"]
    labels = [-100] + labels + [-100]
    input_ids = tokenizer.convert_tokens_to_ids(tokenized_sentence)
    attention_mask = [1] * len(input_ids)
    label_ids = [label_to_id.get(label, -100) for label in labels]

    while len(input_ids) < max_len:
        input_ids.append(0)
        attention_mask.append(0)
        label_ids.append(-100)

    return input_ids, label_ids, attention_mask

def process_dataset_with_special_tokens(dataset_path, tokenizer, label_to_id, max_length=128):
    data = load_data(dataset_path)
    input_ids = []
    tag_ids = []
    attention_masks = []

    grouped_data = data.groupby((data['tokens'] == '.').cumsum()).apply(lambda x: (x['tokens'].tolist(), x['labels'].tolist()))
    for sentence, labels in grouped_data:
        try:
            ids, label_ids, mask = tokenize_and_preserve_labels(sentence, labels, tokenizer, label_to_id, max_length)
            input_ids.append(ids)
            tag_ids.append(label_ids)
            attention_masks.append(mask)
        except Exception as e:
            print(f"Error processing sentence: {sentence}")
            print(f"Labels: {labels}")
            print(f"Exception: {e}")
            continue  # Skip problematic sentences

    return input_ids, tag_ids, attention_masks

class NERDataset(Dataset):
    def __init__(self, input_ids, tag_ids, attention_masks):
        self.input_ids = input_ids
        self.tag_ids = tag_ids
        self.attention_masks = attention_masks

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
            'tag_ids': torch.tensor(self.tag_ids[idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long)
        }

class NERModel(nn.Module):
    def __init__(self, num_tags):
        super(NERModel, self).__init__()
        self.bert = AutoModelForTokenClassification.from_pretrained("dmis-lab/biobert-base-cased-v1.1", num_labels=num_tags)

    def forward(self, input_ids, attention_mask, tags=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask, labels=tags)
        return outputs.loss if tags is not None else outputs.logits

tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

label_to_id = {'B': 0, 'I': 1, 'O': 2}



config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

In [8]:
# Process train dataset for chemicals
train_chem_input_ids, train_chem_tag_ids, train_chem_attention_masks = process_dataset_with_special_tokens(train_chem_path, tokenizer, label_to_id, max_length=128)
train_chem_dataset = NERDataset(train_chem_input_ids, train_chem_tag_ids, train_chem_attention_masks)
train_chem_dataloader = DataLoader(train_chem_dataset, batch_size=4, shuffle=True)

# Process train dataset for NCBI diseases
train_disease_input_ids, train_disease_tag_ids, train_disease_attention_masks = process_dataset_with_special_tokens(train_disease_path, tokenizer, label_to_id, max_length=128)
train_disease_dataset = NERDataset(train_disease_input_ids, train_disease_tag_ids, train_disease_attention_masks)
train_disease_dataloader = DataLoader(train_disease_dataset, batch_size=4, shuffle=True)

# Combine chemical and NCBI disease datasets for training
combined_train_dataset = ConcatDataset([train_chem_dataset, train_disease_dataset])
combined_train_dataloader = DataLoader(combined_train_dataset, batch_size=4, shuffle=True)

# Process test dataset for chemicals
test_chem_input_ids, test_chem_tag_ids, test_chem_attention_masks = process_dataset_with_special_tokens(test_chem_path, tokenizer, label_to_id, max_length=128)
test_chem_dataset = NERDataset(test_chem_input_ids, test_chem_tag_ids, test_chem_attention_masks)
test_chem_dataloader = DataLoader(test_chem_dataset, batch_size=4, shuffle=False)

# Process test dataset for NCBI diseases
test_disease_input_ids, test_disease_tag_ids, test_disease_attention_masks = process_dataset_with_special_tokens(test_disease_path, tokenizer, label_to_id, max_length=128)
test_disease_dataset = NERDataset(test_disease_input_ids, test_disease_tag_ids, test_disease_attention_masks)
test_disease_dataloader = DataLoader(test_disease_dataset, batch_size=4, shuffle=False)

# Process validation dataset for chemicals
val_chem_input_ids, val_chem_tag_ids, val_chem_attention_masks = process_dataset_with_special_tokens(val_chem_path, tokenizer, label_to_id, max_length=128)
val_chem_dataset = NERDataset(val_chem_input_ids, val_chem_tag_ids, val_chem_attention_masks)
val_chem_dataloader = DataLoader(val_chem_dataset, batch_size=4, shuffle=False)

# Process validation dataset for NCBI diseases
val_disease_input_ids, val_disease_tag_ids, val_disease_attention_masks = process_dataset_with_special_tokens(val_disease_path, tokenizer, label_to_id, max_length=128)
val_disease_dataset = NERDataset(val_disease_input_ids, val_disease_tag_ids, val_disease_attention_masks)
val_disease_dataloader = DataLoader(val_disease_dataset, batch_size=4, shuffle=False)

# Initialize the model and optimizer
model = NERModel(num_tags=len(label_to_id))
optimizer = optim.Adam(model.parameters(), lr=5e-5)

Loaded data from /kaggle/input/bc5cdr-chem/train.tsv with shape (122729, 2)
Loaded data from /kaggle/input/ncbi-disease/train.tsv with shape (135615, 2)
Loaded data from /kaggle/input/bc5cdr-chem/test.tsv with shape (124676, 2)
Loaded data from /kaggle/input/ncbi-disease/test.tsv with shape (24488, 2)
Loaded data from /kaggle/input/bc5cdr-chem/devel.tsv with shape (117391, 2)
Loaded data from /kaggle/input/ncbi-disease/devel.tsv with shape (23959, 2)


pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
def train_model(model, train_dataloader, optimizer, num_epochs=3, device='cpu'):
    model.train()
    model.to(device)
    for epoch in range(num_epochs):
        total_loss = 0
        progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")
        for batch in progress_bar:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            tags = batch['tag_ids'].long().to(device)
            loss = model(input_ids, attention_mask, tags)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            progress_bar.set_postfix(loss=total_loss/len(train_dataloader))
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_dataloader)}")
def evaluate_model(model, dataloader, device='cpu'):
    model.eval()
    model.to(device)
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            tags = batch['tag_ids'].long().to(device)

            logits = model(input_ids, attention_mask)
            loss_fct = nn.CrossEntropyLoss()
            active_loss = attention_mask.view(-1) == 1
            active_logits = logits.view(-1, logits.shape[-1])
            active_labels = torch.where(
                active_loss, tags.view(-1), torch.tensor(loss_fct.ignore_index).type_as(tags)
            )
            loss = loss_fct(active_logits, active_labels)

            total_loss += loss.item()

            preds = torch.argmax(logits, dim=2).cpu().numpy()
            labels = tags.cpu().numpy()

            all_preds.extend(preds)
            all_labels.extend(labels)

    avg_loss = total_loss / len(dataloader)
    return avg_loss, all_preds, all_labels

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_model(model, combined_train_dataloader, optimizer, num_epochs=3, device=device)

test_loss, test_preds, test_labels = evaluate_model(model, test_chem_dataloader, device=device)
val_loss, val_preds, val_labels = evaluate_model(model, val_chem_dataloader, device=device)

print(f"Test Loss: {test_loss}")
print(f"Validation Loss: {val_loss}")

Epoch 1/3: 100%|██████████| 3025/3025 [05:43<00:00,  8.81it/s, loss=0.0646]


Epoch 1/3, Loss: 0.06458271463942321


Epoch 2/3: 100%|██████████| 3025/3025 [05:43<00:00,  8.81it/s, loss=0.0292] 


Epoch 2/3, Loss: 0.029223973463361117


Epoch 3/3: 100%|██████████| 3025/3025 [05:43<00:00,  8.81it/s, loss=0.0203] 


Epoch 3/3, Loss: 0.020258636973925274


Evaluating: 100%|██████████| 1623/1623 [00:53<00:00, 30.30it/s]
Evaluating: 100%|██████████| 1480/1480 [00:48<00:00, 30.31it/s]

Test Loss: 0.04753487426515764
Validation Loss: 0.0516041841521975





In [11]:
def generate_classification_report(labels, preds, label_to_id):
    id_to_label = {id: label for label, id in label_to_id.items()}
    labels_flat = []
    preds_flat = []

    for label_seq, pred_seq in zip(labels, preds):
        for label, pred in zip(label_seq, pred_seq):
            if label != -100:  # Ignore padding tokens
                labels_flat.append(id_to_label[label])
                preds_flat.append(id_to_label[pred])

    report = classification_report(labels_flat, preds_flat, digits=4)
    return report

test_report = generate_classification_report(test_labels, test_preds, label_to_id)
val_report = generate_classification_report(val_labels, val_preds, label_to_id)

print("Test Classification Report:\n", test_report)
print("Validation Classification Report:\n", val_report)

Test Classification Report:
               precision    recall  f1-score   support

           B     0.8878    0.9248    0.9060      5375
           I     0.7226    0.8704    0.7896      1628
           O     0.9956    0.9909    0.9932    117601

    accuracy                         0.9865    124604
   macro avg     0.8687    0.9287    0.8963    124604
weighted avg     0.9874    0.9865    0.9868    124604

Validation Classification Report:
               precision    recall  f1-score   support

           B     0.8954    0.9251    0.9100      5341
           I     0.7529    0.8576    0.8018      1734
           O     0.9947    0.9909    0.9928    110050

    accuracy                         0.9859    117125
   macro avg     0.8810    0.9245    0.9015    117125
weighted avg     0.9866    0.9859    0.9862    117125



In [12]:
import os

# Directory to save the model and tokenizer
save_directory = "saved_biobert_model"

if not os.path.exists(save_directory):
    os.makedirs(save_directory)

# Save the model
model_save_path = os.path.join(save_directory, "pytorch_model.bin")
torch.save(model.state_dict(), model_save_path)

# Save the tokenizer
tokenizer_save_path = os.path.join(save_directory, "tokenizer")
tokenizer.save_pretrained(tokenizer_save_path)

print(f"Model saved to {model_save_path}")
print(f"Tokenizer saved to {tokenizer_save_path}")

Model saved to saved_biobert_model/pytorch_model.bin
Tokenizer saved to saved_biobert_model/tokenizer


In [13]:
import torch
from transformers import BertTokenizer, BertForTokenClassification

# Load the tokenizer
biobert_tokenizer = BertTokenizer.from_pretrained("saved_biobert_model/tokenizer")

# Load the model
biobert_model = BertForTokenClassification.from_pretrained('dmis-lab/biobert-v1.1', num_labels=3)

# Load the saved state dict
state_dict = torch.load("saved_biobert_model/pytorch_model.bin")
biobert_model.load_state_dict(state_dict, strict=False)

# Move the model to the device (GPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
biobert_model.to(device)

print("Both models loaded successfully.")

config.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Both models loaded successfully.


In [22]:
import torch
from transformers import AutoModel, AutoTokenizer
import torch.nn as nn

# Load models and tokenizer
scibert_model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
biobert_model = AutoModel.from_pretrained('dmis-lab/biobert-base-cased-v1.1')
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')

# Define the combined classifier class
class CombinedClassifier(nn.Module):
    def __init__(self, scibert_hidden_size, biobert_hidden_size, num_labels):
        super(CombinedClassifier, self).__init__()
        self.classifier = nn.Linear(scibert_hidden_size + biobert_hidden_size, num_labels)

    def forward(self, scibert_outputs, biobert_outputs):
        combined_features = torch.cat((scibert_outputs, biobert_outputs), dim=-1)
        logits = self.classifier(combined_features)
        return logits

# Number of labels for the classifier
num_labels = 3
classifier = CombinedClassifier(scibert_model.config.hidden_size, biobert_model.config.hidden_size, num_labels)

# Input text
text = "The patient was diagnosed with type 2 diabetes mellitus and prescribed metformin"
entities = [(27, 54, 'DISEASE'), (70, 79, 'DRUG')]  # Example entities with (start, end, label)
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512, return_offsets_mapping=True)
offset_mapping = inputs.pop("offset_mapping")

tokens_output = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze().tolist())

# Initialize labels with 'O'
token_labels = ['O'] * len(tokens_output)

# Assign 'B' and 'I' labels based on entities
for entity_start, entity_end, _ in entities:
    inside_entity = False
    for i, (offset_start, offset_end) in enumerate(offset_mapping.squeeze().tolist()):
        if offset_start == 0 and offset_end == 0:  # Ignore special tokens
            continue
        if offset_start >= entity_start and offset_end <= entity_end:
            if not inside_entity:
                token_labels[i] = 'B'
                inside_entity = True
            else:
                token_labels[i] = 'I'
        elif inside_entity:
            break

print("Tokens       =", tokens_output)
print("Final Labels =", token_labels)

Tokens       = ['[CLS]', 'the', 'patient', 'was', 'diagnosed', 'with', 'type', '2', 'diabetes', 'mellitus', 'and', 'prescribed', 'metformin', '[SEP]']
Final Labels = ['O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'O', 'O', 'O', 'O', 'O']
