<a href="https://colab.research.google.com/github/hushee69/biobert-relation-extraction/blob/main/gad_dataset_with_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# datasets link: https://drive.google.com/open?id=1-jDKGcXREb2X9xTFnuiJ36PvsqoyHWcw

!pip install transformers



In [2]:
import numpy as np;
import pandas as pd;

import torch;

from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig;
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler;
from transformers import get_linear_schedule_with_warmup;

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score;

In [3]:
from google.colab import drive;

drive.mount('/content/gdrive');

Mounted at /content/gdrive


In [4]:
path='/content/gdrive/MyDrive/biobert_re/';

In [5]:
device = torch.device('cuda');

SEED = 42;

torch.manual_seed(SEED);
torch.backends.cudnn.deterministic = True;

In [6]:
"""
    Params:
        filepath: path of the dataset
        tokenizer: tokenizer to use
        maxlen: maxlength of text
"""
def load_and_process_gad_train_data(filepath, tokenizer, maxlen=512, train_percentage=0.7):
    # load dataset
    df = pd.read_csv(filepath, header=None, delimiter='\t', names=['sentence', 'label']);

    sentences = df.sentence.values;
    labels = df.label.values;
    
    input_ids = [];
    attention_masks = [];
    
    for sent in sentences:
        encoded_dict = tokenizer.encode_plus(
            sent,
            add_special_tokens=True,
            max_length=maxlen,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        );

        input_ids.append(encoded_dict['input_ids']);
        attention_masks.append(encoded_dict['attention_mask']);

    # convert lists into tensors
    input_ids = torch.cat(input_ids, dim=0);
    attention_masks = torch.cat(attention_masks, dim=0);
    labels_tensor = torch.tensor(labels);

    dataset = TensorDataset(input_ids, attention_masks, labels_tensor);

    train_size = int(train_percentage * len(dataset));
    val_size = len(dataset) - train_size;

    train_dataset, val_dataset = random_split(dataset, [train_size, val_size]);

    return (train_dataset, val_dataset);

In [11]:
def train_gad_model(model, data, optimizer=AdamW, batch_size=32, epochs=3):
    max_val_loss = np.float('inf');

    train_ds = data[0];
    val_ds = data[1];

    train_dataloader = DataLoader(
        train_ds,
        sampler=RandomSampler(train_ds),
        batch_size=batch_size
    );

    val_dataloader = DataLoader(
        val_ds,
        sampler=SequentialSampler(val_ds),
        batch_size=batch_size
    );

    for e in range(epochs):
        train_loss = 0;
        train_acc = 0;

        model.train();

        optim = optimizer(model.parameters(), lr=2e-5, eps=1e-8);

        scheduler = get_linear_schedule_with_warmup(
            optim,
            num_warmup_steps=0,
            num_training_steps=len(train_dataloader) * epochs
        );

        for batch in train_dataloader:
            b_input_ids = batch[0].to(device);
            b_input_mask = batch[1].to(device);
            b_labels = batch[2].to(device);

            model.zero_grad();

            output = model(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask,
                labels=b_labels
            );

            loss = output['loss'];
            preds = output['logits'].detach().cpu().numpy();
            labels = b_labels.to('cpu').numpy();
            preds = np.argmax(preds, axis=1);

            train_loss += loss.item();
            train_acc += accuracy_score(labels, preds);

            loss.backward();

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0);

            optim.step();

            scheduler.step();
        
        avg_train_loss = train_loss / len(train_dataloader);
        avg_train_acc = train_acc / len(train_dataloader);

        print('average training loss for epoch: {}'.format(avg_train_loss));
        print('average training accuracy for epoch: {}'.format(avg_train_acc));

        # validation
        val_loss = 0;
        val_acc = 0;

        model.eval();

        for batch in val_dataloader:
            b_input_ids = batch[0].to(device);
            b_attention_mask = batch[1].to(device);
            b_labels = batch[2].to(device);

            with torch.no_grad():
                output = model(
                    b_input_ids,
                    token_type_ids=None,
                    attention_mask=b_attention_mask,
                    labels=b_labels
                );
            
            loss = output['loss'];
            preds = output['logits'].detach().cpu().numpy();
            labels = b_labels.to('cpu').numpy();
            preds = np.argmax(preds, axis=1);

            val_loss += loss.item();
            val_acc += accuracy_score(labels, preds);
        
        avg_val_loss = val_loss / len(val_dataloader);
        avg_val_acc = val_acc / len(val_dataloader);

        if avg_val_loss < max_val_loss:
            max_val_loss = avg_val_loss;
            torch.save(model.state_dict(), 'best_model.pt');

        print('average validation loss for epoch: {}'.format(avg_val_loss));
        print('average validation accuracy for epoch: {}'.format(avg_val_acc));

In [12]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased');

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False
);

model.cuda();

epochs = 2;

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [13]:
for i in range(10):
    data = load_and_process_gad_train_data(path + 'GAD/' + str(i + 1) + '/train.tsv', tokenizer=tokenizer, maxlen=128);
    train_gad_model(model, data);

average training loss for epoch: 0.6302057779970623
average training accuracy for epoch: 0.6412356321839081
average validation loss for epoch: 0.6741531868775685
average validation accuracy for epoch: 0.6504928315412186
average training loss for epoch: 0.5262822040489742
average training accuracy for epoch: 0.751539408866995
average validation loss for epoch: 0.5387268020047082
average validation accuracy for epoch: 0.7413754480286738
average training loss for epoch: 0.4455633171967098
average training accuracy for epoch: 0.8008620689655173
average validation loss for epoch: 0.5501131024625566
average validation accuracy for epoch: 0.7470206093189964
average training loss for epoch: 0.44360336647147225
average training accuracy for epoch: 0.803745894909688
average validation loss for epoch: 0.3960328121980031
average validation accuracy for epoch: 0.8263888888888888
average training loss for epoch: 0.3742846606742768
average training accuracy for epoch: 0.8443144499178983
average valid

In [14]:
"""
    Params:
        filepath: path of the dataset
        tokenizer: tokenizer to use
        maxlen: maxlength of text
"""
def load_and_process_gad_test_data(filepath, tokenizer, maxlen=512):
    # load dataset
    df = pd.read_csv(filepath, delimiter='\t', header=None, names=['wrong_index', 'sentence', 'label']);

    sentences = df.sentence.values;
    labels = df.label.values;
    
    input_ids = [];
    attention_masks = [];
    
    for sent in sentences:
        encoded_dict = tokenizer.encode_plus(
            sent,
            add_special_tokens=True,
            max_length=maxlen,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        );

        input_ids.append(encoded_dict['input_ids']);
        attention_masks.append(encoded_dict['attention_mask']);

    # convert lists into tensors
    input_ids = torch.cat(input_ids, dim=0);
    attention_masks = torch.cat(attention_masks, dim=0);
    labels_tensor = torch.tensor(labels);

    dataset = TensorDataset(input_ids, attention_masks, labels_tensor);

    return dataset;

In [15]:
def test_euadr_model(model, data, batch_size=32):
    model.load_state_dict(torch.load('best_model.pt'));

    model.eval();

    ds = data;

    test_dataloader = DataLoader(
        ds,
        sampler=SequentialSampler(ds),
        batch_size=batch_size,
    );

    preds_list, real_labels_list = [], [];

    for batch_nbr, batch in enumerate(test_dataloader):
        b_input_ids = batch[0].to(device);
        b_input_mask = batch[1].to(device);
        b_labels = batch[2].to(device);

        with torch.no_grad():
            outputs = model(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask
            );
        
        logits = outputs['logits'].detach().cpu().numpy();
        label_ids = b_labels.to('cpu').numpy();
        preds = np.argmax(logits, axis=1);

        preds_list.append(preds);
        real_labels_list.append(label_ids);

        print('finished batch {}'.format(batch_nbr));
    
    ret = (
        preds_list,
        real_labels_list
    );
    
    return ret;

In [18]:
# load test_accumulated.tsv file
avg_test_accuracy = 0;

test_data = load_and_process_gad_test_data(path + 'GAD/test_accumulated.tsv', tokenizer=tokenizer);

ret = test_euadr_model(model, test_data);

finished batch 0
finished batch 1
finished batch 2
finished batch 3
finished batch 4
finished batch 5
finished batch 6
finished batch 7
finished batch 8
finished batch 9
finished batch 10
finished batch 11
finished batch 12
finished batch 13
finished batch 14
finished batch 15
finished batch 16
finished batch 17
finished batch 18
finished batch 19
finished batch 20
finished batch 21
finished batch 22
finished batch 23
finished batch 24
finished batch 25
finished batch 26
finished batch 27
finished batch 28
finished batch 29
finished batch 30
finished batch 31
finished batch 32
finished batch 33
finished batch 34
finished batch 35
finished batch 36
finished batch 37
finished batch 38
finished batch 39
finished batch 40
finished batch 41
finished batch 42
finished batch 43
finished batch 44
finished batch 45
finished batch 46
finished batch 47
finished batch 48
finished batch 49
finished batch 50
finished batch 51
finished batch 52
finished batch 53
finished batch 54
finished batch 55
fi

In [19]:
preds = np.concatenate(ret[0]);
real = np.concatenate(ret[1]);

print('test predictions: {}'.format(preds));
print('real values: {}'.format(real));

test_acc = accuracy_score(real, preds);
test_prec = precision_score(real, preds);
test_rec = recall_score(real, preds);
test_f1 = f1_score(real, preds);

print('test accuracy: {}'.format(test_acc));
print('test precision: {}'.format(test_prec));
print('test recall: {}'.format(test_rec));
print('test f1: {}'.format(test_f1));

test predictions: [1 1 1 ... 0 0 0]
real values: [1 1 1 ... 0 0 0]
test accuracy: 0.9928705440900563
test precision: 0.9971212666426772
test recall: 0.9892895394501964
test f1: 0.9931899641577061
