In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import torch
import nltk
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import *
from transformers import RobertaTokenizer, RobertaForTokenClassification, AlbertTokenizer
from transformers import AutoTokenizer, AutoModel
import transformers
from transformers import get_linear_schedule_with_warmup
from transformers import BertForTokenClassification, AdamW, AlbertForTokenClassification, DistilBertForTokenClassification, DistilBertTokenizer
from seqeval.metrics import f1_score, accuracy_score
from seqeval.metrics import classification_report
import os
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm
2023-02-04 15:02:02.932642: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-04 15:02:03.595292: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.2/lib64:
2023-02-04 15:02:03.595364: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.2/lib64:


In [2]:
data = pd.read_csv("../data/labeled_dfs_all.csv")

In [3]:
#data.columns = ['paper_id',"word", "tag", "sentence_id"]
#data['word'] = data['word'].astype(str)
#data = data[data['tag'].isin(['O', 'B-software', 'B-version', 'I-version','I-software'])]

# note: above original, all the other token/labels pairs are removed, while these 
# other labels should be changed to 'O' to avoid the tokens to be removed too
# removing all publisher and url labels simplifies the problem with less ambiguity
# below we keep tokens for publishers and URLs with 'O' label, as well ref markers

data.columns = ['paper_id',"word", "tag", "sentence_id"]
data['word'] = data['word'].astype(str)
# replace 'B-publisher', 'B-url', 'I-publisher','I-url' and reference marker labels by 'O'
data['tag'] = data['tag'].replace(['B-publisher', 'B-url', 'B-bibr', 'B-table', 'B-figure', 'B-formula', 'I-publisher', 'I-url', 'I-bibr', 'I-table', 'I-figure', 'I-formula'], 'O')
data = data[data['tag'].isin(['O', 'B-software', 'B-version', 'I-version','I-software'])]


In [4]:
data.shape

(372641, 4)

In [5]:
class SentenceGetter(object):

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["word"].values.tolist(),
                                                     s["tag"].values.tolist())]
        self.grouped = self.data.groupby("sentence_id").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [6]:
getter = SentenceGetter(data)

In [7]:
sentences = [[word[0] for word in sentence] for sentence in getter.sentences]

In [8]:
labels = [[s[1] for s in sentence] for sentence in getter.sentences]

In [9]:
tag_values = list(set(data["tag"].values))
tag_values.append("PAD")
tag2idx = {t: i for i, t in enumerate(tag_values)}

In [10]:
tag2idx

{'B-version': 0,
 'B-software': 1,
 'O': 2,
 'I-software': 3,
 'I-version': 4,
 'PAD': 5}

In [11]:
tag2name={tag2idx[key] : key for key in tag2idx.keys()}

### Finetune SciBERT

In [12]:
MAX_LEN = 215
bs = 32

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [14]:
tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_cased', do_lower_case=False)
model = BertForTokenClassification.from_pretrained('allenai/scibert_scivocab_cased', num_labels=len(tag2idx))
model.to('cuda')

loading file vocab.txt from cache at /home/lopez/.cache/huggingface/hub/models--allenai--scibert_scivocab_cased/snapshots/ddf0be025f8e432a1870e34811997ba6725bf04a/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading configuration file config.json from cache at /home/lopez/.cache/huggingface/hub/models--allenai--scibert_scivocab_cased/snapshots/ddf0be025f8e432a1870e34811997ba6725bf04a/config.json
Model config BertConfig {
  "_name_or_path": "allenai/scibert_scivocab_cased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "ab

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31116, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [15]:
def tokenize_label_sentence(sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)
        tokenized_sentence.extend(tokenized_word)
        # below originally, it should extend labels under the BIO scheme, so extend tokens 
        # replacing B-label by I-label (otherwise we have B- everywhere at each token and wrong 
        # entity scores)
        #labels.extend([label] * n_subwords)
        # fixing the error 
        labels.extend([label])
        if n_subwords>0:
            for i in range(0, n_subwords-1):
                labels.extend([label.replace("B-", "I-")])
    return tokenized_sentence, labels

In [16]:
tokenized_label_text = [
    tokenize_label_sentence(sent, labs)
    for sent, labs in zip(sentences, labels)
]

In [17]:
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_label_text]
labels = [token_label_pair[1] for token_label_pair in tokenized_label_text]

In [18]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

In [19]:
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")

In [20]:
attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

In [21]:
tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags, 
                                                            random_state=2018, test_size=0.1)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)

In [22]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

In [23]:
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

In [24]:
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)



In [25]:
epochs = 5
max_grad_norm = 1.0

total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

### Training

In [26]:
loss_values, validation_loss_values = [], []

for _ in trange(epochs, desc="Epoch"):
    model.train()
    total_loss = 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        model.zero_grad()
        outputs = model(b_input_ids, 
                        attention_mask=b_input_mask, 
                        labels=b_labels)
        loss = outputs[0]
        loss.backward()
        total_loss += loss.item()
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        optimizer.step()
        scheduler.step()
        
    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))
    loss_values.append(avg_train_loss)

    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            outputs = model(b_input_ids,
                            attention_mask=b_input_mask, 
                            labels=b_labels)
            
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        eval_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

    eval_loss = eval_loss / len(valid_dataloader)
    validation_loss_values.append(eval_loss)
    print("Validation loss: {}".format(eval_loss))
    pred_tags = [tag_values[p_i] for p, l in zip(predictions, true_labels)
                                 for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
    valid_tags = [tag_values[l_i] for l in true_labels
                                  for l_i in l if tag_values[l_i] != "PAD"]
    print("Validation Accuracy: {}".format(accuracy_score(pred_tags, valid_tags)))
    print("Validation F1-Score: {}".format(f1_score(pred_tags, valid_tags)))
    print()

Epoch:   0%|                                                                | 0/5 [00:00<?, ?it/s]

Average train loss: 0.018487649510033673


Epoch:  20%|███████████                                            | 1/5 [05:42<22:50, 342.56s/it]

Validation loss: 0.003313174086030234
Validation Accuracy: 0.9920336486276942
Validation F1-Score: 0.7627573858549687

Average train loss: 0.002266914839165478


Epoch:  40%|██████████████████████                                 | 2/5 [11:25<17:07, 342.51s/it]

Validation loss: 0.002618698397153904
Validation Accuracy: 0.9941269964335556
Validation F1-Score: 0.8248175182481753

Average train loss: 0.0011990576849914053


Epoch:  60%|█████████████████████████████████                      | 3/5 [17:07<11:25, 342.53s/it]

Validation loss: 0.00277330827124057
Validation Accuracy: 0.9945534191347496
Validation F1-Score: 0.8317757009345794

Average train loss: 0.0007047377218965046


Epoch:  80%|████████████████████████████████████████████           | 4/5 [22:50<05:42, 342.50s/it]

Validation loss: 0.002865833206702849
Validation Accuracy: 0.9949023104357265
Validation F1-Score: 0.8410351201478744

Average train loss: 0.0004427463623061259


Epoch: 100%|███████████████████████████████████████████████████████| 5/5 [28:32<00:00, 342.51s/it]

Validation loss: 0.002916993784244617
Validation Accuracy: 0.9951930531865405
Validation F1-Score: 0.8534080298786181






### Save the model

In [27]:
model_out_address = '../models/scibert_software_sent'

if not os.path.exists(model_out_address):
    os.makedirs(model_out_address) 

In [28]:
model_to_save = model.module if hasattr(model, 'module') else model 

In [29]:
output_model_file = os.path.join(model_out_address, "pytorch_model.bin")
output_config_file = os.path.join(model_out_address, "config.json")

In [30]:
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(model_out_address)

('../models/scibert_software_sent/vocab.txt',)

### Performace metrics

In [31]:
model.eval();

In [32]:
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
y_true = []
y_pred = []

print("***** Running evaluation *****")
print("  Num examples ={}".format(len(val_inputs)))
print("  Batch size = {}".format(bs))
for step, batch in enumerate(valid_dataloader):
    batch = tuple(t.to(device) for t in batch)
    input_ids, input_mask, label_ids = batch
    with torch.no_grad():
        outputs = model(input_ids,
                        attention_mask=input_mask,)
        logits = outputs[0]
    logits = torch.argmax(F.log_softmax(logits,dim=2),dim=2)
    logits = logits.detach().cpu().numpy()
    label_ids = label_ids.to('cpu').numpy()
    input_mask = input_mask.to('cpu').numpy()
    for i,mask in enumerate(input_mask):
        temp_1 = []
        temp_2 = []
        
        for j, m in enumerate(mask):
            if m:
                if tag2name[label_ids[i][j]] != "X" and tag2name[label_ids[i][j]] != "[CLS]" and tag2name[label_ids[i][j]] != "[SEP]" : 
                    temp_1.append(tag2name[label_ids[i][j]])
                    temp_2.append(tag2name[logits[i][j]])
            else:
                break
        y_true.append(temp_1)
        y_pred.append(temp_2)

report = classification_report(y_true, y_pred,digits=4)

***** Running evaluation *****
  Num examples =1760
  Batch size = 32


In [33]:
print("F1 score: %f"%(f1_score(y_true, y_pred)))
print(report)

F1 score: 0.853408
           precision    recall  f1-score   support

 software     0.8169    0.8614    0.8386       404
  version     0.8934    0.9160    0.9046       119

micro avg     0.8339    0.8738    0.8534       523
macro avg     0.8343    0.8738    0.8536       523

