＜The overall flow＞

STEP1: Set hyperparameters, import libraries

STEP2: Define each function

STEP3: Create definition extraction model, divide D_process

STEP4: Assign D_process variable-variable definition pairs to J-type Templates

STEP5: fine-tune BERT model

STEP6: Load the fine-tuned model and perform definition extraction


-------------------------------------------------------------------

STEP1: Set hyperparameters, import libraries

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
MODEL_NAME="microsoft/deberta-v3-large"
target_token="代"

In [3]:
import os
SEED = 1234 # Random seed value to ensure that the value does not shift between runs.
MAX_LENGTH = 128 # Length of input to BERT (can be up to 512). If the input text exceeds the input length, discard the entire length e.g. [32, 128, 256].

LEARNING_RATE = 1e-5 # Optimiser learning rate e.g. [5e-4, 1e-5, 1e-6].
EPOCH = 3 # Number of times to turn learning e.g. [8, 16].
BATCH_SIZE = 8 # Batch size when learning e.g. [8, 32].
EVAL_BATCH_SIZE = 1 # Batch size during prediction
DO_2ND_FINETUNING=True
DO_3RD_FINETUNING=True

max_length = 512

OUTPUT_DIR='/content/drive/MyDrive/definition_extraction/result/'+(MODEL_NAME.replace("/","-"))

import datetime
date=str(datetime.datetime.now()).replace(" ","_").replace(":","").replace(".","")
MODEL_OUTPUT_PATH = os.path.join(OUTPUT_DIR, "pytorch_model"+date+".bin") # Destination of model export

In [4]:
MODEL_OUTPUT_PATH = os.path.join(OUTPUT_DIR, "symlink_only.bin")

In [5]:
!pip install transformers fugashi ipadic pytorch-lightning
!pip install activations



In [6]:
import itertools
import random
import json
import tqdm
import numpy as np
import unicodedata

import torch
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from transformers import TrainingArguments,Trainer
from inspect import currentframe
from torch.nn import CrossEntropyLoss
from torch.nn.modules import BCEWithLogitsLoss
import torch.nn as nn


def print_(*args):
    """print param and its name"""
    names = {id(v): k for k, v in currentframe().f_back.f_locals.items()}
    print('\n'.join([names.get(id(arg), '???') + ' = ' + repr(arg) for arg in args]))

STEP2: Define each function

In [7]:
import random
import numpy as np
import torch
# Fix random seeds

def torch_fix_seed(seed=223):
    # Python random
    random.seed(seed)
    # Numpy
    np.random.seed(seed)
    # Pytorch
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.use_deterministic_algorithms = True

torch_fix_seed()

In [8]:

import copy

def encode_for_BERT(tokenizer, text, primary,symbol, max_length,id=None):
    """ Convert a sentence into an input to BERT
    Parameters
    ----------
    text:str
      Original text
    primary:dic
      Definition
    symbol:dic
      Variables
    max_length:int
      512

    Returns
    -------
    encoding:dic
      input_ids:tensor
        Sentence that has been id'd
      token_type_ids:tensor
        Basically irrelevant ... All 0
      attention_mask:tensor
        0 for [PAD], 1 for others
      start_positions:tensor
      end_positions:tensor
        One-hot-vector indicating start and end positions
    """

    # Convert the variable you want into a special token
    symbol_start=symbol["start"]
    symbol_end=symbol["end"]
    symbol_text=symbol["text"]
    symbol_len=symbol_end-symbol_start

    text=text[:symbol_start]+target_token+text[symbol_end:]

    if primary!=[] and primary[0]!=None:
        primary_lst = sorted(primary, key=lambda x: x['start'])
        splitted = [] # Add the string after the split.
        position = 0
        for primary_num,primary in enumerate(primary_lst):
            start=copy.deepcopy(primary["start"])
            end=copy.deepcopy(primary["end"])
            # Fix for sentence length shift if the description part is after target_token
            if primary["start"]>=symbol_end:
              start+=len(target_token)-symbol_len
              end+=len(target_token)-symbol_len

            # Label 0 for anything that is not a definition
            splitted.append({'text':text[position:start], 'label':0})
            # Label for definition
            splitted.append({'text':text[start:end], 'label':primary_num+1})
            position = end
        splitted.append({'text': text[position:], 'label':0})
        splitted = [ s for s in splitted if s['text'] ] # Exclude strings of length 0

    else:
        splitted=[{'text':text, 'label':0}]

    # Tokenize and label each split string
    tokens = [] # Add tokens
    labels = [] # Add labels for the tokens
    for text_splitted in splitted:
        text = text_splitted['text'].lstrip().rstrip()
        label = text_splitted['label']
        tokens_splitted = tokenizer.tokenize(text)

        if MODEL_NAME=="microsoft/deberta-base" :
          if tokens_splitted!=[] and tokens_splitted[0]!="" and tokens_splitted[0][0]!="Ġ":
            tokens_splitted[0]= "Ġ"+ tokens_splitted[0]
        if MODEL_NAME=="microsoft/deberta-v3-base" or MODEL_NAME=="microsoft/deberta-v3-large":
          if tokens_splitted!=[] and tokens_splitted[0]!="" and tokens_splitted[0][0]!="▁":
            tokens_splitted[0]= "▁"+ tokens_splitted[0]

        labels_splitted = [label] * len(tokens_splitted)
        tokens.extend(tokens_splitted)
        labels.extend(labels_splitted)


    # Encode and put into a format that can be entered into BERT
    if MODEL_NAME=="microsoft/deberta-base" :# Remove the leading "Ġ"
      tokens[0]=tokens[0].replace("Ġ","")
    if MODEL_NAME=="microsoft/deberta-v3-base" or MODEL_NAME=="microsoft/deberta-v3-large" :# Remove the leading "_"
      tokens[0]=tokens[0].replace("▁","")

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    encoding = tokenizer.prepare_for_model(
        input_ids,
        max_length=max_length,
        padding='max_length',
        truncation=True
    ) # Convert input_ids to encoding
    # Label 0 for special tokens [CLS] and [SEP]
    labels = [0] + labels[:max_length-2] + [0]
    #  Label 0 for special tokens [PAD]
    labels = labels + [0]*( max_length - len(labels) )

    # Leave variables
    symbol_tokens=tokenizer.tokenize(symbol_text)
    symbol_ids=tokenizer.convert_tokens_to_ids(symbol_tokens)

    # Use [CLS] tokens instead of answers for data for which no definition exists
    if labels==[0]*max_length:
      encoding["start_positions"]=[1]+[0]*(max_length-1)
      encoding["end_positions"]=[1]+[0]*(max_length-1)
      return encoding

    start_positions_label=[0]* (max_length)
    end_positions_label=[0]* (max_length)
    for primary_num in range(1,max(labels)+1):
      label_index=[index for index, label in enumerate(labels) if label == primary_num]
      start_positions_label[label_index[0]]=1
      end_positions_label[label_index[-1]]=1

    encoding["start_positions"]=start_positions_label
    encoding["end_positions"]=end_positions_label

    return encoding


In [9]:
! pip install sentencepiece
from transformers import AutoTokenizer,AutoModel,AutoConfig
if MODEL_NAME=="bert-base-uncased" or MODEL_NAME=="bert-large-uncased" or MODEL_NAME=="allenai/scibert_scivocab_uncased":
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
  tokenizer.add_special_tokens({"additional_special_tokens": [target_token]})
elif MODEL_NAME=="microsoft/deberta-base" or MODEL_NAME=="microsoft/deberta-v3-base" or MODEL_NAME=="microsoft/deberta-v3-large":
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [10]:
import re
separate=[" ", "," , "[", "]" ,"|" ,"{" ,"}" ,"<",">","?" ,"." ,"/" ,"%","&","'","(",")","=","~","+","*"]

def mark_same_symbol(text,symbol,primary_lst):
  symbol_start=symbol["start"]
  symbol_end=symbol["end"]
  symbol_text=symbol["text"]
  symbol_len=symbol_end-symbol_start

  symbol_text_for_search=symbol_text.replace("(","\(").replace(")","\)").replace("{","\{").replace("}","\}").replace("[","\[").replace("]","\]").replace("+","\+").replace(".","\.").replace("-","\-").replace("?","\?").replace("^","\^").replace("*","\*").replace("|","\|")
  match_entity=re.finditer(symbol_text_for_search,text)
  symbol_span_lst=[m.span() for m in match_entity]
  text+=" "

  # Treat the original symbol as [target_token] separately
  if (symbol_start, symbol_end) in symbol_span_lst:
    symbol_span_lst.remove((symbol_start, symbol_end))
  # Make only independent words, don't remove appel, Σa_i etc. when you want to remove "a"
  symbol_span_lst=[span for span in symbol_span_lst if (text[span[0]-1] in separate) and (text[span[1]] in separate)]
  if None in primary_lst:
    primary_lst.remove(None)
  # Do not hang on to the definition point
  if primary_lst!=[]:
    symbol_span_lst=[span for span in symbol_span_lst if not(primary_lst[0]["start"] <=span[0] and span[0] <= primary_lst[0]["end"] ) ]


  text_splitted=""
  pos_now=0
  for span in symbol_span_lst:
    text_splitted+=text[pos_now:span[0]]
    text_splitted+=target_token
    pos_now=span[1]

    if symbol["start"]>span[1]:
      symbol["start"]+=len(target_token)-symbol_len
      symbol["end"]+=len(target_token)-symbol_len
    for primary in primary_lst:
      if primary["start"]>span[1]:
        primary["start"]+=len(target_token)-symbol_len
        primary["end"]+=len(target_token)-symbol_len
  text_splitted+=text[pos_now:]

  return text_splitted,symbol,primary_lst

In [11]:
def create_dataset(tokenizer, dataset, max_length,for_train=True):
    """
    Format the dataset into a form that can be input into a data loader
    """
    dataset_for_loader = []
    for data_example in dataset:
      for entity in data_example["entity"]:
        text = data_example['text']
        primary=entity["PRIMARY"]
        symbol=entity["SYMBOL"]

        text,symbol,primary=mark_same_symbol(text,symbol,primary)

        if for_train:
          encoding = encode_for_BERT(tokenizer,text, primary,symbol,max_length)
        else:
          encoding = encode_for_BERT(tokenizer,text, primary,symbol,max_length,id)

        encoding = { k: torch.tensor(v) for k, v in encoding.items() }
        dataset_for_loader.append(encoding)
    return dataset_for_loader


STEP3: Create of definition extraction model, divide D_process

In [12]:
# DeBERTa-v3
class BertDefinitionExtraction(nn.Module):
  def __init__(self, num_labels=1):
      super().__init__()
      self.config = AutoConfig.from_pretrained(MODEL_NAME) # Load a file with pre-trained BERT settings
      self.bert=AutoModel.from_pretrained(MODEL_NAME)
      self.start_output = nn.Linear(self.config.hidden_size, 1)
      self.end_output = nn.Linear(self.config.hidden_size, 1)

  def forward(
      self,
      input_ids,
      token_type_ids=None,
      attention_mask=None,
      # label=None,
      start_positions=None,
      end_positions=None,
      symbol=None,
      id=None
    ):
      outputs = self.bert(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids
      ) # Input token IDs etc. to BERT and obtain outputs

      sequence_outputs = outputs[0] # Extract only the final output vector of BERT

      self.start_logits = self.start_output(sequence_outputs) # Convert vectors to vectors of class number dimensions
      self.end_logits = self.end_output(sequence_outputs)


      loss_fct = CrossEntropyLoss()
      # loss=loss_fct(logits.view(-1),label.view(-1).float())
      if start_positions is not None and end_positions is not None:
        start_loss = loss_fct(self.start_logits.view(-1), start_positions.view(-1).float())
        end_loss = loss_fct(self.end_logits.view(-1), end_positions.view(-1).float())
        self.loss=(start_loss+end_loss)/2
        return self.loss.sum()
      else :
        return self

In [13]:
model =BertDefinitionExtraction()
model.to("cuda")

pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

BertDefinitionExtraction(
  (bert): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 1024, padding_idx=0)
      (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-23): 24 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (key_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (value_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=T

In [14]:
from torch.optim import Adam

optimizer = Adam(model.parameters(), lr=LEARNING_RATE) # Pass model.parameters() for updating model parameters

In [15]:
def fine_tuning(dataloader_train,dataloader_val):
  best_loss = None
  for epoch in tqdm.notebook.tqdm(range(EPOCH), desc="Epoch"): # Repeat EPOCH times
    model.train() # Put the model in training mode
    for batch in tqdm.notebook.tqdm(dataloader_train, desc="Training"): #  Extract a batch from the training data loader
      batch={ k: v.cuda() for k, v in batch.items() }
      loss = model(**batch) # Get the predictions of the model and the error calculated inside the model

      loss.backward() # Error back propagation
      optimizer.step() # Update model parameters based on gradients
      #scheduler.step() # Reduce the learning rate of the optimizer

      optimizer.zero_grad() # Reset the gradient as it is not used after parameter update

    model.eval() #  Put the model in evaluation mode
    dev_outputs, dev_labels = [], []
    for batch in tqdm.notebook.tqdm(dataloader_val, desc="Evaluating"): # Extract the batch from the development data loader
      batch={ k: v.cuda() for k, v in batch.items() }
      with torch.no_grad(): # Since no learning is performed, reduce costs and increase speed by omitting calculations that are only relevant for learning
        outputs = model(**batch) # make predictions of the model results
      loss = outputs.cpu() # Send the model results to the CPU since they are still on the GPU

    print(f"Epoch {epoch+1} : Dev loss", loss)
    if best_loss is None or loss <= best_loss: # First time evaluating or updating the best score
      best_loss = loss # Update best score

      #  The following is a template for saving the model
      model_to_save = model.module if hasattr(model, "module") else model
      torch.save(model_to_save.state_dict(), MODEL_OUTPUT_PATH)
  return

In [16]:
process_name_dic={"crystallization":"CRYST",
           "CSTR":"CSTR",
           "Biodiesel":"BD",
           "Czochralski":"CZ",
           "shell_and_tube_exchanger":"STHE",
           "All":"process"
           }

accuracy_each_process={}
recall_each_process={}
precision_each_process={}
F1_each_process={}
for process_name in process_name_dic.keys():
  accuracy_each_process[process_name]=[]
  recall_each_process[process_name]=[]
  precision_each_process[process_name]=[]
  F1_each_process[process_name]=[]

In [17]:
seed=1 #Divide the D_process. Experiments were conducted in 10 different ways, seed=1~10.

In [18]:

with open("/content/drive/MyDrive/definition_extraction/dataset/process/no_ordered_selected_process_separated_by_paper.json", 'r') as f:
  process_papers_dataset=json.load(f)

process_dataset_train=[]
process_dataset_val=[]
process_dataset_test={}

for process_name,specific_process_papers in process_papers_dataset.items():
  process_dataset_test[process_name]=[]
  if process_name=="shell_and_tube_exchanger":
    test_papers_num=2
  else:
    test_papers_num=3

  random.seed(seed)
  specific_process_papers=list(specific_process_papers.values())
  indices = [i for i in range(len(specific_process_papers))]
  randomized_indices = random.sample(indices, len(indices))
  print(randomized_indices[:test_papers_num])
  print(randomized_indices[test_papers_num:len(indices)-1])
  print(randomized_indices[-1])
  test_papers = [data for i,paper in enumerate(specific_process_papers) for data in paper if i in randomized_indices[:test_papers_num]]
  train_papers = [data for i,paper in enumerate(specific_process_papers) for data in paper if i in randomized_indices[test_papers_num:len(indices)-1]]
  val_papers= [data for i,paper in enumerate(specific_process_papers) for data in paper if i == randomized_indices[-1]]




   #no exclude
  print(process_name)
  process_dataset_train.extend(train_papers)
  process_dataset_val.extend(val_papers)

  # exclude
  #if process_name !="crystallization":
   # print(process_name)
   # process_dataset_train.extend(train_papers)
   # process_dataset_val.extend(val_papers)
 # else:
   # print("exclude")


  process_dataset_test[process_name].extend(test_papers)


random.seed(seed)# No need to change here as it is a shuffle within val,train
dataset_val_for_loader = create_dataset(
    tokenizer, process_dataset_val, max_length
)
dataset_train_for_loader = create_dataset(
    tokenizer, process_dataset_train, max_length
)

# Create a data loader
process_dataloader_train = DataLoader(
    dataset_train_for_loader, batch_size=8, shuffle=True,num_workers=8
)
process_dataloader_val = DataLoader(dataset_val_for_loader, batch_size=1,num_workers=8)

process_dataloader_test={}
for process_name,specific_dataset_test in process_dataset_test.items():
  dataset_test_for_loader= create_dataset(
      tokenizer, specific_dataset_test, max_length,for_train=False
  )
  process_dataloader_test[process_name] = DataLoader(dataset_test_for_loader, batch_size=1,num_workers=8,shuffle=False)


[2, 1, 4]
[0, 3, 5, 7, 9, 8]
6
Biodiesel
[2, 9, 1]
[4, 0, 3, 5, 7, 10, 8]
6
crystallization
[1, 4]
[0, 2, 5, 6]
3
shell_and_tube_exchanger
[2, 1, 8]
[0, 3, 4, 7, 6]
5
Czochralski
[2, 1, 4]
[0, 3, 5, 7, 9, 8]
6
CSTR


STEP4: Assign D_process variable-variable definition pairs to J-type Templates

In [None]:
J = 300  #Specify number of templates, J = 20, 100, 300

In [20]:

# Load the templates file
with open("/content/drive/MyDrive/definition_extraction/dataset/templates_"+J+".json", 'r') as f:
    templates = json.load(f)

combined_train_val_dataset = process_dataset_train + process_dataset_val

# Store variable-variable definition pairs in the training data of D_process
process_pairs_with_def = []

for paper in combined_train_val_dataset:
    for entity in paper['entity']:
        symbol_text = entity['SYMBOL']['text'] if 'SYMBOL' in entity else None
        primary_text = entity['PRIMARY'][0]['text'] if 'PRIMARY' in entity and entity['PRIMARY'] and entity['PRIMARY'][0] else None

        if primary_text:
            process_pairs_with_def.append({'dtrm_expr': symbol_text, 'dsc_txt': primary_text})


# Create a list of unused dtrm_expr and dsc_txt
unused_definitions = [{"dtrm_expr": item["dtrm_expr"], "dsc_txt": item["dsc_txt"]} for item in process_pairs_with_def]

# Initialize template indexes
template_index = 0

# List to store the generated sentences and the associated variables and definition pairs
generated_sentence = []

# 1. Prepare a list to store dtrm_expr and dsc_txt that are not yet used
unused_dtrm_expr = [item["dtrm_expr"] for item in process_pairs_with_def]
unused_dsc_txt = [item["dsc_txt"] for item in process_pairs_with_def]

# 2. Select from the templates in sequence to generate the text
while unused_dtrm_expr:
    template = templates[template_index]
    primary_placeholders = [word for word in template.split() if word.startswith("[primary_")]
    symbol_placeholders = [word for word in template.split() if word.startswith("[symbol_")]

    # 3. If there is more than one "[symbol_", assign enough unused dtrm_expr (variable symbol) different from each other to fill all "[symbol_"
    symbol_replacements = {}
    for i, placeholder in enumerate(symbol_placeholders):
        if unused_dtrm_expr:
            replacement = unused_dtrm_expr.pop(0)
            symbol_replacements[f"[symbol_{i + 1}]"] = replacement

    # 4. Check the "[primary_]" in that template
    primary_replacements = {}
    for i, placeholder in enumerate(primary_placeholders):
        if "[primary_" in template:
            # If "[primary_" is present, assign unused dsc_txt
            if unused_dsc_txt:
                replacement = unused_dsc_txt.pop(0)
                primary_replacements[f"[primary_{i + 1}]"] = replacement

    for key, value in symbol_replacements.items():
        template = template.replace(key, value)
    for key, value in primary_replacements.items():
        template = template.replace(key, value)

    # 5. Add the generated statements and data to the list
    generated_sentence.append({
        "text": template,
        "primary": primary_replacements,
        "symbol": symbol_replacements
    })

    # 6. Update the index of templates and return to the beginning when used up
    template_index = (template_index + 1) % len(templates)

generated_data = []

for item in generated_sentence:
    output_item = {
        "text": item["text"],
        "primary": {key.strip(","): value for key, value in item["primary"].items()},
        "symbol": {key.strip(","): value for key, value in item["symbol"].items()}
    }
    generated_data.append(output_item)

In [21]:
labeled_data = []

# Process for each data element
for item in generated_data:
    symbol_data = item["symbol"]
    primary_data = item.get("primary", {})  # Set default value to an empty dictionary in case PRIMARY does not exist

    # Initialize list with symbols and primary labels
    entity_data = []

    # Set symbol data
    symbol_entities = []
    for symbol_key, symbol_value in symbol_data.items():
        pattern = r'(?<=\s)' + re.escape(symbol_value)  # スペースの後にマッチ
        match = re.search(pattern, item["text"])
        if match:
            start_index = match.start()
            end_index = match.end()
            symbol_entity = {
                "label": "SYMBOL",
                "start": start_index,
                "end": end_index,
                "text": symbol_value
            }
            symbol_entities.append({"SYMBOL": symbol_entity, "PRIMARY": []})

    # Set primary data
    if primary_data:
        primary_values = list(primary_data.values())
        for i, primary_value in enumerate(primary_values):
            pattern = r'(?<=\s)' + re.escape(primary_value)  # スペースの後にマッチ
            match = re.search(pattern, item["text"])
            if match:
                start_index = match.start()
                end_index = match.end()
                primary_entity = {
                    "label": "PRIMARY",
                    "start": start_index,
                    "end": end_index,
                    "text": primary_value
                }
                if i < len(symbol_entities):
                    symbol_entities[i]["PRIMARY"].append(primary_entity)

    # Create a new data structure and add "entity"
    new_item = {
        "text": item["text"],
        "entity": symbol_entities
    }

    labeled_data.append(new_item)

from sklearn.model_selection import train_test_split

# Split data into train, val and test
temp_train_data, temp_val_data = train_test_split(labeled_data, test_size=0.25, random_state=42)

In [22]:
dataset_train_for_loader = create_dataset(
    tokenizer, temp_train_data, max_length
)
dataset_val_for_loader = create_dataset(
    tokenizer, temp_val_data, max_length
)

# Create data loaders
temp_dataloader_train = DataLoader(
    dataset_train_for_loader, batch_size=8, shuffle=True,num_workers=8
)
temp_dataloader_val = DataLoader(dataset_val_for_loader, batch_size=1,num_workers=8)

STEP5: fine-tune BERT model

In [23]:
MODEL_PATH = os.path.join(OUTPUT_DIR, "symlink_only.bin")

MODEL_OUTPUT_PATH=os.path.join(OUTPUT_DIR, "3step_Pro_"+J+"_no_ordered/symlink_to_Pro_random"+str(seed)+"_"+J+"_no_ordered.bin")
#MODEL_OUTPUT_PATH=os.path.join(OUTPUT_DIR, "3step_Pro_"+J+"_no_ordered_exclude/symlink_to_Pro_"+J+"_no_oredered_without_BD_random"+str(seed)+".bin")
#MODEL_OUTPUT_PATH=os.path.join(OUTPUT_DIR, "3step_Pro_"+J+"_withdef/symlink_to_Pro_random"+str(seed)+"_"+J+"_withdef.bin")

In [24]:
# Load the model
if DO_2ND_FINETUNING:
  state_dict = torch.load(MODEL_PATH)
  if hasattr(model, "module"):
    model.module.load_state_dict(state_dict)
  else:
    model.load_state_dict(state_dict)
  model.to("cuda")
  from torch.optim import Adam

  optimizer = Adam(model.parameters(), lr=LEARNING_RATE)

In [25]:
if DO_2ND_FINETUNING:
  fine_tuning(temp_dataloader_train,temp_dataloader_val)

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Training:   0%|          | 0/55 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/150 [00:00<?, ?it/s]

Epoch 1 : Dev loss tensor(0.0026)


Training:   0%|          | 0/55 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/150 [00:00<?, ?it/s]

Epoch 2 : Dev loss tensor(0.0020)


Training:   0%|          | 0/55 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/150 [00:00<?, ?it/s]

Epoch 3 : Dev loss tensor(0.0009)


In [28]:
#no exclude
MODEL_PATH = os.path.join(OUTPUT_DIR, "3step_Pro_"+J+"_no_ordered/symlink_to_Pro_random"+str(seed)+"_"+J+"_no_ordered.bin")
MODEL_OUTPUT_PATH=os.path.join(OUTPUT_DIR, "3step_Pro_"+J+"_no_ordered/3step_Pro_random"+str(seed)+"_"+J+"_no_ordered.bin")

# exclude
#MODEL_PATH=os.path.join(OUTPUT_DIR, "3step_Pro_"+J+"_no_ordered_exclude/symlink_to_Pro_"+J+"_no_oredered_without_BD_random"+str(seed)+".bin")
#MODEL_OUTPUT_PATH=os.path.join(OUTPUT_DIR, "3step_Pro_"+J+"_no_ordered_exclude/3step_Pro_"+J+"_no_oredered_without_BD_random"+str(seed)+".bin")

# only withdef
#MODEL_PATH = os.path.join(OUTPUT_DIR, "3step_Pro_"+J+"_withdef/symlink_to_Pro_random"+str(seed)+"_"+J+"_withdef.bin")
#MODEL_OUTPUT_PATH=os.path.join(OUTPUT_DIR, "3step_Pro_"+J+"_withdef/3step_Pro_random"+str(seed)+"_"+J+"_withdef.bin")

In [29]:
# Load the model
if DO_3RD_FINETUNING:
  state_dict = torch.load(MODEL_PATH)
  if hasattr(model, "module"):
    model.module.load_state_dict(state_dict)
  else:
    model.load_state_dict(state_dict)
  model.to("cuda")
  from torch.optim import Adam
  #from torch.optim.lr_scheduler import LinearLR

  optimizer = Adam(model.parameters(), lr=LEARNING_RATE) # Pass model.parameters() for updating model parameters

In [30]:
if DO_3RD_FINETUNING:
  fine_tuning(process_dataloader_train,process_dataloader_val)

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Training:   0%|          | 0/94 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/114 [00:00<?, ?it/s]

Epoch 1 : Dev loss tensor(0.0100)


Training:   0%|          | 0/94 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/114 [00:00<?, ?it/s]

Epoch 2 : Dev loss tensor(0.0012)


Training:   0%|          | 0/94 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/114 [00:00<?, ?it/s]

Epoch 3 : Dev loss tensor(0.0007)


STEP6: Load the fine-tuned model and perform definition extraction

In [31]:
import numpy as np
def my_index2(l, x, default=0):
    return l.index(x) if x in l else default

def softmax(position_candidate):
    score=np.array([candi[0] for candi in position_candidate])
    exp_score = np.exp(score) # 分子
    sum_exp_score = np.sum(exp_score) # 分母
    soft_max_score = exp_score / sum_exp_score
    for i,candi in enumerate(position_candidate):
      candi[0]=soft_max_score[i]
    return position_candidate
def predict(encoding):
    """
      Function for Named Entity Recognition in BERT.
    """
    # Encode
    # encoding, start_positions,end_positions = tokenizer.encode_for_predict(
    #     text,primary,symbol,max_length, return_tensors='pt'
    # )
    # encoding = { k: torch.tensor(v) for k, v in encoding.items() }
    start_positions=encoding["start_positions"]
    end_positions=encoding["end_positions"]
    pos_target_token=my_index2(encoding["input_ids"].tolist()[0],2)
    del encoding["start_positions"],encoding["end_positions"]
    encoding = { k: v.cuda() for k, v in encoding.items() }


    # Calculate the predictive value of the label
    with torch.no_grad():
        output = bert_de(**encoding)
        start_logits=output.start_logits[0].cpu().view(-1)
        end_logits=output.end_logits[0].cpu().view(-1)

        # start_logits=torch.softmax(start_logits, dim=-1)
        # end_logits=torch.softmax(end_logits, dim=-1)



        # #1
        # start_position_predicted = start_logits[0].cpu().argmax()
        # end_position_predicted = end_logits[0].cpu().argmax()
        #2
        # start_position_predicted = start_logits[0].cpu().argmax()
        # end_position_predicted = end_logits[0].cpu()[start_position_predicted:].argmax()+start_position_predicted

        #3
        start_position_candidate = torch.topk(start_logits,5)
        end_position_candidate = torch.topk(end_logits,5)
        # position_candidate=[Score,start_position,end_position]

        position_candidate=[[start_position_candidate[0][i]+end_position_candidate[0][j],start_position_candidate[1][i],end_position_candidate[1][j]] for i in range(5) for j in range(5)]

        position_candidate=[[float(candidate[0]),int(candidate[1]),int(candidate[2])] for candidate in position_candidate ]
        cls_candidate=[float(start_logits[0]+end_logits[0]),0,0]
        if cls_candidate not in position_candidate:
          position_candidate.append(cls_candidate)

        # Candidate if end is after start
        position_candidate=[candidate if candidate[1]<=candidate[2] else[-100,candidate[1],candidate[2]] for candidate in position_candidate ]
        # except when only either start or end points to [CLS]
        position_candidate=[candidate if (candidate[1]==0 and candidate[2]==0) or(candidate[1]!=0 and candidate[2]!=0) else[-100,candidate[1],candidate[2]] for candidate in position_candidate ]
        # except when start and end are too far apart
        position_candidate=[candidate for candidate in position_candidate if candidate[2]-candidate[1]<30]
        # except if target_token is included in the definition
        position_candidate=[candidate if not(candidate[1]<=pos_target_token and pos_target_token<=candidate[2]) else[-100,candidate[1],candidate[2]] for candidate in position_candidate ]

        position_candidate.sort(reverse=True)
        # x_1=position_candidate[0][0]
        # x_2=position_candidate[1][0]
        # x_3=position_candidate[2][0]
        # x=[x_1,x_2,x_3]
        # # position_candidate=softmax(position_candidate)
        # for candidate in position_candidate:
        #   if candidate[1]==0 and candidate[2]==0:
        #     cls_score=candidate[0]



        start_position_predicted,end_position_predicted=position_candidate[0][1:]

    start_positions_lst=start_positions.tolist()[0]
    start= [i for i, x in enumerate(start_positions_lst) if x == 1]
    end_positions_lst=end_positions.tolist()[0]
    end= [i for i, x in enumerate(end_positions_lst) if x == 1]

    positions_predicted=[start_position_predicted,end_position_predicted]
    positions=[[start[i],end[i]] for i in range(len(start))]


    return positions_predicted,positions,position_candidate[:4]#,cls_score,x

In [32]:
# Load the three- steps fine-tuned model and place it on the GPU
state_dict = torch.load(MODEL_OUTPUT_PATH)
if hasattr(model, "module"):
  model.module.load_state_dict(state_dict)
else:
  model.load_state_dict(state_dict)
bert_de = model.cuda()

In [33]:
import re
def evaluate(mixed_matrix,record=False,process_name=None):
  accuracy=(mixed_matrix["TP"]+mixed_matrix["TN"])/(mixed_matrix["TP"]+mixed_matrix["TN"]+mixed_matrix["FP"]+mixed_matrix["FN"]+mixed_matrix["TP_error"])
  precision=(mixed_matrix["TP"])/(mixed_matrix["TP"]+mixed_matrix["FP"]+mixed_matrix["TP_error"])
  recall=(mixed_matrix["TP"])/(mixed_matrix["TP"]+mixed_matrix["FN"]+mixed_matrix["TP_error"])
  F1score=(2*precision*recall)/(precision+recall)
  recall_without_def=(mixed_matrix["TN"])/(mixed_matrix["FP"]+mixed_matrix["TN"])

  accuracy_partial=(mixed_matrix["TP_partial_match"]+mixed_matrix["TN"])/(mixed_matrix["TP"]+mixed_matrix["TN"]+mixed_matrix["FP"]+mixed_matrix["FN"]+mixed_matrix["TP_error"])
  precision_partial=(mixed_matrix["TP_partial_match"])/(mixed_matrix["TP"]+mixed_matrix["FP"]+mixed_matrix["TP_error"])
  recall_partial=(mixed_matrix["TP_partial_match"])/(mixed_matrix["TP"]+mixed_matrix["FN"]+mixed_matrix["TP_error"])
  F1score_partial=(2*precision_partial*recall_partial)/(precision_partial+recall_partial)


  print_(accuracy)
  print_(precision)
  print_(recall)
  print_(F1score)
  print_(recall_without_def)
  print("\n")
  print_(accuracy_partial)
  print_(precision_partial)
  print_(recall_partial)
  print_(F1score_partial)
  print("\n")
  if record==True:
    accuracy_lst.append(accuracy)
    precision_lst.append(precision)
    recall_lst.append(recall)
    f1_lst.append(F1score)
    accuracy_partial_lst.append(accuracy_partial)
    precision_partial_lst.append(precision_partial)
    recall_partial_lst.append(recall_partial)
    f1_partial_lst.append(F1score_partial)
  if process_name:
    accuracy_each_process[process_name].append(accuracy)
    recall_each_process[process_name].append(recall)
    precision_each_process[process_name].append(precision)
    F1_each_process[process_name].append(F1score)
  return accuracy,precision,recall,F1score,recall_without_def


# Named Entity Recognition
# Note: Although in the following the data is processed one at a time for code clarity,
# Processing time would be shorter if it were done in batches.


max_length=512
test_results=[]

mixed_matrix_sum={"TP":0,"TP_error":0,"FP":0,"TN":0,"FN":0,"TP_partial_match":0}

for process_name,dataset in process_dataloader_test.items():

  print(process_name)
  mixed_matrix={"TP":0,"TP_error":0,"FP":0,"TN":0,"FN":0,"TP_partial_match":0,"TN_candidates":0,"TP_candidates":0}

  for data_example in dataset:

      tokens=tokenizer.convert_ids_to_tokens(data_example["input_ids"][0])
      # Input to the model
      labels_predicted,labels,candidates = predict(data_example)#(text,primary,symbol,max_length, tokenizer, bert_de) # BERTで予測

      result={}
      result["predict_position"]=labels_predicted
      result["answer_position"]=labels
      result["text"]=" ".join(tokens).replace(" ##","").replace(" [PAD]","").replace(target_token,"[target_token]").replace(" ' ","'").replace("\u0120","")

      result["answer"]=[" ".join(tokens[labels[i][0]:labels[i][1]+1]).replace(" ##","").replace("\u0120","") for i in range(len(labels))]
      result["predict"]=" ".join(tokens[labels_predicted[0]:labels_predicted[1]+1]).replace(" ##","").replace(" ' ","'").replace("\u0120","")
      if result["predict"]=="":
        result["predict"]="only start token is extracted:"+tokens[labels_predicted[0]]
      result["candidates"]=[{"score":candi[0],"text":" ".join(tokens[candi[1]:candi[2]+1]).replace(" ##","").replace(" ' ","'").replace("\u0120",""),"position":[candi[1],candi[2]]}for candi in candidates]
      #result["cls_score"]=cls_score



      # If there is no definition(FP,TN)
      if labels[0][0]==0:
        result["has_definition"]=0
        result["partial_match"]=None
        if labels_predicted in labels:
          mixed_matrix["TN"]+=1
          result["is_matched"]=1
        else:
          mixed_matrix["FP"]+=1
          result["is_matched"]=0

      #If there is a definition(TP,TP_error,FN)
      else:
        result["has_definition"]=1
        if labels_predicted in labels:
          mixed_matrix["TP"]+=1
          result["is_matched"]=1
        elif labels_predicted[0]==0 and labels_predicted[1]==0:
          mixed_matrix["FN"]+=1
          result["is_matched"]=0
        else :
          mixed_matrix["TP_error"]+=1
          result["is_matched"]=0

        if (labels[0][0]-labels_predicted[1])*(labels[0][1]-labels_predicted[0])<=0:
          mixed_matrix["TP_partial_match"]+=1
          result["partial_match"]=1

        else:
          result["partial_match"]=0
          if labels_predicted[0]==0 and labels_predicted[1]==0:
            result["partial_match"]=None

      test_results.append(result)

  accuracy,precision,recall,F1score,recall_without_def=evaluate(mixed_matrix,process_name=process_name)
  for matrix_component in mixed_matrix_sum.keys():
    mixed_matrix_sum[matrix_component]+=mixed_matrix[matrix_component]

print("ALL")
accuracy,precision,recall,F1score,recall_without_def=evaluate(mixed_matrix_sum,record=False)
measurement={"accuracy":0,"precision":0,"recall":0,"accuracy_without_def":0}
measurement["accuracy"]=accuracy
measurement["precision"]=precision
measurement["recall"]=recall
measurement["accuracy_without_def"]=recall_without_def


Biodiesel
accuracy = 0.9285714285714286
precision = 0.9166666666666666
recall = 0.9166666666666666
F1score = 0.9166666666666666
recall_without_def = 1.0


accuracy_partial = 1.0
precision_partial = 1.0
recall_partial = 1.0
F1score_partial = 1.0


crystallization
accuracy = 0.7662337662337663
precision = 0.8809523809523809
recall = 0.6851851851851852
F1score = 0.7708333333333335
recall_without_def = 0.9565217391304348


accuracy_partial = 0.8051948051948052
precision_partial = 0.9523809523809523
recall_partial = 0.7407407407407407
F1score_partial = 0.8333333333333334


shell_and_tube_exchanger
accuracy = 0.7837837837837838
precision = 0.9655172413793104
recall = 0.6511627906976745
F1score = 0.7777777777777779
recall_without_def = 0.967741935483871


accuracy_partial = 0.7837837837837838
precision_partial = 0.9655172413793104
recall_partial = 0.6511627906976745
F1score_partial = 0.7777777777777779


Czochralski
accuracy = 0.86
precision = 0.9454545454545454
recall = 0.8125
F1score = 0.87

mixed_matrix["TP"] = [0, 0, 0, 3, 10, 8, 12, 18, 25, 359]

mixed_matrix["FP"] = [0, 0, 0, 4, 3, 10, 15, 10, 16, 92]

mixed_matrix["TN"] = [0, 0, 0, 0, 1, 5, 8, 6, 6, 586]

mixed_matrix["FN"] = [0, 0, 0, 1, 1, 1, 6, 2, 3, 128]

mixed_matrix["TN_candidates"] = [0, 0, 0, 4, 1, 8, 6, 8, 11, 49]

mixed_matrix["TP_candidates"] = [0, 0, 0, 7, 7, 11, 14, 14, 13, 100]

In [None]:
save=[]
errors_for_save=[result for result in test_results]#if result["is_matched"]==0]
save.append(errors_for_save)

mismatched_entries = []
for entry in save:
    for item in entry:
        is_matched = item["is_matched"]

        # Add mismatched entries to the list
        if  is_matched == 0:
            mismatched_entries.append(item)

import datetime
date=str(datetime.datetime.now()).replace(" ","_").replace(":","").replace(".","")
import json
with open("/content/drive/MyDrive/definition_extraction/result/error_examples/TPL_"+J+"_error"+date+"_random"+str(seed)+".json", 'w') as f:
    json.dump(mismatched_entries, f, indent=4)

print({len(mismatched_entries)})
