<a href="https://colab.research.google.com/github/horasan/eng_to_sql_ner/blob/main/NER_A_3_Model_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [40]:
# Third-party packages imported by later cells (transformers, datasets, seqeval).
# Uncomment on a runtime where they are not installed yet.
#!pip install transformers
#!pip install datasets
#!pip install seqeval

In [12]:
from google.colab import drive
# Mount Google Drive into the Colab runtime: the dataset is read from it and
# the label mappings / trained model are written back to it by later cells.
drive.mount('/content/drive')
# Project folder under "Colab Notebooks". FULL_PATH ends with "/" because later
# cells build file paths by concatenating a file name directly onto it.
FOLDER_PATH = "NER_for_SQL"
FULL_PATH = "/content/drive/My Drive/Colab Notebooks/" + FOLDER_PATH + "/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [39]:
# Project folder on the mounted Google Drive. FULL_PATH keeps its trailing slash
# because file paths are formed by appending a name straight onto it.
FOLDER_PATH = "NER_for_SQL"
FULL_PATH = f"/content/drive/My Drive/Colab Notebooks/{FOLDER_PATH}/"

# Input file: BIO-tagged sentences, read by load_bio_tagged_data.
bio_tagged_dataset_file_name = "synthetic_queries_300_bio_tagged.txt"

# Output files: tag <-> id mappings, written as JSON next to the dataset.
tag2id_with_cust_file_name = "tag2id_with_cust.json"
id2tag_with_cust_file_name = "id2tag_with_cust.json"

# Output directory: the fine-tuned model and its tokenizer share one folder.
trained_model_path = f"{FULL_PATH}ner-roberta-with-cust"
trained_tokenizer_path = trained_model_path

# utils

In [14]:
def load_bio_tagged_data(filepath):
    """
    Parses a BIO-tagged text file into per-sentence token and tag lists.

    Each non-blank line must hold exactly two whitespace-separated fields,
    ``token tag``. A blank (or whitespace-only) line closes the current
    sentence; consecutive blank lines do not create empty sentences.

    Args:
        filepath (str): Path to the BIO-tagged data file (read as UTF-8).

    Returns:
        List[Tuple[List[str], List[str]]]: One (tokens, tags) pair per sentence,
        in file order.

    Raises:
        ValueError: If a non-blank line does not split into exactly two fields.
    """
    collected = []
    current_tokens, current_tags = [], []

    with open(filepath, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()

            if stripped:
                fields = stripped.split()
                if len(fields) != 2:
                    raise ValueError(f"Malformed line: {stripped}")
                current_tokens.append(fields[0])
                current_tags.append(fields[1])
                continue

            # Blank line: close the sentence in progress, if there is one.
            if current_tokens:
                collected.append((current_tokens, current_tags))
                current_tokens, current_tags = [], []

    # The file may end without a trailing blank line.
    if current_tokens:
        collected.append((current_tokens, current_tags))

    return collected


In [15]:
import random

def split_train_test(data, test_size=0.2, seed=42):
    """
    Splits BIO-tagged data into shuffled train and test sets.

    The shuffle is done on a copy with a private ``random.Random(seed)``, so the
    caller's list keeps its order and the module-level ``random`` state is not
    reseeded. Calling the function repeatedly with the same arguments therefore
    always yields the same split. The permutation is the same one that
    ``random.seed(seed); random.shuffle(...)`` would produce.

    Args:
        data (list): Items to split, e.g. (tokens, tags) pairs or dicts.
        test_size (float): Proportion of data to include in the test set (0..1).
        seed (int): Random seed for reproducibility.

    Returns:
        Tuple[list, list]: (train_data, test_data); together they contain every
        input item exactly once.

    Raises:
        ValueError: If ``test_size`` is outside the range [0, 1].
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size}")

    # Work on a copy so the caller's data is not reordered as a side effect.
    shuffled = list(data)
    random.Random(seed).shuffle(shuffled)

    split_index = int(len(shuffled) * (1 - test_size))
    return shuffled[:split_index], shuffled[split_index:]


In [16]:
def prepare_dataset_for_tokenization(data):
    """
    Converts (tokens, tags) tuples into a list of dicts, one dict per sentence.

    Args:
        data (list of tuples): Each item is a (tokens, tags) pair.

    Returns:
        list of dict: Each dict has the keys "tokens" and "tags", holding the
        same list objects that were passed in (no copies are made).
    """
    records = []
    for sentence_tokens, sentence_tags in data:
        records.append({"tokens": sentence_tokens, "tags": sentence_tags})
    return records

In [17]:
def build_tag2id_from_data(data):
    """
    Derives the tag <-> id lookup tables from every tag present in a dataset.

    Ids are assigned in sorted tag order, so the same set of tags always
    produces the same mapping regardless of sentence order.

    Args:
        data (list): A list of dicts, each with a 'tags' key holding a list of tags.

    Returns:
        tag2id (dict): Mapping from tag to unique integer ID (0..n-1).
        id2tag (dict): Reverse mapping from integer ID to tag.
    """
    seen_tags = set()
    for sample in data:
        seen_tags.update(sample["tags"])

    ordered_tags = sorted(seen_tags)
    tag2id = dict(zip(ordered_tags, range(len(ordered_tags))))
    id2tag = dict(enumerate(ordered_tags))
    return tag2id, id2tag

In [18]:
from transformers import RobertaTokenizerFast

def tokenize_and_align_labels(examples, tokenizer, tag2id, max_length=128):
    """
    Tokenizes pre-split sentences and aligns word-level tags to subword tokens.

    Labelling rules, per position returned by ``word_ids()``:
      * positions with no source word (``None``) get -100;
      * the first subword of a word gets the word's own tag;
      * later subwords of the same word get the matching "I-" tag when the word
        is tagged "B-..." and that "I-" tag exists in ``tag2id``; otherwise
        they repeat the word's tag.

    Args:
        examples (list of dict): Items with "tokens" (list of words) and
            "tags" (list of tags, one per word).
        tokenizer: Callable tokenizer whose result exposes ``word_ids()`` and
            supports item assignment.
        tag2id (dict): Mapping from tag string to integer label id.
        max_length (int): Length every example is truncated/padded to.

    Returns:
        list: One tokenizer result per example, each with an added "labels"
        entry holding the aligned label ids.
    """
    encoded_examples = []

    for example in examples:
        words = example['tokens']
        word_tags = example['tags']

        encoding = tokenizer(words,
                             is_split_into_words=True,
                             truncation=True,
                             padding='max_length',
                             max_length=max_length,
                             return_offsets_mapping=True)

        aligned_ids = []
        last_word = None

        for current_word in encoding.word_ids():
            if current_word is None:
                # No source word at this position: excluded from the loss.
                aligned_ids.append(-100)
            else:
                tag = word_tags[current_word]
                # Continuation subword of a "B-" word: switch to "I-" when available.
                if current_word == last_word and tag.startswith("B-"):
                    inside_tag = "I-" + tag[2:]
                    if inside_tag in tag2id:
                        tag = inside_tag
                aligned_ids.append(tag2id[tag])

            last_word = current_word

        encoding["labels"] = aligned_ids
        encoded_examples.append(encoding)

    return encoded_examples

In [19]:
import numpy as np
from sklearn.metrics import classification_report

def compute_metrics(p):
    """
    Token-level precision/recall/F1 (weighted average) for an evaluation pass.

    Reads the notebook-level ``id2tag`` mapping to turn ids into tag strings and
    uses the ``classification_report`` imported in this cell (scikit-learn) on
    the flattened tag sequences. Positions labelled -100 are skipped.

    Note: a later cell defines another ``compute_metrics``; that later
    definition replaces this one once its cell has run.

    Args:
        p: Pair ``(predictions, labels)``; predictions are per-class scores that
           are reduced with ``argmax`` over axis 2.

    Returns:
        dict: Keys "precision", "recall" and "f1", taken from the report's
        "weighted avg" row.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    flat_predicted = []
    flat_gold = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            if gold_id == -100:
                continue
            flat_predicted.append(id2tag[predicted_id])
            flat_gold.append(id2tag[gold_id])

    report = classification_report(
        flat_gold,
        flat_predicted,
        zero_division=0,
        output_dict=True
    )
    weighted = report["weighted avg"]
    return {
        "precision": weighted["precision"],
        "recall": weighted["recall"],
        "f1": weighted["f1-score"],
    }


In [20]:
from transformers import RobertaForTokenClassification, Trainer, TrainingArguments
from seqeval.metrics import classification_report, accuracy_score, f1_score
import numpy as np

def compute_metrics(p):
    """
    Sequence-labelling metrics for an evaluation pass, computed with seqeval.

    Reads the notebook-level ``id2tag`` mapping to turn ids into tag strings.
    Positions whose gold label is -100 are dropped from both the gold and the
    predicted sequences before scoring.

    Args:
        p: Pair ``(predictions, labels)``; predictions are per-class scores that
           are reduced with ``argmax`` over axis 2.

    Returns:
        dict: "accuracy" and "f1" scores plus "report", the formatted
        classification report string (4 digits).
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Gold tag sequences, one list per sentence.
    true_labels = []
    for label_seq in gold_ids:
        true_labels.append([id2tag[label] for label in label_seq if label != -100])

    # Predicted tag sequences, kept only where the gold label is real.
    true_preds = []
    for pred_seq, label_seq in zip(predicted_ids, gold_ids):
        true_preds.append(
            [id2tag[pred] for pred, label in zip(pred_seq, label_seq) if label != -100]
        )

    # NOTE(review): "report" is a multi-line string, not a number; it is what
    # fills the wide "Report" column in the training output table.
    metrics = {}
    metrics["accuracy"] = accuracy_score(true_labels, true_preds)
    metrics["f1"] = f1_score(true_labels, true_preds)
    metrics["report"] = classification_report(true_labels, true_preds, digits=4)
    return metrics

# Prepare for training

In [21]:
# Parse the BIO-tagged file from Drive into a list of (tokens, tags) tuples,
# one per sentence. The file name comes from the configuration cell.
#bio_tagged_dataset_file_name = "synthetic_queries_300_bio_tagged.txt"
bio_tagged_dataset = load_bio_tagged_data(FULL_PATH + bio_tagged_dataset_file_name)

# Inspect the first sentence
print(bio_tagged_dataset[0])

(['Fetch', 'all', 'FX', 'trades', 'with', 'the', 'date', 'today'], ['O', 'O', 'B-DEAL_TYPE', 'O', 'O', 'O', 'O', 'B-VALUE_DATE'])


In [22]:
# Convert the (tokens, tags) tuples into {"tokens": ..., "tags": ...} dicts,
# the shape the tokenization and tag-mapping helpers read.
processed_bio_data_sen_bio_data = prepare_dataset_for_tokenization(bio_tagged_dataset)

In [23]:
# Hold out 0.5% of the sentences for evaluation (15 of 3000 in the recorded run).
# NOTE(review): with so few held-out sentences the evaluation scores are weak
# evidence of generalisation — consider a larger test_size.
train_data, test_data = split_train_test(processed_bio_data_sen_bio_data, test_size=0.005)

In [24]:
# Sanity check: report how many sentences ended up in each split.
print(f"Training examples: {len(train_data)}")
print(f"Testing examples: {len(test_data)}")

Training examples: 2985
Testing examples: 15


In [25]:
# Build the label mappings from the full dataset (not just the training split),
# so every tag that occurs anywhere has an id.
tag2id, id2tag = build_tag2id_from_data(processed_bio_data_sen_bio_data)

In [26]:
import json

# File names are defined in the configuration cell:
#tag2id_with_cust_file_name = "tag2id_with_cust.json"
#id2tag_with_cust_file_name = "id2tag_with_cust.json"

# Save mappings to the Drive project folder so they can be reloaded later.
with open(FULL_PATH + tag2id_with_cust_file_name, "w") as f:
    json.dump(tag2id, f)

# Normalise the ids to plain ints before dumping. Note that JSON object keys are
# always strings, so whoever loads this file must convert the keys back to int.
id2tag_to_save = {int(k): v for k, v in id2tag.items()}
with open(FULL_PATH + id2tag_with_cust_file_name, "w") as f:
    json.dump(id2tag_to_save, f)


In [27]:
# add_prefix_space=True is needed because tokenize_and_align_labels feeds the
# tokenizer pre-split words (is_split_into_words=True).
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base", add_prefix_space=True)
# Encode both splits; each returned item is the tokenizer output with a
# "labels" entry added.
train_tokenized = tokenize_and_align_labels(train_data, tokenizer, tag2id)
test_tokenized = tokenize_and_align_labels(test_data, tokenizer, tag2id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

## Model development

In [28]:
from datasets import Dataset

# Wrap the lists of encoded examples as Dataset objects; these are what the
# Trainer receives as train_dataset / eval_dataset.
train_dataset = Dataset.from_list(train_tokenized)
test_dataset = Dataset.from_list(test_tokenized)

In [29]:
from transformers import DataCollatorForTokenClassification

# Collator handed to the Trainer to assemble batches.
# NOTE(review): examples are already padded to max_length by
# tokenize_and_align_labels, so presumably no extra padding happens here — confirm.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [30]:
from transformers import RobertaForTokenClassification

# One output class per BIO tag found in the data.
num_labels = len(tag2id)

# Passing id2label/label2id stores the tag names in the model config, so the
# saved checkpoint is self-describing: predictions map to real tag names
# (e.g. "B-DEAL_TYPE") instead of generic "LABEL_n" placeholders, without
# needing the separate JSON mapping files at inference time.
model = RobertaForTokenClassification.from_pretrained(
    "roberta-base",
    num_labels=num_labels,
    id2label=id2tag,
    label2id=tag2id,
)


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
from transformers import TrainingArguments

# Fine-tuning configuration. Checkpoints and logs go to directories relative to
# the runtime's working directory, not to the Drive project folder.
training_args = TrainingArguments(
    output_dir="./ner-roberta-with-cust",
    # Older spelling of the same option, left disabled by the author.
    # NOTE(review): presumably renamed in the installed transformers version — confirm.
    #evaluation_strategy="epoch",
    eval_strategy = "epoch",  # evaluate at the end of every epoch
    save_strategy="epoch",  # write a checkpoint at the end of every epoch
    logging_dir="./logs-ner-roberta-with-cust",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    report_to="none"  # no external experiment tracker
)


In [32]:
from transformers import Trainer

# Wire model, data, collator and metric function together.
# compute_metrics resolves to the most recently executed definition, i.e. the
# seqeval-based one defined last among the helper cells.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # NOTE(review): the recorded output of this cell shows a truncated warning
    # pointing at this call; presumably it concerns the `tokenizer` argument —
    # check against the installed transformers version.
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [33]:
# Run fine-tuning. With eval_strategy="epoch", the compute_metrics results are
# reported after each of the 5 epochs.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Report
1,No log,0.010651,1.0,1.0,precision recall f1-score support  AMOUNT 1.0000 1.0000 1.0000 8  AMOUNT_HIGH 1.0000 1.0000 1.0000 2  AMOUNT_LOW 1.0000 1.0000 1.0000 2  APPROVE_DATE 1.0000 1.0000 1.0000 8  BUY_SELL_CODE 1.0000 1.0000 1.0000 3 COUNTERPARTY_NAME 1.0000 1.0000 1.0000 4  COUNTRY_CODE 1.0000 1.0000 1.0000 1  CREATION_DATE 1.0000 1.0000 1.0000 2  CURRENCY 1.0000 1.0000 1.0000 5  CUSTOMER_NAME 1.0000 1.0000 1.0000 3  DEALER_NAME 1.0000 1.0000 1.0000 2  DEAL_DATE 1.0000 1.0000 1.0000 2  DEAL_TYPE 1.0000 1.0000 1.0000 8  EMAIL_ADDRESS 1.0000 1.0000 1.0000 10  END_DATE 1.0000 1.0000 1.0000 1  PHONE_NUMBER 1.0000 1.0000 1.0000 1  PORTFOLIO_NAME 1.0000 1.0000 1.0000 1  PROFIT_CENTER 1.0000 1.0000 1.0000 1  START_DATE 1.0000 1.0000 1.0000 1  VALUE_DATE 1.0000 1.0000 1.0000 3  micro avg 1.0000 1.0000 1.0000 68  macro avg 1.0000 1.0000 1.0000 68  weighted avg 1.0000 1.0000 1.0000 68
2,No log,0.001688,1.0,1.0,precision recall f1-score support  AMOUNT 1.0000 1.0000 1.0000 8  AMOUNT_HIGH 1.0000 1.0000 1.0000 2  AMOUNT_LOW 1.0000 1.0000 1.0000 2  APPROVE_DATE 1.0000 1.0000 1.0000 8  BUY_SELL_CODE 1.0000 1.0000 1.0000 3 COUNTERPARTY_NAME 1.0000 1.0000 1.0000 4  COUNTRY_CODE 1.0000 1.0000 1.0000 1  CREATION_DATE 1.0000 1.0000 1.0000 2  CURRENCY 1.0000 1.0000 1.0000 5  CUSTOMER_NAME 1.0000 1.0000 1.0000 3  DEALER_NAME 1.0000 1.0000 1.0000 2  DEAL_DATE 1.0000 1.0000 1.0000 2  DEAL_TYPE 1.0000 1.0000 1.0000 8  EMAIL_ADDRESS 1.0000 1.0000 1.0000 10  END_DATE 1.0000 1.0000 1.0000 1  PHONE_NUMBER 1.0000 1.0000 1.0000 1  PORTFOLIO_NAME 1.0000 1.0000 1.0000 1  PROFIT_CENTER 1.0000 1.0000 1.0000 1  START_DATE 1.0000 1.0000 1.0000 1  VALUE_DATE 1.0000 1.0000 1.0000 3  micro avg 1.0000 1.0000 1.0000 68  macro avg 1.0000 1.0000 1.0000 68  weighted avg 1.0000 1.0000 1.0000 68
3,0.262100,0.000924,1.0,1.0,precision recall f1-score support  AMOUNT 1.0000 1.0000 1.0000 8  AMOUNT_HIGH 1.0000 1.0000 1.0000 2  AMOUNT_LOW 1.0000 1.0000 1.0000 2  APPROVE_DATE 1.0000 1.0000 1.0000 8  BUY_SELL_CODE 1.0000 1.0000 1.0000 3 COUNTERPARTY_NAME 1.0000 1.0000 1.0000 4  COUNTRY_CODE 1.0000 1.0000 1.0000 1  CREATION_DATE 1.0000 1.0000 1.0000 2  CURRENCY 1.0000 1.0000 1.0000 5  CUSTOMER_NAME 1.0000 1.0000 1.0000 3  DEALER_NAME 1.0000 1.0000 1.0000 2  DEAL_DATE 1.0000 1.0000 1.0000 2  DEAL_TYPE 1.0000 1.0000 1.0000 8  EMAIL_ADDRESS 1.0000 1.0000 1.0000 10  END_DATE 1.0000 1.0000 1.0000 1  PHONE_NUMBER 1.0000 1.0000 1.0000 1  PORTFOLIO_NAME 1.0000 1.0000 1.0000 1  PROFIT_CENTER 1.0000 1.0000 1.0000 1  START_DATE 1.0000 1.0000 1.0000 1  VALUE_DATE 1.0000 1.0000 1.0000 3  micro avg 1.0000 1.0000 1.0000 68  macro avg 1.0000 1.0000 1.0000 68  weighted avg 1.0000 1.0000 1.0000 68
4,0.262100,0.000719,1.0,1.0,precision recall f1-score support  AMOUNT 1.0000 1.0000 1.0000 8  AMOUNT_HIGH 1.0000 1.0000 1.0000 2  AMOUNT_LOW 1.0000 1.0000 1.0000 2  APPROVE_DATE 1.0000 1.0000 1.0000 8  BUY_SELL_CODE 1.0000 1.0000 1.0000 3 COUNTERPARTY_NAME 1.0000 1.0000 1.0000 4  COUNTRY_CODE 1.0000 1.0000 1.0000 1  CREATION_DATE 1.0000 1.0000 1.0000 2  CURRENCY 1.0000 1.0000 1.0000 5  CUSTOMER_NAME 1.0000 1.0000 1.0000 3  DEALER_NAME 1.0000 1.0000 1.0000 2  DEAL_DATE 1.0000 1.0000 1.0000 2  DEAL_TYPE 1.0000 1.0000 1.0000 8  EMAIL_ADDRESS 1.0000 1.0000 1.0000 10  END_DATE 1.0000 1.0000 1.0000 1  PHONE_NUMBER 1.0000 1.0000 1.0000 1  PORTFOLIO_NAME 1.0000 1.0000 1.0000 1  PROFIT_CENTER 1.0000 1.0000 1.0000 1  START_DATE 1.0000 1.0000 1.0000 1  VALUE_DATE 1.0000 1.0000 1.0000 3  micro avg 1.0000 1.0000 1.0000 68  macro avg 1.0000 1.0000 1.0000 68  weighted avg 1.0000 1.0000 1.0000 68
5,0.262100,0.000655,1.0,1.0,precision recall f1-score support  AMOUNT 1.0000 1.0000 1.0000 8  AMOUNT_HIGH 1.0000 1.0000 1.0000 2  AMOUNT_LOW 1.0000 1.0000 1.0000 2  APPROVE_DATE 1.0000 1.0000 1.0000 8  BUY_SELL_CODE 1.0000 1.0000 1.0000 3 COUNTERPARTY_NAME 1.0000 1.0000 1.0000 4  COUNTRY_CODE 1.0000 1.0000 1.0000 1  CREATION_DATE 1.0000 1.0000 1.0000 2  CURRENCY 1.0000 1.0000 1.0000 5  CUSTOMER_NAME 1.0000 1.0000 1.0000 3  DEALER_NAME 1.0000 1.0000 1.0000 2  DEAL_DATE 1.0000 1.0000 1.0000 2  DEAL_TYPE 1.0000 1.0000 1.0000 8  EMAIL_ADDRESS 1.0000 1.0000 1.0000 10  END_DATE 1.0000 1.0000 1.0000 1  PHONE_NUMBER 1.0000 1.0000 1.0000 1  PORTFOLIO_NAME 1.0000 1.0000 1.0000 1  PROFIT_CENTER 1.0000 1.0000 1.0000 1  START_DATE 1.0000 1.0000 1.0000 1  VALUE_DATE 1.0000 1.0000 1.0000 3  micro avg 1.0000 1.0000 1.0000 68  macro avg 1.0000 1.0000 1.0000 68  weighted avg 1.0000 1.0000 1.0000 68


TrainOutput(global_step=935, training_loss=0.14142433105305555, metrics={'train_runtime': 383.4276, 'train_samples_per_second': 38.925, 'train_steps_per_second': 2.439, 'total_flos': 975254404896000.0, 'train_loss': 0.14142433105305555, 'epoch': 5.0})

In [36]:
from google.colab import drive
# Force a fresh mount of Drive before saving.
# NOTE(review): presumably guards against the earlier mount having gone stale
# during training — confirm this is still needed.
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [38]:
# Save the fine-tuned model to the Drive folder set in the configuration cell

trainer.save_model(trained_model_path)

# Save the tokenizer next to it (both path variables point at the same directory)

tokenizer.save_pretrained(trained_tokenizer_path)

('/content/drive/My Drive/Colab Notebooks/NER_for_SQL/ner-roberta-with-cust/tokenizer_config.json',
 '/content/drive/My Drive/Colab Notebooks/NER_for_SQL/ner-roberta-with-cust/special_tokens_map.json',
 '/content/drive/My Drive/Colab Notebooks/NER_for_SQL/ner-roberta-with-cust/vocab.json',
 '/content/drive/My Drive/Colab Notebooks/NER_for_SQL/ner-roberta-with-cust/merges.txt',
 '/content/drive/My Drive/Colab Notebooks/NER_for_SQL/ner-roberta-with-cust/added_tokens.json',
 '/content/drive/My Drive/Colab Notebooks/NER_for_SQL/ner-roberta-with-cust/tokenizer.json')