<a href="https://colab.research.google.com/github/horasan/eng_to_sql_ner/blob/main/NER_A_4_Model_Inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import torch

In [1]:
from google.colab import drive
# Mount Google Drive so the project files stored there become readable.
drive.mount('/content/drive')

# Project folder inside "Colab Notebooks". FULL_PATH keeps its trailing slash
# because other cells build file paths from it by plain string concatenation.
FOLDER_PATH = "NER_for_SQL"
FULL_PATH = f"/content/drive/My Drive/Colab Notebooks/{FOLDER_PATH}/"

Mounted at /content/drive


In [2]:

# File name of the BIO-tagged dataset (not read by the cells visible in this notebook).
bio_tagged_dataset_file_name = "synthetic_queries_300_bio_tagged.txt"

# File names of the label-mapping JSON files stored under FULL_PATH.
tag2id_with_cust_file_name = "tag2id_with_cust.json"
id2tag_with_cust_file_name = "id2tag_with_cust.json"

# The model path and the tokenizer path point at the same directory.
trained_model_path = f"{FULL_PATH}ner-roberta-with-cust"
trained_tokenizer_path = trained_model_path

# utils

In [3]:
def predict(text, tokenizer, model, id2tag):
    """Tag each whitespace-separated word of ``text`` with its predicted label.

    Args:
        text: Input sentence; it is split on whitespace into words.
        tokenizer: Callable tokenizer that accepts pre-split words
            (``is_split_into_words=True``) and returns an encoding exposing
            ``word_ids()`` and ``to(device)`` -- e.g. a Hugging Face *fast*
            tokenizer.
        model: Token-classification model whose output has a ``logits``
            attribute. The argmax is taken over the last axis and batch
            item 0 is read. If the model exposes a ``device`` attribute the
            inputs are moved to it before the forward pass.
        id2tag: Mapping from label id to tag name. Integer ids are looked up
            first; string keys (as produced by a plain ``json.load``) are
            accepted as a fallback.

    Returns:
        A list of ``(word, tag)`` tuples, one per word present in the
        encoding, labelled with the prediction of the word's first sub-token.
        Blank input yields an empty list.

    Raises:
        KeyError: If a predicted label id is missing from ``id2tag``.
    """
    tokens = text.split()
    if not tokens:
        # Nothing to tag: skip the tokenizer and the forward pass entirely.
        return []

    # Offsets are not requested; the sub-token -> word alignment comes from
    # word_ids(), so there is nothing to strip before calling the model.
    encoding = tokenizer(
        tokens,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
    )
    word_ids = encoding.word_ids()

    # Inputs must live on the same device as the model, otherwise the forward
    # pass fails as soon as the model has been moved to a GPU.
    device = getattr(model, "device", None)
    if device is not None:
        encoding = encoding.to(device)

    model.eval()
    with torch.no_grad():
        output = model(**encoding)
    # One transfer to Python ints instead of one .item() call per token.
    label_ids = torch.argmax(output.logits, dim=-1)[0].tolist()

    results = []
    previous_word_idx = None
    for label_id, word_idx in zip(label_ids, word_ids):
        # Special tokens have no word (None); continuation sub-tokens repeat
        # the previous word index. Only a word's first sub-token is reported.
        if word_idx is not None and word_idx != previous_word_idx:
            try:
                tag = id2tag[label_id]
            except KeyError:
                # Mapping loaded from JSON without converting keys back to int.
                tag = id2tag[str(label_id)]
            results.append((tokens[word_idx], tag))
        previous_word_idx = word_idx
    return results


# Load saved model

In [None]:
from transformers import RobertaTokenizerFast, RobertaForTokenClassification
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Prefer the tokenizer stored at trained_tokenizer_path (presumably saved by the
# training notebook -- TODO confirm) so inference does not silently depend on
# the hub's base checkpoint. add_prefix_space=True is needed because the
# tokenizer is later called with is_split_into_words=True.
try:
    tokenizer = RobertaTokenizerFast.from_pretrained(trained_tokenizer_path, add_prefix_space=True)
except OSError:
    # No tokenizer files at that path: keep the previous behaviour.
    print(f"No tokenizer found at {trained_tokenizer_path}; falling back to roberta-base")
    tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base", add_prefix_space=True)

# Fine-tuned token-classification weights.
model = AutoModelForTokenClassification.from_pretrained(trained_model_path)

# Load saved id-tag files

In [5]:
import json

#tag2id_with_cust_file_name = "tag2id_with_cust.json"
#id2tag_with_cust_file_name = "id2tag_with_cust.json"

# JSON object keys are always strings, so convert them back to the integer
# label ids used for lookup in predict(). The encoding is given explicitly
# so reading the file does not depend on the platform's default locale.
with open(FULL_PATH + id2tag_with_cust_file_name, "r", encoding="utf-8") as f:
    id2tag = {int(k): v for k, v in json.load(f).items()}


# Capture the business entities

In [8]:
#text = "Get trades for Deutsche Bank with status cancelled"
#text = "Get trades for Deutsche Bank with status cancelled and value date is tomorrow"
#text = "Get trades for Deutsche Bank with status cancelled and value date is tomorrow and amount is 1000 EUR"
#text = "Get all the 3rd party and their phone number where settlement date is today"
# Sample request to run through the tagger.
text = "Get MM trades for ABC BANK with status approved and value date is tomorrow and amount is 3000 and cur is EUR"
result = predict(text, tokenizer, model, id2tag)

# Emit one tab-separated "word<TAB>tag" line per word.
for word, tag in result:
    print(word, tag, sep="\t")


Get	O
MM	B-DEAL_TYPE
trades	O
for	O
ABC	B-CUSTOMER_NAME
BANK	I-CUSTOMER_NAME
with	O
status	O
approved	B-STATUS
and	O
value	O
date	O
is	O
tomorrow	B-VALUE_DATE
and	O
amount	O
is	O
3000	B-AMOUNT
and	O
cur	O
is	O
EUR	B-CURRENCY
