# 📦 1. Install Dependencies
!pip install -r requirements.txt

# 📂 2. Load libraries

In [1]:
import json
import pandas as pd

# 📂 2b. Load Training Data (CSV)

In [2]:
# JSON alternative (kept for reference): builds the same text/intent frame from
# a file that maps each intent name to a list of example phrases.
# with open("../intents.json", "r") as f:
#     intents = json.load(f)
# data = []
# for intent, phrases in intents.items():
#     for phrase in phrases:
#         data.append({"text": phrase, "intent": intent})
# df = pd.DataFrame(data)

# The CSV route is simpler. The cells below rely on its "text" and "intent" columns.
df = pd.read_csv("training_data.csv")

# Display six random rows as a quick look at the data.
df.sample(6)

Unnamed: 0,text,intent
15,catch you later,goodbye
37,what is cryptex?,others
17,so long,goodbye
14,adios,goodbye
41,what is the meaning of life?,others
33,I'm confused,others


# 🏷️ 3. Label Encoding

In [3]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the intent names and store each row's integer label id.
le = LabelEncoder()
df["label"] = le.fit_transform(df["intent"])

# One output unit per distinct intent.
num_labels = le.classes_.size

# The position of each name in classes_ is its label id, so this order is important.
print(num_labels, vars(le), sep="\n")


4
{'classes_': array(['goodbye', 'greet', 'others', 'thank_you'], dtype=object)}


# 🔠 4. Tokenize Text

In [4]:
from transformers import BertTokenizer

# Tokenizer for the same "bert-base-uncased" checkpoint that is fine-tuned below.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Encode every training utterance in one call, returning padded/truncated
# PyTorch tensors.
tokens = tokenizer(
    df["text"].tolist(),
    return_tensors="pt",
    padding=True,
    truncation=True,
)

  from .autonotebook import tqdm as notebook_tqdm


# 🧱 5. Create Dataset

In [5]:
import torch

class IntentDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with integer intent labels.

    Args:
        encodings: Mapping of field name to an indexable batch (one entry per
            example), e.g. the output of a tokenizer call.
        labels: Sequence of label ids, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        # Slice every encoding field at the requested position...
        sample = {}
        for field, batch in self.encodings.items():
            sample[field] = batch[idx]
        # ...and attach the label as a tensor under the "labels" key.
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Training set: the tokenized utterances paired with their encoded label ids.
dataset = IntentDataset(encodings=tokens, labels=df["label"].tolist())

# 🤖 6. Load BERT for Classification

In [6]:
from transformers import BertForSequenceClassification

# Pre-trained BERT with a classification head sized to the number of intents
# (num_labels was computed from the label encoder's classes above).
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 🏋️ 7. Train the Model

In [7]:
from transformers import Trainer, TrainingArguments

# Fine-tuning configuration: 20 epochs with a per-device batch size of 4,
# logging the training loss every 10 steps.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    num_train_epochs=20,
    logging_dir='./logs',
    logging_steps=10,
    report_to="none"
)

# Only a training dataset is passed; no eval_dataset is configured.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset
)

trainer.train()


# Persist the fine-tuned model and the tokenizer files together in
# intent_model/; later cells reload from and export this directory.
model.save_pretrained("intent_model")
tokenizer.save_pretrained("intent_model")



Step,Training Loss
10,1.2561
20,1.0527
30,0.7868
40,0.5498
50,0.4035
60,0.2582
70,0.086
80,0.0324
90,0.0165
100,0.0111


('intent_model/tokenizer_config.json',
 'intent_model/special_tokens_map.json',
 'intent_model/vocab.txt',
 'intent_model/added_tokens.json')

# 💾 8. Reload Saved Model and Save Label Encoder

In [8]:
import joblib

# Reload the tokenizer and model from the intent_model directory written by the
# training cell; the prediction cells below use these reloaded objects.
tokenizer = BertTokenizer.from_pretrained("intent_model")
model = BertForSequenceClassification.from_pretrained("intent_model")


# Persist the fitted label encoder: it holds the class order that maps label
# ids back to intent names.
joblib.dump(le, "label_encoder.pkl")

['label_encoder.pkl']

In [9]:
def predict_intent(text, confidence_threshold=0.7):
    """Classify a single utterance with the fine-tuned PyTorch model.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``le`` objects.

    Args:
        text: The utterance to classify (a single string).
        confidence_threshold: Minimum softmax probability the top class must
            reach; below it the prediction falls back to "others".

    Returns:
        The intent name decoded by the label encoder, or "others" when the
        top probability is below ``confidence_threshold``.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Keep the inputs on the same device as the model so the call also works
    # when the model lives on an accelerator instead of the CPU.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits
    probabilities = torch.nn.functional.softmax(logits, dim=1)
    max_prob, predicted = torch.max(probabilities, dim=1)

    # If confidence is below threshold, return "others"
    if max_prob.item() < confidence_threshold:
        return "others"

    # Bring the class index back to the CPU before handing it to NumPy/sklearn.
    return le.inverse_transform(predicted.cpu().numpy())[0]

# Try it!
print(predict_intent("hiya!"))            # → greet
print(predict_intent("tell me a joke"))   # → others (there is no "joke" intent)


greet
others


In [10]:
print(predict_intent("what is tns audit store and how is it designed?"))   # → others

print(predict_intent("hi there!"))   # → greet


others
greet


In [11]:
from optimum.exporters.onnx import main_export
from pathlib import Path
import os

# Directory holding the fine-tuned model and tokenizer saved earlier in the notebook
source_model_dir = "intent_model"

# Define the output directory for the ONNX model and associated files
onnx_output_dir = Path("src/main/resources/intent_model_onnx")
onnx_output_dir.mkdir(parents=True, exist_ok=True)  # Ensure the directory exists

print(f"Attempting to export model from: {os.path.abspath(source_model_dir)}")
print(f"ONNX model and associated files will be saved in: {os.path.abspath(onnx_output_dir)}")

try:
    # Export the model.
    # For BertForSequenceClassification, the task is "text-classification".
    # The exporter will save 'model.onnx' and other files (tokenizer config, model config)
    # into the onnx_output_dir.
    main_export(
        model_name_or_path=source_model_dir,
        output=onnx_output_dir,  # Specify the directory for output
        task="text-classification",  # Standard task name for sequence classification models
        # opset=12,  # Optional: specify a specific ONNX opset version. Defaults to a stable one.
        # device="cpu", # Optional: specify device for export ('cpu' or 'cuda')
        # framework="pt", # Optional: can be 'pt' (PyTorch) or 'tf' (TensorFlow). Usually auto-detected.
    )
except Exception as e:
    print(f"An error occurred during ONNX export: {e}")
    print(f"Please ensure that your '{source_model_dir}' directory contains all necessary files "
          f"(e.g., pytorch_model.bin, config.json, vocab.txt, tokenizer_config.json).")
    # Re-raise after printing the hints: the cells below load model.onnx, so
    # continuing past a failed export would validate a missing or stale file.
    raise
else:
    print(f"Successfully exported ONNX model and associated files to: {onnx_output_dir}")
    print(f"The ONNX model file is: {onnx_output_dir / 'model.onnx'}")
    print("Tokenizer files (e.g., vocab.txt, tokenizer_config.json) should also be in this directory.")



Attempting to export model from: /Volumes/Workspace/code/ai/hello-classifier/intent_model
ONNX model and associated files will be saved in: /Volumes/Workspace/code/ai/hello-classifier/src/main/resources/intent_model_onnx
Successfully exported ONNX model and associated files to: src/main/resources/intent_model_onnx
The ONNX model file is: src/main/resources/intent_model_onnx/model.onnx
Tokenizer files (e.g., vocab.txt, tokenizer_config.json) should also be in this directory.


In [15]:
# Run and validate the onnx model
import joblib
import onnxruntime as ort


model_path = "src/main/resources/intent_model_onnx"

# Load the exported ONNX model and the tokenizer files stored in the same directory
onnx_session = ort.InferenceSession(model_path+"/model.onnx")
onnx_tokenizer = BertTokenizer.from_pretrained(model_path)

# Label id -> intent name, in the label encoder's class order. Read it back from
# the encoder saved earlier in this notebook instead of hard-coding the list, so
# it cannot drift from the labels the model was trained on.
# NOTE: label_encoder.pkl is a pickle; only load files this notebook produced.
id2label = joblib.load("label_encoder.pkl").classes_.tolist()


def onnx_predict(text, confidence_threshold=0.7):
    """Classify one utterance with the exported ONNX model.

    Uses the notebook-level ``onnx_tokenizer``, ``onnx_session`` and
    ``id2label`` objects.

    Args:
        text: The utterance to classify.
        confidence_threshold: Minimum softmax probability required for the top
            class; anything lower is reported as "others".

    Returns:
        The intent name looked up in ``id2label``, or "others" when the top
        probability is below ``confidence_threshold``.
    """
    # Tokenize, capping the sequence at 128 tokens.
    encoded = onnx_tokenizer(
        text,
        max_length=128,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )

    # The session is fed NumPy arrays keyed by these three input names.
    feed_names = ("input_ids", "attention_mask", "token_type_ids")
    ort_inputs = {name: encoded[name].numpy() for name in feed_names}

    # The first output holds the logits; turn them into class probabilities.
    logits = onnx_session.run(None, ort_inputs)[0]
    probabilities = torch.nn.functional.softmax(torch.tensor(logits), dim=1)
    max_prob, predicted = torch.max(probabilities, dim=1)

    # Low-confidence predictions fall back to the catch-all intent.
    return "others" if max_prob.item() < confidence_threshold else id2label[predicted.item()]
    

In [16]:
 # Test some examples
test_texts = [
    "hi there!",
    "tell me a joke",
    "what is tns audit store and how is it designed?",
    "thank you very much",
]

# Print each utterance followed by the intent the ONNX model assigns to it.
for text in test_texts:
    intent = onnx_predict(text)
    print(f"Text: {text}", f"Predicted Intent: {intent}\n", sep="\n")

Text: hi there!
Predicted Intent: greet

Text: tell me a joke
Predicted Intent: others

Text: what is tns audit store and how is it designed?
Predicted Intent: others

Text: thank you very much
Predicted Intent: thank_you

