In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertModel
from TorchCRF import CRF



class BertCRFModel(nn.Module):
    def __init__(self, config: dict, intent_labels: list[str], slot_labels: list[str], dropout: float = 0.1):
        super(BertCRFModel, self).__init__()
        self.config = config
        self.intent_labels = intent_labels
        self.slot_labels = slot_labels
        self.dropout_rate = dropout
        
        # Load pre-trained BERT
        self.bert = BertModel.from_pretrained(self.config["model_dir"])
        
        # Dropout layer
        self.dropout = nn.Dropout(self.dropout_rate)
        
        # Intent classification layers
        self.intent_layer_1 = nn.Linear(self.bert.config.hidden_size, self.config["out_first_layer"])
        self.intent_activation_1 = nn.ReLU()
        self.intent_layer_2 = nn.Linear(self.config["out_first_layer"], len(self.intent_labels))
        
        # Slot filling layers
        self.slot_layer_1 = nn.Linear(self.bert.config.hidden_size, self.config["out_first_layer"])
        self.slot_activation_1 = nn.ReLU()
        self.slot_layer_2 = nn.Linear(self.config["out_first_layer"], len(self.slot_labels))
        
        # CRF layer for slot filling
        self.crf = CRF(len(self.slot_labels), batch_first = True)

    def forward(self, input_ids: torch.Tensor, token_type_ids: torch.Tensor, attention_mask: torch.Tensor, labels: torch.Tensor = None):
        # BERT outputs
        bert_outputs = self.bert(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
        sequence_output = bert_outputs.last_hidden_state  # Sequence output for slot filling
        pooled_output = bert_outputs.pooler_output       # Pooled output for intent classification
        
        # Intent classification
        intent_output_dropout = self.dropout(pooled_output)
        intent_out_layer_1 = self.intent_layer_1(intent_output_dropout)
        intent_act_1 = self.intent_activation_1(intent_out_layer_1)
        intent_logits = self.intent_layer_2(intent_act_1)
        
        # Slot filling
        slot_output_dropout = self.dropout(sequence_output)
        slot_out_layer_1 = self.slot_layer_1(slot_output_dropout)
        slot_act_1 = self.slot_activation_1(slot_out_layer_1)
        slot_logits = self.slot_layer_2(slot_act_1)
        
        if labels is not None:
            # CRF loss
            loss = -self.crf(slot_logits, labels, mask=attention_mask.bool(), reduction='mean')
            return loss, intent_logits
        else:
            # CRF decoding
            slot_predictions = self.crf.decode(slot_logits, mask=attention_mask.bool())
            return slot_predictions, intent_logits

# Example usage
config = {
    "model_dir": "bert-base-uncased",
    "out_first_layer": 768
}
intent_labels = ["intent1", "intent2", "intent3"]
slot_labels = ["slot1", "slot2", "slot3", "slot4"]

model = BertCRFModel(config, intent_labels, slot_labels)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
import torch

# Example input configuration
batch_size = 2
seq_length = 10  # Adjust this as needed
num_intent_labels = len(intent_labels)
num_slot_labels = len(slot_labels)

# Dummy input tensors
input_ids = torch.randint(0, 30522, (batch_size, seq_length))  # Random integers representing token ids
token_type_ids = torch.zeros((batch_size, seq_length), dtype=torch.long)  # All zeros (single sequence)
attention_mask = torch.ones((batch_size, seq_length), dtype=torch.long)  # All ones (no padding)

# Dummy labels for slot filling
labels = torch.randint(0, num_slot_labels, (batch_size, seq_length))

# Create the model instance
model = BertCRFModel(config, intent_labels, slot_labels)

# Put the model in evaluation mode (not strictly necessary for this test)
model.eval()

# Test the forward method with dummy inputs
with torch.no_grad():
    # Without labels (inference mode)
    slot_predictions, intent_logits = model(input_ids, token_type_ids, attention_mask)
    print("Slot Predictions (Inference):", slot_predictions)
    print("Intent Logits (Inference):", intent_logits)

    # With labels (training mode)
    loss, intent_logits = model(input_ids, token_type_ids, attention_mask, labels)
    print("Loss (Training):", loss)
    print("Intent Logits (Training):", intent_logits)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Slot Predictions (Inference): [[1, 1, 0, 0, 0, 0, 1, 0, 3, 3], [3, 1, 1, 1, 1, 1, 1, 0, 1, 1]]
Intent Logits (Inference): tensor([[-0.3224,  0.1625, -0.1054],
        [-0.2655,  0.1444, -0.1160]])
Loss (Training): tensor(13.9148)
Intent Logits (Training): tensor([[-0.3224,  0.1625, -0.1054],
        [-0.2655,  0.1444, -0.1160]])


In [1]:
# from bert_model_implementation_torch.model import BertModel
from bert_model_implementation_torch.tokenization import  _is_control,_is_whitespace,_is_punctuation
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
import torch 
import torch.nn as nn
import numpy as np
import pandas as pd
import sys
from tqdm import tqdm
from keras.src.utils import to_categorical
from torch.utils.data import DataLoader
import shutil

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
05/18/2024 19:41:52 - INFO - numexpr.utils -   Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
05/18/2024 19:41:52 - INFO - numexpr.utils -   NumExpr defaulting to 8 threads.


In [9]:
data = pd.read_csv("../data/Book1.csv")

In [45]:
data.head()

Unnamed: 0,prompt,intent,classes
0,Can you update the sales figures in cells <Cel...,Update cell range,entry and manipulation
1,Please modify the data in range <Range>,Update cell range,entry and manipulation
2,I need to change the prices in cells <Range>,Update cell range,entry and manipulation
3,Could you update the inventory levels from <Ce...,Update cell range,entry and manipulation
4,Please adjust the budget numbers in range <Range>,Update cell range,entry and manipulation


In [46]:
config = {
    'max_len': 256,
    'batch_size': 8,
    'epochs': 10,
    'lr':1e-05,
    'out_first_layer': 768,
    'dropout_rate': 0.1,
    'model_dir':'bert-base-cased',
    'ckpt_path': './ckpts',
    'ckpt_model_path': './experiments'
}

In [10]:
intent_list = data.intent.unique().tolist()

In [48]:
class intent_dataset:
    def __init__(self, df: pd.DataFrame, tokenizer: BertTokenizer, max_len: int):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.classes = data.intent.unique().tolist()
        self.y = df['intent']
        self.x = df['prompt']
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        title = str(self.x[index])
        title= ''.join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens = True,
            return_attention_mask = True,
            return_tensors = 'pt',
            return_token_type_ids = True,
            padding = 'max_length',
            max_length = self.max_len,
            truncation = True
        )
        # print(self.classes.index(self.y[index]))
        target = self.classes.index(self.y[index])  # Get the class index
        target_tensor = torch.zeros(len(self.classes),dtype= torch.float32)  # Initialize target tensor with zeros
        target_tensor[target] = 1  # Set the corresponding index to 1
        return {
            'input_ids': inputs["input_ids"].flatten(),
            'token_type_ids': inputs['token_type_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'targets': target_tensor
        }

In [49]:
train = data.sample(frac=0.9, random_state=200).reset_index(drop=True)
val = data.drop(train.index).reset_index(drop=True)

In [50]:
tokenizer = BertTokenizer.from_pretrained(config["model_dir"])
train_dataset = intent_dataset(train, tokenizer, config["max_len"])
val_dataset = intent_dataset(val, tokenizer, config["max_len"])

In [51]:
train_loader = DataLoader(
    train_dataset,
    shuffle = True,
    batch_size = config["batch_size"],
    num_workers = 0
    )

val_loader = DataLoader(
    val_dataset,
    shuffle = False,
    batch_size = config["batch_size"],
    num_workers = 0
    )

In [52]:
train_loader.dataset[10]['targets']


tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.])

In [53]:
len(train_loader.dataset[0]['input_ids'])


256

In [12]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [26]:
def load_ckpt(ckpt_path, model, optimizer):
    ckpt = torch.load(ckpt_path)
    # print(ckpt)
    model.load_state_dict(ckpt['state_dict'])
    # optimizer.load_state_dict(ckpt['state_dict'])
    valid_loss_min = ckpt['valid_loss_min']
    return model

In [3]:
def save_ckpt(state, is_best, ckpt_path, best_model_path):
    f_path= ckpt_path
    torch.save(state, f_path)
    if is_best:
        best_f_path = best_model_path
        shutil.copyfile(f_path, best_f_path)

In [7]:
class intent_model(nn.Module):
    def __init__(self, config: dict, intent_labels: list[str], dropout: float = 0.1):
        super(intent_model,self).__init__()
        self.config = config
        self.intent_labels = intent_labels
        self.dropout_rate = dropout
        self.bert = BertModel.from_pretrained(self.config["model_dir"])
        self.dropout = nn.Dropout(self.dropout_rate)
        self.layer_1 = nn.Linear(self.bert.config.hidden_size,self.config["out_first_layer"])
        self.activation_1 = nn.ReLU()
        self.layer_2 = nn.Linear(self.config["out_first_layer"], len(self.intent_labels))
        
    def forward(self, input_ids: torch.Tensor, token_type_ids: torch.Tensor, attention_mask: torch.Tensor):
        output = self.bert(input_ids, token_type_ids, attention_mask)
        output_dropout = self.dropout(output.pooler_output)
        out_layer_1 = self.layer_1(output_dropout)
        act_1 = self.activation_1(out_layer_1)
        out_layer_2 = self.layer_2(act_1)
        return out_layer_2
        

In [39]:
model = intent_model(config, intent_list, 0.1)
model.to(device)

05/16/2024 01:11:17 - INFO - bert_model_implementation_torch.file_utils -   https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz not found in cache, downloading to C:\Users\kikos\AppData\Local\Temp\tmpcjv867ym
100%|██████████| 404400730/404400730 [07:17<00:00, 925346.54B/s] 
05/16/2024 01:18:36 - INFO - bert_model_implementation_torch.file_utils -   copying C:\Users\kikos\AppData\Local\Temp\tmpcjv867ym to cache at C:\Users\kikos\.pytorch_pretrained_bert\a803ce83ca27fecf74c355673c434e51c265fb8a3e0e57ac62a80e38ba98d384.681017f415dfb33ec8d0e04fe51a619f3f01532ecea04edbfd48c5d160550d9c
05/16/2024 01:18:36 - INFO - bert_model_implementation_torch.file_utils -   creating metadata file for C:\Users\kikos\.pytorch_pretrained_bert\a803ce83ca27fecf74c355673c434e51c265fb8a3e0e57ac62a80e38ba98d384.681017f415dfb33ec8d0e04fe51a619f3f01532ecea04edbfd48c5d160550d9c
05/16/2024 01:18:36 - INFO - bert_model_implementation_torch.file_utils -   removing temp file C:\Users\kikos\AppData

intent_model(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
 

In [40]:
def loss_fn(outputs, targets):
    targets = targets.float()
    return nn.BCEWithLogitsLoss()(outputs, targets)


In [41]:
def train(model, epochs, train_loader, val_loader, optimizer, ckpt_path, best_model_path):
    valid_loss_min = np.Inf
    for epoch in range(1, epochs + 1):
        train_loss = 0
        valid_loss = 0
        model.train()
        for batch_index, batch in tqdm(enumerate(train_loader)):
            input_ids = batch['input_ids'].to(device, dtype= torch.long)
            attention_mask = batch["attention_mask"].to(device, dtype= torch.long)
            token_type_ids = batch["token_type_ids"].to(device, dtype= torch.long)
            targets = batch["targets"].to(device, dtype= torch.long)
            outputs = model(input_ids, token_type_ids, attention_mask)
            optimizer.zero_grad()
            loss = loss_fn(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += (1/(batch_index + 1)) * (loss.item() - train_loss)
        print(f"epoch {epoch} ended with train loss of {train_loss}")
        model.eval()
        with torch.no_grad():
            for batch_index, batch in tqdm(enumerate(val_loader)):
                input_ids = batch['input_ids'].to(device, dtype= torch.long)
                attention_mask = batch["attention_mask"].to(device, dtype= torch.long)
                token_type_ids = batch["token_type_ids"].to(device, dtype= torch.long)
                targets = batch["targets"].to(device, dtype= torch.long)
                outputs = model(input_ids, token_type_ids, attention_mask)
                loss = loss_fn(outputs, targets)
                valid_loss += (1/(batch_index + 1)) * (loss.item() - valid_loss)
        print(f"epoch {epoch} ended with train loss of {valid_loss}")
        checkpoint = {
            'epoch': epoch +1,
            'valid_loss_min': valid_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        save_ckpt(checkpoint, False, ckpt_path, best_model_path)
        return model

In [42]:
train = train(model, config["epochs"], train_loader, val_loader, optimizer, config["ckpt_path"], config["ckpt_model_path"])

0it [00:02, ?it/s]


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [33]:
config = {
    'max_len': 256,
    'batch_size': 8,
    'epochs': 10,
    'lr':1e-05,
    'out_first_layer': 768,
    'dropout_rate': 0.1,
    'model_dir':'bert-base-cased',
    'ckpt_path': './ckpts',
    'ckpt_model_path': './experiments'
}
import torch.nn.functional as F

In [54]:
class intent_model(nn.Module):
    def __init__(self, config: dict, intent_labels: list[str], dropout: float = 0.1):
        super(intent_model, self).__init__()
        self.config = config
        self.intent_labels = intent_labels
        self.dropout_rate = dropout
        self.bert = BertModel.from_pretrained(self.config["model_dir"])
        self.dropout = nn.Dropout(self.dropout_rate)
        self.layer_1 = nn.Linear(self.bert.config.hidden_size, self.config["out_first_layer"])
        self.activation_1 = nn.ReLU()
        self.layer_2 = nn.Linear(self.config["out_first_layer"], len(self.intent_labels))
        
    def forward(self, input_ids: torch.Tensor, token_type_ids: torch.Tensor, attention_mask: torch.Tensor):
        output = self.bert(input_ids, token_type_ids, attention_mask)
        output_dropout = self.dropout(output.pooler_output)
        out_layer_1 = self.layer_1(output_dropout)
        act_1 = self.activation_1(out_layer_1)
        out_layer_2 = self.layer_2(act_1)
        return out_layer_2
    
    def predict_intent(self, text: str, tokenizer: BertTokenizer):
        # Tokenize input text
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
        input_ids = inputs['input_ids']
        token_type_ids = inputs['token_type_ids']
        attention_mask = inputs['attention_mask']
        
        # Move tensors to the appropriate device
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.to(device)
        input_ids = input_ids.to(device)
        token_type_ids = token_type_ids.to(device)
        attention_mask = attention_mask.to(device)
        
        # Make predictions
        self.eval()  # Set the model to evaluation mode
        with torch.no_grad():
            logits = self(input_ids, token_type_ids, attention_mask)
        
        # Convert logits to probabilities
        probs = F.softmax(logits, dim=1)
        
        # Get the predicted label
        predicted_label_idx = torch.argmax(probs, dim=1).item()
        print(predicted_label_idx)
        predicted_label = self.intent_labels[predicted_label_idx]
        
        return predicted_label, probs[0].cpu().numpy()

In [55]:
tokenizer = BertTokenizer.from_pretrained(config["model_dir"])

In [56]:
optimizer = torch.optim.Adam(params = model.parameters(), lr = config["lr"])

In [57]:
model = intent_model(config, intent_list, 0.1)
model.to(device)

intent_model(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affi

In [58]:
model.bert.resize_token_embeddings(30522)

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embeding dimension will be 30522. This might induce some performance reduction as *Tensor Cores* will not be available. For more details  about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


Embedding(30522, 768)

In [59]:
model = load_ckpt("./ckpts/ckpt", model, optimizer)

In [61]:
model.predict_intent("Please modify the data in range", tokenizer)

14


('Delete hyperlink',
 array([0.01483624, 0.00962736, 0.01309677, 0.01109273, 0.01267296,
        0.01391257, 0.01042866, 0.01172547, 0.01188861, 0.01360726,
        0.01299947, 0.01062253, 0.01138435, 0.01929716, 0.01973281,
        0.00815214, 0.01179272, 0.01155274, 0.01214933, 0.01029058,
        0.01119961, 0.00984573, 0.01025925, 0.01101724, 0.01066194,
        0.01142314, 0.01046023, 0.01088517, 0.01101524, 0.01014855,
        0.01382952, 0.00946699, 0.01382294, 0.01145752, 0.01230946,
        0.01451662, 0.0112405 , 0.0141823 , 0.00991062, 0.01725945,
        0.01911117, 0.0094124 , 0.0120395 , 0.00818533, 0.01099228,
        0.01047215, 0.00911422, 0.00737221, 0.01044737, 0.0114465 ,
        0.00703791, 0.00761131, 0.00768395, 0.00895203, 0.00710685,
        0.01233352, 0.00776994, 0.00646441, 0.00676312, 0.00951902,
        0.01012736, 0.01270906, 0.0093776 , 0.01227192, 0.01941985,
        0.011088  , 0.0120331 , 0.01338007, 0.00844096, 0.00996742,
        0.00847649, 0.00942

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup, AdamW
from tqdm import tqdm
import pandas as pd
import numpy as np
from TorchCRF import CRF


class BertCRFModel(nn.Module):
    def __init__(self, config: dict, intent_labels: list[str], slot_labels: list[str], dropout: float = 0.1):
        super(BertCRFModel, self).__init__()
        self.config = config
        self.intent_labels = intent_labels
        self.slot_labels = slot_labels
        self.dropout_rate = dropout
        
        # Load pre-trained BERT
        self.bert = BertModel.from_pretrained(self.config["model_dir"])
        
        # Dropout layer
        self.dropout = nn.Dropout(self.dropout_rate)
        
        # Intent classification layers
        self.intent_layer_1 = nn.Linear(self.bert.config.hidden_size, self.config["out_first_layer"])
        self.intent_activation_1 = nn.ReLU()
        self.intent_layer_2 = nn.Linear(self.config["out_first_layer"], len(self.intent_labels))
        
        # Slot filling layers
        self.slot_layer_1 = nn.Linear(self.bert.config.hidden_size, self.config["out_first_layer"])
        self.slot_activation_1 = nn.ReLU()
        self.slot_layer_2 = nn.Linear(self.config["out_first_layer"], len(self.slot_labels))
        
        # CRF layer for slot filling
        self.crf = CRF(len(self.slot_labels), batch_first=True)

    def forward(self, input_ids: torch.Tensor, token_type_ids: torch.Tensor, attention_mask: torch.Tensor, labels: torch.Tensor = None):
        # BERT outputs
        bert_outputs = self.bert(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
        sequence_output = bert_outputs.last_hidden_state  # Sequence output for slot filling
        pooled_output = bert_outputs.pooler_output       # Pooled output for intent classification
        
        # Intent classification
        intent_output_dropout = self.dropout(pooled_output)
        intent_out_layer_1 = self.intent_layer_1(intent_output_dropout)
        intent_act_1 = self.intent_activation_1(intent_out_layer_1)
        intent_logits = self.intent_layer_2(intent_act_1)
        
        # Slot filling
        slot_output_dropout = self.dropout(sequence_output)
        slot_out_layer_1 = self.slot_layer_1(slot_output_dropout)
        slot_act_1 = self.slot_activation_1(slot_out_layer_1)
        slot_logits = self.slot_layer_2(slot_act_1)
        
        if labels is not None:
            # CRF loss
            loss = -self.crf(slot_logits, labels, mask=attention_mask.bool(), reduction='mean')
            return loss, intent_logits
        else:
            # CRF decoding
            slot_predictions = self.crf.decode(slot_logits, mask=attention_mask.bool())
            return slot_predictions, intent_logits

# Dataset class
class isf_dataset(Dataset):
    def __init__(self, df: pd.DataFrame, intent_classes: list[str], slot_classes: list[str], tokenizer: BertTokenizer, max_len: int):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.intent_classes = intent_classes
        self.slot_classes = slot_classes
        self.intents = df['intent']
        self.slots = df["slots"]
        self.x = df['prompt']
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        title = str(self.x[index])
        title = ''.join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            return_attention_mask=True,
            return_tensors='pt',
            return_token_type_ids=True,
            padding='max_length',
            max_length=self.max_len,
            truncation=True
        )
        intent_target = self.intent_classes.index(self.intents[index])
        slot_targets = [self.slot_classes.index(slot) for slot in list(self.slots[index])][:self.max_len] + [self.slot_classes.index("O")] * (self.max_len - len(list(self.slots[index])))
        return {
            'input_ids': inputs["input_ids"].flatten(),
            'token_type_ids': inputs['token_type_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'intent_targets': torch.tensor(intent_target, dtype=torch.long),
            'slot_targets': torch.tensor(slot_targets, dtype=torch.long)
        }

def train_model(model, train_loader, val_loader, optimizer, scheduler, num_epochs, device):
    model.to(device)
    best_loss = float('inf')

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}"):
            input_ids = batch['input_ids'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            intent_labels = batch['intent_targets'].to(device)
            slot_labels = batch['slot_targets'].to(device)

            optimizer.zero_grad()

            slot_loss, intent_logits = model(input_ids, token_type_ids, attention_mask, slot_labels)
            intent_loss = nn.CrossEntropyLoss()(intent_logits, intent_labels)
            loss = slot_loss + intent_loss

            loss.backward()
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        print(f"Training loss: {avg_train_loss}")

        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in tqdm(val_loader, desc="Validation"):
                input_ids = batch['input_ids'].to(device)
                token_type_ids = batch['token_type_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                intent_labels = batch['intent_targets'].to(device)
                slot_labels = batch['slot_targets'].to(device)

                slot_loss, intent_logits = model(input_ids, token_type_ids, attention_mask, slot_labels)
                intent_loss = nn.CrossEntropyLoss()(intent_logits, intent_labels)
                loss = slot_loss + intent_loss

                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(val_loader)
        print(f"Validation loss: {avg_val_loss}")

        if avg_val_loss < best_loss:
            best_loss = avg_val_loss
            torch.save(model.state_dict(), "best_model.pt")
            print("Saved the best model!")
    return model

# Example usage
if __name__ == "__main__":
    # Configuration
    config = {
        "model_dir": "bert-base-uncased",
        "out_first_layer": 128,
        "dropout_rate": 0.1,
        "learning_rate": 1e-5,
        "num_epochs": 60,
        "batch_size": 16,
        "max_len": 128
    }
    intent_labels = ["intent1", "intent2", "intent3"]  # Example intents
    slot_labels = ["slot1", "slot2", "slot3", "O"]  # Example slots including "O" for outside of any slot

    # Initialize tokenizer
    tokenizer = BertTokenizer.from_pretrained(config["model_dir"])

    # Create dummy dataframe
    data = {
        "prompt": ["Example sentence one.", "Example sentence two."],
        "intent": ["intent1", "intent2"],
        "slots": [["O", "O", "slot1", "O"], ["O", "O", "slot2", "O"]]
    }
    df = pd.DataFrame(data)

    # Create datasets and dataloaders
    train_dataset = isf_dataset(df, intent_labels, slot_labels, tokenizer, max_len=config["max_len"])
    val_dataset = isf_dataset(df, intent_labels, slot_labels, tokenizer, max_len=config["max_len"])

    train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=config["batch_size"], shuffle=False)

    # Initialize model
    model = BertCRFModel(config, intent_labels, slot_labels)

    # Optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=config["learning_rate"])
    total_steps = len(train_loader) * config["num_epochs"]
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    # Train the model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = train_model(model, train_loader, val_loader, optimizer, scheduler, config["num_epochs"], device)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Training Epoch 1: 100%|██████████| 1/1 [00:06<00:00,  6.41s/it]


Training loss: 10.891477584838867


Validation: 100%|██████████| 1/1 [00:00<00:00,  8.25it/s]


Validation loss: 10.42165756225586
Saved the best model!


Training Epoch 2: 100%|██████████| 1/1 [00:00<00:00,  2.44it/s]


Training loss: 10.549955368041992


Validation: 100%|██████████| 1/1 [00:00<00:00,  8.53it/s]


Validation loss: 9.890883445739746
Saved the best model!


Training Epoch 3: 100%|██████████| 1/1 [00:00<00:00,  2.77it/s]


Training loss: 10.042511940002441


Validation: 100%|██████████| 1/1 [00:00<00:00,  8.57it/s]


Validation loss: 9.409448623657227
Saved the best model!


Training Epoch 4: 100%|██████████| 1/1 [00:00<00:00,  2.78it/s]


Training loss: 9.78728199005127


Validation: 100%|██████████| 1/1 [00:00<00:00,  8.56it/s]


Validation loss: 8.974279403686523
Saved the best model!


Training Epoch 5: 100%|██████████| 1/1 [00:00<00:00,  2.25it/s]


Training loss: 9.540262222290039


Validation: 100%|██████████| 1/1 [00:00<00:00,  8.90it/s]


Validation loss: 8.632535934448242
Saved the best model!


Training Epoch 6: 100%|██████████| 1/1 [00:00<00:00,  2.73it/s]


Training loss: 9.254839897155762


Validation: 100%|██████████| 1/1 [00:00<00:00,  8.60it/s]


Validation loss: 8.33405876159668
Saved the best model!


Training Epoch 7: 100%|██████████| 1/1 [00:00<00:00,  2.83it/s]


Training loss: 8.824129104614258


Validation: 100%|██████████| 1/1 [00:00<00:00,  7.77it/s]


Validation loss: 8.039942741394043
Saved the best model!


Training Epoch 8: 100%|██████████| 1/1 [00:00<00:00,  2.82it/s]


Training loss: 8.45923137664795


Validation: 100%|██████████| 1/1 [00:00<00:00,  8.22it/s]


Validation loss: 7.754698276519775
Saved the best model!


Training Epoch 9: 100%|██████████| 1/1 [00:00<00:00,  2.61it/s]


Training loss: 7.957639694213867


Validation: 100%|██████████| 1/1 [00:00<00:00,  8.04it/s]


Validation loss: 7.477628707885742
Saved the best model!


Training Epoch 10: 100%|██████████| 1/1 [00:00<00:00,  2.57it/s]


Training loss: 7.7474589347839355


Validation: 100%|██████████| 1/1 [00:00<00:00,  8.48it/s]


Validation loss: 7.199864864349365
Saved the best model!


Training Epoch 11: 100%|██████████| 1/1 [00:00<00:00,  2.71it/s]


Training loss: 7.7237443923950195


Validation: 100%|██████████| 1/1 [00:00<00:00,  8.02it/s]


Validation loss: 6.93419075012207
Saved the best model!


Training Epoch 12: 100%|██████████| 1/1 [00:00<00:00,  2.06it/s]


Training loss: 7.044919490814209


Validation: 100%|██████████| 1/1 [00:00<00:00,  7.48it/s]


Validation loss: 6.681292533874512
Saved the best model!


Training Epoch 13: 100%|██████████| 1/1 [00:00<00:00,  2.42it/s]


Training loss: 7.100327491760254


Validation: 100%|██████████| 1/1 [00:00<00:00,  8.57it/s]


Validation loss: 6.42641544342041
Saved the best model!


Training Epoch 14: 100%|██████████| 1/1 [00:00<00:00,  2.66it/s]


Training loss: 6.975263595581055


Validation: 100%|██████████| 1/1 [00:00<00:00,  8.09it/s]


Validation loss: 6.193172454833984
Saved the best model!


Training Epoch 15: 100%|██████████| 1/1 [00:00<00:00,  2.51it/s]


Training loss: 6.572647571563721


Validation: 100%|██████████| 1/1 [00:00<00:00,  8.15it/s]


Validation loss: 5.976933479309082
Saved the best model!


Training Epoch 16: 100%|██████████| 1/1 [00:00<00:00,  2.48it/s]


Training loss: 6.504485607147217


Validation: 100%|██████████| 1/1 [00:00<00:00,  7.77it/s]


Validation loss: 5.777250289916992
Saved the best model!


Training Epoch 17: 100%|██████████| 1/1 [00:00<00:00,  2.37it/s]


Training loss: 6.111370086669922


Validation: 100%|██████████| 1/1 [00:00<00:00,  8.64it/s]


Validation loss: 5.597453594207764
Saved the best model!


Training Epoch 18: 100%|██████████| 1/1 [00:00<00:00,  2.84it/s]


Training loss: 6.043363571166992


Validation: 100%|██████████| 1/1 [00:00<00:00,  8.36it/s]


Validation loss: 5.433134078979492
Saved the best model!


Training Epoch 19: 100%|██████████| 1/1 [00:00<00:00,  2.63it/s]


Training loss: 6.069851875305176


Validation: 100%|██████████| 1/1 [00:00<00:00,  6.91it/s]


Validation loss: 5.280535697937012
Saved the best model!


Training Epoch 20: 100%|██████████| 1/1 [00:00<00:00,  2.14it/s]


Training loss: 5.79055118560791


Validation: 100%|██████████| 1/1 [00:00<00:00,  5.87it/s]


Validation loss: 5.136474609375
Saved the best model!


Training Epoch 21: 100%|██████████| 1/1 [00:00<00:00,  2.21it/s]


Training loss: 5.767029762268066


Validation: 100%|██████████| 1/1 [00:00<00:00,  7.66it/s]


Validation loss: 5.002470970153809
Saved the best model!


Training Epoch 22: 100%|██████████| 1/1 [00:00<00:00,  2.44it/s]


Training loss: 5.585514545440674


Validation: 100%|██████████| 1/1 [00:00<00:00,  7.84it/s]


Validation loss: 4.87251091003418
Saved the best model!


Training Epoch 23: 100%|██████████| 1/1 [00:00<00:00,  2.60it/s]


Training loss: 5.567394256591797


Validation: 100%|██████████| 1/1 [00:00<00:00,  7.83it/s]


Validation loss: 4.75618839263916
Saved the best model!


Training Epoch 24: 100%|██████████| 1/1 [00:00<00:00,  2.57it/s]


Training loss: 5.3152594566345215


Validation: 100%|██████████| 1/1 [00:00<00:00,  6.39it/s]


Validation loss: 4.642885684967041
Saved the best model!


Training Epoch 25: 100%|██████████| 1/1 [00:00<00:00,  2.57it/s]


Training loss: 5.15069055557251


Validation: 100%|██████████| 1/1 [00:00<00:00,  8.02it/s]


Validation loss: 4.5320844650268555
Saved the best model!


Training Epoch 26: 100%|██████████| 1/1 [00:00<00:00,  2.48it/s]


Training loss: 5.006831169128418


Validation: 100%|██████████| 1/1 [00:00<00:00,  8.15it/s]


Validation loss: 4.43546199798584
Saved the best model!


Training Epoch 27: 100%|██████████| 1/1 [00:00<00:00,  2.70it/s]


Training loss: 5.222864151000977


Validation: 100%|██████████| 1/1 [00:00<00:00,  8.72it/s]


Validation loss: 4.347010612487793
Saved the best model!


Training Epoch 28: 100%|██████████| 1/1 [00:00<00:00,  2.67it/s]


Training loss: 4.840567588806152


Validation: 100%|██████████| 1/1 [00:00<00:00,  8.57it/s]


Validation loss: 4.270637035369873
Saved the best model!


Training Epoch 29: 100%|██████████| 1/1 [00:00<00:00,  2.62it/s]


Training loss: 4.725435733795166


Validation: 100%|██████████| 1/1 [00:00<00:00,  8.20it/s]


Validation loss: 4.186877250671387
Saved the best model!


Training Epoch 30: 100%|██████████| 1/1 [00:00<00:00,  2.29it/s]


Training loss: 4.926697254180908


Validation: 100%|██████████| 1/1 [00:00<00:00,  8.26it/s]


Validation loss: 4.1031293869018555
Saved the best model!


Training Epoch 31: 100%|██████████| 1/1 [00:00<00:00,  2.64it/s]


Training loss: 4.576796054840088


Validation: 100%|██████████| 1/1 [00:00<00:00,  7.79it/s]


Validation loss: 4.02415132522583
Saved the best model!


Training Epoch 32: 100%|██████████| 1/1 [00:00<00:00,  2.11it/s]


Training loss: 4.568554878234863


Validation: 100%|██████████| 1/1 [00:00<00:00,  7.92it/s]


Validation loss: 3.9471068382263184
Saved the best model!


Training Epoch 33: 100%|██████████| 1/1 [00:00<00:00,  2.68it/s]


Training loss: 4.522022247314453


Validation: 100%|██████████| 1/1 [00:00<00:00,  8.64it/s]


Validation loss: 3.873081922531128
Saved the best model!


Training Epoch 34: 100%|██████████| 1/1 [00:00<00:00,  2.50it/s]


Training loss: 4.51061487197876


Validation: 100%|██████████| 1/1 [00:00<00:00,  7.48it/s]


Validation loss: 3.7959671020507812
Saved the best model!


Training Epoch 35: 100%|██████████| 1/1 [00:00<00:00,  2.63it/s]


Training loss: 4.391411304473877


Validation: 100%|██████████| 1/1 [00:00<00:00,  8.83it/s]


Validation loss: 3.728898525238037
Saved the best model!


Training Epoch 36: 100%|██████████| 1/1 [00:00<00:00,  2.36it/s]


Training loss: 4.37155818939209


Validation: 100%|██████████| 1/1 [00:00<00:00,  7.83it/s]


Validation loss: 3.6616501808166504
Saved the best model!


Training Epoch 37: 100%|██████████| 1/1 [00:00<00:00,  2.74it/s]


Training loss: 4.415462493896484


Validation: 100%|██████████| 1/1 [00:00<00:00,  8.05it/s]


Validation loss: 3.608078718185425
Saved the best model!


Training Epoch 38: 100%|██████████| 1/1 [00:00<00:00,  2.27it/s]


Training loss: 4.302008628845215


Validation: 100%|██████████| 1/1 [00:00<00:00,  7.99it/s]


Validation loss: 3.5603151321411133
Saved the best model!


Training Epoch 39: 100%|██████████| 1/1 [00:00<00:00,  2.70it/s]


Training loss: 4.202866554260254


Validation: 100%|██████████| 1/1 [00:00<00:00,  7.59it/s]


Validation loss: 3.5122416019439697
Saved the best model!


Training Epoch 40: 100%|██████████| 1/1 [00:00<00:00,  2.43it/s]


Training loss: 4.139654636383057


Validation: 100%|██████████| 1/1 [00:00<00:00,  6.43it/s]


Validation loss: 3.4679274559020996
Saved the best model!


Training Epoch 41: 100%|██████████| 1/1 [00:00<00:00,  2.75it/s]


Training loss: 3.9372239112854004


Validation: 100%|██████████| 1/1 [00:00<00:00,  8.22it/s]


Validation loss: 3.4276230335235596
Saved the best model!


Training Epoch 42: 100%|██████████| 1/1 [00:00<00:00,  2.73it/s]


Training loss: 3.9654245376586914


Validation: 100%|██████████| 1/1 [00:00<00:00,  7.93it/s]


Validation loss: 3.3925280570983887
Saved the best model!


Training Epoch 43: 100%|██████████| 1/1 [00:00<00:00,  2.73it/s]


Training loss: 3.986928701400757


Validation: 100%|██████████| 1/1 [00:00<00:00,  8.15it/s]


Validation loss: 3.358520030975342
Saved the best model!


Training Epoch 44: 100%|██████████| 1/1 [00:00<00:00,  2.76it/s]


Training loss: 3.883028030395508


Validation: 100%|██████████| 1/1 [00:00<00:00,  8.22it/s]


Validation loss: 3.32786226272583
Saved the best model!


Training Epoch 45: 100%|██████████| 1/1 [00:00<00:00,  2.64it/s]


Training loss: 3.8939642906188965


Validation: 100%|██████████| 1/1 [00:00<00:00,  6.57it/s]


Validation loss: 3.29994797706604
Saved the best model!


Training Epoch 46: 100%|██████████| 1/1 [00:00<00:00,  2.66it/s]


Training loss: 3.972655773162842


Validation: 100%|██████████| 1/1 [00:00<00:00,  7.32it/s]


Validation loss: 3.276191234588623
Saved the best model!


Training Epoch 47: 100%|██████████| 1/1 [00:00<00:00,  2.82it/s]


Training loss: 3.8651556968688965


Validation: 100%|██████████| 1/1 [00:00<00:00,  7.90it/s]


Validation loss: 3.2558603286743164
Saved the best model!


Training Epoch 48: 100%|██████████| 1/1 [00:00<00:00,  2.71it/s]


Training loss: 3.9538562297821045


Validation: 100%|██████████| 1/1 [00:00<00:00,  7.71it/s]


Validation loss: 3.2382471561431885
Saved the best model!


Training Epoch 49: 100%|██████████| 1/1 [00:00<00:00,  2.58it/s]


Training loss: 3.888011932373047


Validation: 100%|██████████| 1/1 [00:00<00:00,  8.15it/s]


Validation loss: 3.222177028656006
Saved the best model!


Training Epoch 50: 100%|██████████| 1/1 [00:00<00:00,  2.75it/s]


Training loss: 3.8464417457580566


Validation: 100%|██████████| 1/1 [00:00<00:00,  8.09it/s]


Validation loss: 3.207167148590088
Saved the best model!


Training Epoch 51: 100%|██████████| 1/1 [00:00<00:00,  2.69it/s]


Training loss: 3.615452766418457


Validation: 100%|██████████| 1/1 [00:00<00:00,  6.66it/s]


Validation loss: 3.1931121349334717
Saved the best model!


Training Epoch 52: 100%|██████████| 1/1 [00:00<00:00,  2.63it/s]


Training loss: 3.685314178466797


Validation: 100%|██████████| 1/1 [00:00<00:00,  6.97it/s]


Validation loss: 3.179657459259033
Saved the best model!


Training Epoch 53: 100%|██████████| 1/1 [00:00<00:00,  2.58it/s]


Training loss: 3.5990021228790283


Validation: 100%|██████████| 1/1 [00:00<00:00,  7.01it/s]


Validation loss: 3.167884349822998
Saved the best model!


Training Epoch 54: 100%|██████████| 1/1 [00:00<00:00,  2.14it/s]


Training loss: 3.759942054748535


Validation: 100%|██████████| 1/1 [00:00<00:00,  7.96it/s]


Validation loss: 3.1571643352508545
Saved the best model!


Training Epoch 55: 100%|██████████| 1/1 [00:00<00:00,  2.77it/s]


Training loss: 3.7579505443573


Validation: 100%|██████████| 1/1 [00:00<00:00,  8.15it/s]


Validation loss: 3.1477248668670654
Saved the best model!


Training Epoch 56: 100%|██████████| 1/1 [00:00<00:00,  2.60it/s]


Training loss: 3.6756649017333984


Validation: 100%|██████████| 1/1 [00:00<00:00,  5.51it/s]


Validation loss: 3.139347553253174
Saved the best model!


Training Epoch 57: 100%|██████████| 1/1 [00:00<00:00,  2.70it/s]


Training loss: 3.7738795280456543


Validation: 100%|██████████| 1/1 [00:00<00:00,  7.56it/s]


Validation loss: 3.1323094367980957
Saved the best model!


Training Epoch 58: 100%|██████████| 1/1 [00:00<00:00,  2.70it/s]


Training loss: 3.644618272781372


Validation: 100%|██████████| 1/1 [00:00<00:00,  5.49it/s]


Validation loss: 3.1267738342285156
Saved the best model!


Training Epoch 59: 100%|██████████| 1/1 [00:00<00:00,  2.56it/s]


Training loss: 3.670654535293579


Validation: 100%|██████████| 1/1 [00:00<00:00,  7.90it/s]


Validation loss: 3.1228561401367188
Saved the best model!


Training Epoch 60: 100%|██████████| 1/1 [00:00<00:00,  2.72it/s]


Training loss: 3.5584776401519775


Validation: 100%|██████████| 1/1 [00:00<00:00,  7.95it/s]


Validation loss: 3.120920181274414
Saved the best model!


In [12]:
import torch
from transformers import BertTokenizer
import numpy as np

def predict(model, tokenizer, prompt, intent_labels, slot_labels, max_len, device):
    model.eval()
    
    # Tokenize input
    inputs = tokenizer.encode_plus(
        prompt,
        None,
        add_special_tokens=True,
        return_attention_mask=True,
        return_tensors='pt',
        return_token_type_ids=True,
        padding='max_length',
        max_length=max_len,
        truncation=True
    )
    
    input_ids = inputs['input_ids'].to(device)
    token_type_ids = inputs['token_type_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    
    with torch.no_grad():
        slot_predictions, intent_logits = model(input_ids, token_type_ids, attention_mask)
    
    # Get intent prediction
    intent_pred = torch.argmax(intent_logits, dim=1).item()
    intent_label = intent_labels[intent_pred]
    
    # Get slot predictions
    slot_predictions = slot_predictions[0]  # since batch size is 1
    slot_labels_pred = [slot_labels[slot] for slot in slot_predictions]
    
    # Decode token ids to words
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    
    # Filter out special tokens
    tokens = [token for token, mask in zip(tokens, attention_mask[0]) if mask == 1]
    slot_labels_pred = slot_labels_pred[:len(tokens)]
    
    return intent_label, list(zip(tokens, slot_labels_pred))

# Example usage
if __name__ == "__main__":
    # Assuming the model and tokenizer have been initialized as in the previous code
    model.load_state_dict(torch.load("best_model.pt"))
    model.to(device)

    # Example prompt
    prompt = "Book a flight from New York to Los Angeles next Monday."

    # Predict
    intent_label, token_slot_pairs = predict(model, tokenizer, prompt, intent_labels, slot_labels, config["max_len"], device)

    print(f"Intent: {intent_label}")
    print("Token - Slot pairs:")
    for token, slot in token_slot_pairs:
        print(f"{token}: {slot}")


Intent: intent1
Token - Slot pairs:
[CLS]: O
book: O
a: O
flight: O
from: O
new: O
york: O
to: O
los: O
angeles: O
next: O
monday: O
.: O
[SEP]: O
