In [1]:
# from bert_model_implementation_torch.model import BertModel
# from bert_model_implementation_torch.tokenization import  _is_control,_is_whitespace,_is_punctuation
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
import torch 
import torch.nn as nn
import numpy as np
import pandas as pd
import sys
from tqdm import tqdm
from keras.src.utils import to_categorical
from torch.utils.data import DataLoader
import shutil
import re

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [2]:
def mask_tokens(text: str):
    return re.sub(r'<[^>]+>', '[MASK]', text)

In [3]:
!mkdir ckpts experiments

In [5]:
data = pd.read_csv("./balanced_data.csv")

In [6]:
data.head()

Unnamed: 0,prompt,intent,classes
0,Create a spider chart comparing employee perfo...,Set trend line,charts
1,I need to divide the text in cell <Cell> by th...,Split text to columns,entry and manipulation
2,How do you hide formulas on Expenses,Display formulas,formatting
3,"Relative to customer, Adjust Pivot Table Sourc...",Change Pivot Table Source Data,pivot tables
4,Please strip away the formatting from these ce...,Delete format,formatting


In [7]:
import os
wd = os.getcwd()
config = {
    'max_len': 256,
    'batch_size': 8,
    'epochs': 10,
    'lr':1e-07,
    'out_first_layer': 768,
    'dropout_rate': 0.1,
    'model_dir':'bert-base-cased',
    'ckpt_path': os.path.join(wd, 'kaggle\\working\\ckpts'),
    'ckpt_model_path': os.path.join(wd, 'kaggle\\working\\experiments')
}

In [8]:
intent_list = data.intent.unique().tolist()

In [10]:
class IntentDataset:
    def __init__(self, df: pd.DataFrame, tokenizer: BertTokenizer, max_len: int):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.classes = df['intent'].unique().tolist()
        self.y = df['intent']
        self.x = df['prompt']
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        title = str(self.x[index])
        title = ''.join(title.split())
        title = mask_tokens(title)  # Assuming mask_tokens is a function defined elsewhere
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            return_attention_mask=True,
            return_tensors='pt',
            return_token_type_ids=True,
            padding='max_length',
            max_length=self.max_len,
            truncation=True
        )
        target = self.classes.index(self.y[index])  # Get the class index
        return {
            'input_ids': inputs["input_ids"].flatten(),
            'token_type_ids': inputs['token_type_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)  # Directly use class index
        }

In [11]:
train = data.sample(frac=0.9, random_state=200).reset_index(drop=True)
val = data.drop(train.index).reset_index(drop=True)

In [12]:
tokenizer = BertTokenizer.from_pretrained(config["model_dir"])
train_dataset = IntentDataset(train, tokenizer, config["max_len"])
val_dataset = IntentDataset(val, tokenizer, config["max_len"])

In [13]:
train_dataset.__getitem__(10)

{'input_ids': tensor([ 101,  100,  103, 1105,  102,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0, 

In [14]:
train_loader = DataLoader(
    train_dataset,
    shuffle = True,
    batch_size = config["batch_size"],
    num_workers = 0
    )

val_loader = DataLoader(
    val_dataset,
    shuffle = False,
    batch_size = config["batch_size"],
    num_workers = 0
    )

In [15]:
train_loader.dataset[10]['targets']


tensor(10)

In [16]:
len(train_loader.dataset[0]['input_ids'])


256

In [17]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [18]:
def load_ckpt(ckpt_path, model, optimizer):
    ckpt = torch.load(ckpt_path)
    model.load_state_dict(ckpt['state_dict'])
    optimizer.load_state_dict(ckpt['state_dict'])
    valid_loss_min = ckpt['valid_loss_min']
    return model, optimizer, ckpt['epoch'], valid_loss_min.item()

In [19]:
def save_ckpt(state, is_best, ckpt_path, best_model_path):
    f_path= ckpt_path
    torch.save(state, f_path)
    if is_best:
        best_f_path = best_model_path
        shutil.copyfile(f_path, best_f_path)

In [20]:
class intent_model(nn.Module):
    def __init__(self, config: dict, intent_labels: list[str], dropout: float = 0.1):
        super(intent_model,self).__init__()
        self.config = config
        self.intent_labels = intent_labels
        self.dropout_rate = dropout
        self.bert = BertModel.from_pretrained(self.config["model_dir"])
        self.dropout = nn.Dropout(self.dropout_rate)
        self.layer_1 = nn.Linear(self.bert.config.hidden_size,self.config["out_first_layer"])
        self.activation_1 = nn.ReLU()
        self.layer_2 = nn.Linear(self.config["out_first_layer"], len(self.intent_labels))
        
    def forward(self, input_ids: torch.Tensor, token_type_ids: torch.Tensor, attention_mask: torch.Tensor):
        output = self.bert(input_ids, token_type_ids, attention_mask)
        output_dropout = self.dropout(output.pooler_output)
        out_layer_1 = self.layer_1(output_dropout)
        act_1 = self.activation_1(out_layer_1)
        out_layer_2 = self.layer_2(act_1)
        return out_layer_2
        

In [21]:
model = intent_model(config, intent_list, 0.1)
model.to(device)
def loss_fn(outputs, targets):
    targets = targets.float()
    return nn.CrossEntropyLoss()(outputs, targets)

In [22]:
loss_fn = nn.CrossEntropyLoss()

In [23]:


optimizer = torch.optim.Adam(params = model.parameters(), lr = config["lr"])

In [24]:
# checkpoint = {
#     'epoch': 10 +1,
#     'valid_loss_min': 0,
#     'state_dict': model.state_dict(),
#     'optimizer': optimizer.state_dict()
# }
# save_ckpt(checkpoint, False, config["ckpt_path"], config["ckpt_model_path"])

In [26]:

def train(model, epochs, train_loader, val_loader, optimizer, ckpt_path, best_model_path):
    valid_loss_min = np.Inf
    for epoch in range(1, epochs + 1):
        train_loss = 0
        val_loss = 0
        model.train()
        for batch_index, batch in tqdm(enumerate(train_loader), total=len(train_loader)):
            input_ids = batch['input_ids'].to(device, dtype=torch.long)
            attention_mask = batch["attention_mask"].to(device, dtype=torch.long)
            token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
            targets = batch["targets"].to(device, dtype=torch.long)
            
            optimizer.zero_grad()
            outputs = model(input_ids, token_type_ids, attention_mask)
            loss = loss_fn(outputs, targets)
            loss.backward()
            optimizer.step()
            
            train_loss += (1 / (batch_index + 1)) * (loss.item() - train_loss)
        print(f"Epoch {epoch} ended with train loss of {train_loss}")
        
        model.eval()
        with torch.no_grad():
            for batch_index, batch in tqdm(enumerate(val_loader), total=len(val_loader)):
                input_ids = batch['input_ids'].to(device, dtype=torch.long)
                attention_mask = batch["attention_mask"].to(device, dtype=torch.long)
                token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
                targets = batch["targets"].to(device, dtype=torch.long)
                
                outputs = model(input_ids, token_type_ids, attention_mask)
                loss = loss_fn(outputs, targets)
                
                val_loss += (1 / (batch_index + 1)) * (loss.item() - val_loss)
        print(f"Epoch {epoch} ended with val loss of {val_loss}")
        
        checkpoint = {
            'epoch': epoch + 1,
            'valid_loss_min': val_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        save_ckpt(checkpoint, False, ckpt_path, best_model_path)  # Assuming save_ckpt is defined elsewhere

    return model
model = train(model, config["epochs"], train_loader, val_loader, optimizer, config["ckpt_path"], config["ckpt_model_path"])

  3%|▎         | 121/3589 [07:18<3:29:19,  3.62s/it]


KeyboardInterrupt: 

In [None]:
import os
import subprocess
from IPython.display import FileLink, display
import torch.nn.functional as F
def download_file(path, download_file_name):
    os.chdir('/kaggle/working/')
    zip_name = f"/kaggle/working/{download_file_name}.zip"
    command = f"zip {zip_name} {path} -r"
    result = subprocess.run(command, shell=True, capture_output=True, text=True)
    if result.returncode != 0:
        print("Unable to run zip command!")
        print(result.stderr)
        return
    display(FileLink(f'{download_file_name}.zip'))


In [None]:
def predict_intent(model, text: str, tokenizer: BertTokenizer):
        # Tokenize input text
        text = mask_tokens(text)
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
        input_ids = inputs['input_ids']
        token_type_ids = inputs['token_type_ids']
        attention_mask = inputs['attention_mask']
        
        # Move tensors to the appropriate device
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        input_ids = input_ids.to(device)
        token_type_ids = token_type_ids.to(device)
        attention_mask = attention_mask.to(device)
        
        # Make predictions
        model.eval()  # Set the model to evaluation mode
        with torch.no_grad():
            logits = model(input_ids, token_type_ids, attention_mask)
        
        # Convert logits to probabilities
        probs = F.softmax(logits, dim=1)
        
        # Get the predicted label
        predicted_label_idx = torch.argmax(probs, dim=1).item()
        print(predicted_label_idx,torch.max(probs, dim=1))
        predicted_label = model.intent_labels[predicted_label_idx]
        
        return predicted_label, probs[0].cpu().numpy()

In [None]:
predict_intent(model, "sdavbasjdvasdkvbajsdjv sdcsdvsddsgsdfs", tokenizer)

In [None]:
download_file("./", "ckpt")