# Intent classifier using BERT

## Import libraries

In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
import glob
import os
import joblib

## Load data

In [None]:
# Folder holding the pre-processed CSV files; each file is expected to
# provide a 'text' column and an 'intent' column.
folder_path = '../../../intent_classifier/data/pre_processed'

# glob returns paths in filesystem-dependent order; sort them so the row order
# of the concatenated frame (and therefore any seeded split of it) is
# reproducible from one machine to the next.
csv_files = sorted(glob.glob(os.path.join(folder_path, '*.csv')))
if not csv_files:
    # pd.concat([]) would fail with an opaque "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in {folder_path!r}")

df = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)

# Rows without a text or an intent cannot be tokenized / label-encoded,
# so drop them up front instead of failing later in the pipeline.
df = df.dropna(subset=['text', 'intent']).reset_index(drop=True)

texts = df['text']
labels = df['intent']


## Label Encoding

In [None]:
# Map every intent name to an integer class id and remember how many
# distinct classes there are (used to size the classification head).
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)
num_classes = len(label_encoder.classes_)

# Show the id -> intent name mapping for reference.
print("\nLabel Mapping:")
for i, label in zip(range(num_classes), label_encoder.classes_):
    print(f"{i}: {label}")

## Tokenizer and Model

In [None]:
# Tokenizer and classifier are loaded from the same pretrained checkpoint;
# the classifier gets one output label per encoded intent class.
pretrained_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_name)
model = BertForSequenceClassification.from_pretrained(
    pretrained_name,
    num_labels=num_classes,
)

## Dataset class

In [None]:
class IntentDataset(Dataset):
    """Torch dataset pairing tokenized texts with integer intent labels.

    All texts are tokenized once, up front, with the module-level
    ``tokenizer`` (``truncation=True``, ``padding=True``, ``max_length=64``).

    Args:
        texts: Iterable of strings (e.g. a list or a pandas Series).
        labels: Iterable of integer class ids, aligned with ``texts``.
    """

    def __init__(self, texts, labels):
        self.encodings = tokenizer(list(texts), truncation=True, padding=True, max_length=64)
        # Materialise the labels as a plain list so that __getitem__ always
        # indexes by *position*. A pandas Series with a shuffled index (for
        # example after train_test_split) would otherwise be looked up by
        # index label and raise KeyError or return the wrong row.
        self.labels = list(labels)

    def __getitem__(self, idx):
        """Return the tensors for sample ``idx`` plus its ``'labels'`` entry."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Classification targets are forced to int64 regardless of the
        # integer width of the incoming label container.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)


## Train/test split

In [None]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the
# split repeatable for a given row order.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    encoded_labels,
    test_size=0.2,
    random_state=42,
)

# Tokenize each partition into its own dataset (training set first).
train_dataset, test_dataset = (
    IntentDataset(X_train, y_train),
    IntentDataset(X_test, y_test),
)

## Dataloaders

In [None]:
# Training batches are reshuffled every epoch; evaluation batches keep
# the dataset order. Both use batches of 4 samples.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=4,
    shuffle=True,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=4,
)

## Optimizer

In [None]:
# AdamW over every model parameter with a learning rate of 1e-5, and
# cross-entropy between the model logits and the integer class ids.
loss_fn = CrossEntropyLoss()
optimizer = AdamW(
    model.parameters(),
    lr=1e-5,
)

## Training Loop

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU,
# and move the model's parameters onto the chosen device.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

def train(model, dataloader, optimizer, loss_fn):
    """Run one training pass over ``dataloader``.

    Each batch is a dict of tensors that contains a ``'labels'`` entry; the
    remaining entries are passed to the model as keyword arguments. Tensors
    are moved to the module-level ``device`` before the forward pass.

    Returns:
        The sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    batch_losses = []
    for batch in dataloader:
        optimizer.zero_grad()

        # Move the whole batch to the target device, then split off targets.
        features = {name: tensor.to(device) for name, tensor in batch.items()}
        targets = features.pop('labels')

        logits = model(**features).logits
        batch_loss = loss_fn(logits, targets)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(dataloader)


## Evaluation loop

In [None]:
def evaluate(model, dataloader, loss_fn):
    """Score ``model`` on ``dataloader`` without updating any weights.

    Each batch is a dict of tensors that contains a ``'labels'`` entry; the
    remaining entries are passed to the model as keyword arguments. Tensors
    are moved to the module-level ``device`` before the forward pass.

    Returns:
        A ``(mean_loss, accuracy)`` tuple: the summed per-batch losses
        divided by the number of batches, and the fraction of samples whose
        highest-scoring class matches the label.
    """
    model.eval()
    loss_sum = 0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch in dataloader:
            features = {name: tensor.to(device) for name, tensor in batch.items()}
            targets = features.pop('labels')

            logits = model(**features).logits
            loss_sum += loss_fn(logits, targets).item()

            # The predicted class is the index of the largest logit per row.
            predicted = logits.argmax(dim=1)
            n_correct += (predicted == targets).sum().item()
            n_seen += targets.size(0)

    accuracy = n_correct / n_seen
    mean_loss = loss_sum / len(dataloader)
    return mean_loss, accuracy

## Model Training

In [None]:
# Train for a fixed number of epochs, scoring the held-out set after each
# one and printing a short per-epoch report.
epochs = 5
for epoch in range(epochs):
    train_loss = train(model, train_dataloader, optimizer, loss_fn)
    val_loss, val_accuracy = evaluate(model, test_dataloader, loss_fn)

    print(
        f"\nEpoch {epoch+1}",
        f"  Train Loss: {train_loss:.4f}",
        f"  Val Loss:   {val_loss:.4f}",
        f"  Val Acc:    {val_accuracy:.2%}",
        sep="\n",
    )


## Predict Intent function

In [None]:
def predict_intent(query):
    """Return the intent label predicted for a single ``query`` string.

    Uses the module-level ``tokenizer``, ``model``, ``device`` and
    ``label_encoder``: the query is tokenized, run through the model in
    eval mode without gradient tracking, and the index of the largest logit
    is mapped back to its original label.
    """
    model.eval()

    encoded = tokenizer(query, return_tensors="pt", truncation=True, padding=True, max_length=64)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        logits = model(**encoded).logits

    class_index = logits.argmax(dim=1).item()
    decoded = label_encoder.inverse_transform([class_index])
    return decoded[0]

## Testing

In [None]:
# Smoke test: classify one sample query. The bare call on the last line
# is what makes the notebook display the predicted label.
test_query = "Create notes from this document"
predict_intent(test_query)


In [None]:
# Smoke test with a more loosely phrased query. The bare call on the last
# line is what makes the notebook display the predicted label.
test_query = "make it short prepare notes"
predict_intent(test_query)

## Model Saving

In [None]:
# Persist everything needed for inference into a single directory: the
# fine-tuned model, the tokenizer, and the fitted label encoder.
save_dir = "intent_model"

model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

# The label encoder is stored next to the model files via joblib.
joblib.dump(label_encoder, f"{save_dir}/label_encoder.pkl")

print("✅ Model, tokenizer, and label encoder saved to 'intent_model/'")
