In [1]:
# Import necessary libraries
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, get_scheduler
from torch.utils.data import DataLoader, Dataset
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import torch.nn.functional as F

In [3]:
# Load and preprocess dataset
df = pd.read_csv('/content/sofmattress_train.csv')

In [4]:
# Label encoding
le = LabelEncoder()
df['encoded_labels'] = le.fit_transform(df['label'])

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    df['sentence'], df['encoded_labels'], test_size=0.2, random_state=42
)

In [5]:
# Define Dataset class
class IntentDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_len):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences.iloc[idx]
        label = self.labels.iloc[idx]
        encoding = self.tokenizer(
            sentence, truncation=True, padding='max_length', max_length=self.max_len, return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [6]:
# Load pre-trained model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(le.classes_))

# Prepare datasets and dataloaders
train_dataset = IntentDataset(X_train, y_train, tokenizer, max_len=32)
test_dataset = IntentDataset(X_test, y_test, tokenizer, max_len=32)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Optimizer, scheduler, and loss
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(train_loader)*20)
criterion = torch.nn.CrossEntropyLoss()

# Early stopping variables
best_loss = float('inf')
early_stop_count = 0
patience = 3

# Training loop
for epoch in range(20):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = (
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
            batch['label'].to(device),
        )
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # used clipping to preventing the exploding gradient problem
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Loss: {avg_loss:.4f}")

    # Validation step
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = (
                batch['input_ids'].to(device),
                batch['attention_mask'].to(device),
                batch['label'].to(device),
            )
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()
    val_loss /= len(test_loader)
    print(f"Validation Loss: {val_loss:.4f}")

    # Early stopping
    if val_loss < best_loss:
        best_loss = val_loss
        early_stop_count = 0
        torch.save(model.state_dict(), "best_model.pth")  # Saving the best model
    else:
        early_stop_count += 1
        if early_stop_count >= patience:
            print("Early stopping triggered.")
            break

# Load the best model for evaluation
model.load_state_dict(torch.load("best_model.pth"))

Epoch 1, Loss: 1.7014
Validation Loss: 1.5386
Epoch 2, Loss: 0.9718
Validation Loss: 1.1234
Epoch 3, Loss: 0.4987
Validation Loss: 0.8652
Epoch 4, Loss: 0.2393
Validation Loss: 0.7013
Epoch 5, Loss: 0.1120
Validation Loss: 0.7178
Epoch 6, Loss: 0.0536
Validation Loss: 0.6970
Epoch 7, Loss: 0.0321
Validation Loss: 0.6796
Epoch 8, Loss: 0.0215
Validation Loss: 0.6873
Epoch 9, Loss: 0.0171
Validation Loss: 0.7034
Epoch 10, Loss: 0.0139
Validation Loss: 0.7158
Early stopping triggered.


  model.load_state_dict(torch.load("best_model.pth"))


<All keys matched successfully>

In [14]:
# Evaluation
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = (
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
            batch['label'].to(device),
        )
        outputs = model(input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())


In [15]:
# Classification report
print(classification_report(all_labels, all_preds, target_names=le.classes_))

                       precision    recall  f1-score   support

100_NIGHT_TRIAL_OFFER       1.00      0.50      0.67         4
   ABOUT_SOF_MATTRESS       1.00      0.67      0.80         3
         CANCEL_ORDER       1.00      1.00      1.00         2
        CHECK_PINCODE       0.50      1.00      0.67         1
                  COD       0.67      1.00      0.80         2
           COMPARISON       1.00      1.00      1.00         1
    DELAY_IN_DELIVERY       0.00      0.00      0.00         2
         DISTRIBUTORS       1.00      0.88      0.93         8
                  EMI       1.00      0.80      0.89         5
        ERGO_FEATURES       1.00      1.00      1.00         4
             LEAD_GEN       0.75      0.75      0.75         4
        MATTRESS_COST       1.00      1.00      1.00         3
               OFFERS       1.00      1.00      1.00         3
         ORDER_STATUS       0.33      1.00      0.50         1
       ORTHO_FEATURES       1.00      1.00      1.00  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
# Displaing actual vs predicted labels
actual_labels = [le.classes_[label] for label in all_labels]
predicted_labels = [le.classes_[pred] for pred in all_preds]

comparison_df = pd.DataFrame({
    'Sentence': X_test.reset_index(drop=True),
    'Actual Label': actual_labels,
    'Predicted Label': predicted_labels
})

print(comparison_df.head(20))

                                      Sentence           Actual Label  \
0                                View products       PRODUCT_VARIANTS   
1   Will I get an option to Customise the size     SIZE_CUSTOMIZATION   
2                        Tell me about company     ABOUT_SOF_MATTRESS   
3                                   How to EMI                    EMI   
4                 Trial offer on customisation  100_NIGHT_TRIAL_OFFER   
5                What are the product variants       PRODUCT_VARIANTS   
6                         Do you sell pillows?                PILLOWS   
7                                Mattress cost          MATTRESS_COST   
8                  What does the mattress cost          MATTRESS_COST   
9                     100 Nights trial version  100_NIGHT_TRIAL_OFFER   
10                Is there any offline stores            DISTRIBUTORS   
11                       Are Pillows available                PILLOWS   
12                       Give me some discount     