In [1]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from transformers import BertModel

In [2]:
# Load the translated product dataset and preview its first rows.
# Note: this cell only reads the CSV; no cleaning is performed here.
df=pd.read_csv('/kaggle/input/dataset2-300-category/translated_dataframe.csv')
df.head()

Unnamed: 0,product_name,category,product_brand
0,suavizante,suavizante,perla
1,suavizante doypack 3 fresca primavera,suavizante,suavitel
2,leche tetrabrik chocolate o frutilla,leche,toni
3,lavadora semiautomatica doble tanque 15 kg,lavadoras,innova
4,"si el mundo es mejor, sabe mejor",galletas,colombina


In [3]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
df.shape

(35296, 3)

In [4]:
# Give every distinct value of the `category` column a consecutive integer id,
# numbered in the order the values are returned by `unique()`.
id2label = dict(enumerate(df['category'].unique()))
print(id2label)

{0: 'suavizante', 1: 'leche', 2: 'lavadoras', 3: 'galletas', 4: 'coffee', 5: 'jamón', 6: 'bombones', 7: 'queso', 8: 'drinks', 9: 'salchichas', 10: 'aceite de girasol', 11: 'cerveza', 12: 'chocolate', 13: 'aceite', 14: 'Margarine', 15: 'mozzarella', 16: 'yogurt', 17: 'broth', 18: 'mayonesa', 19: 'shampoo', 20: 'papel higiénico', 21: 'helados', 22: 'detergent', 23: 'cereales', 24: 'rice', 25: 'toallitas', 26: 'juegos', 27: 'deodorant', 28: 'compresas', 29: 'pan', 30: 'whisky', 31: 'smart tv', 32: 'couches', 33: 'eau', 34: 'poulet', 35: 'chocolats', 36: 'plats cuisinés', 37: 'desserts', 38: 'jouets', 39: 'vin', 40: 'jeux', 41: 'chips', 42: 'snacks', 43: 'Coke', 44: 'jambon', 45: 'livres', 46: 'boissons', 47: 'champagne', 48: 'fromage', 49: 'beurre', 50: 'saucisses', 51: 'alimentation', 52: 'lessive en capsules', 53: 'jus', 54: 'viande', 55: 'bonbons', 56: 'bière blonde', 57: 'saumon fumé', 58: 'yaourt', 59: 'biscuits', 60: 'foie gras de canard', 61: 'dentifrice', 62: 'Pizza', 63: 'vodka',

In [None]:
import pandas as pd

def load_data(data_file, id2label):
    """Build parallel lists of input texts and integer class ids.

    Args:
        data_file: Either a pandas DataFrame or a path/buffer readable by
            ``pd.read_csv``. It must provide the columns ``product_name``,
            ``product_brand`` and ``category``.
        id2label: Mapping from integer class id to category label.

    Returns:
        tuple[list[str], list[int]]: ``(texts, labels)`` of equal length.
        Each text is the product name and brand joined by
        ``' product_brand: '``; a missing name or brand is left out of the join.

    Raises:
        ValueError: If a row's category has no id in ``id2label``. Silently
            skipping such rows would leave ``labels`` shorter than ``texts``.
    """
    # Use the argument instead of relying on a notebook-global DataFrame.
    frame = data_file if isinstance(data_file, pd.DataFrame) else pd.read_csv(data_file)

    # Invert the mapping once so each row costs a single dict lookup instead
    # of a scan over every class.
    label2id = {label: idx for idx, label in id2label.items()}

    texts = [
        ' product_brand: '.join(str(part) for part in (name, brand) if pd.notna(part))
        for name, brand in zip(frame['product_name'], frame['product_brand'])
    ]

    labels = []
    for row_number, category in enumerate(frame['category']):
        try:
            labels.append(label2id[category])
        except (KeyError, TypeError):
            raise ValueError(
                f"Row {row_number}: category {category!r} has no id in id2label"
            ) from None
    return texts, labels

texts, labels = load_data(df, id2label)

# Each text needs exactly one label; a mismatch would silently pair texts with
# the wrong classes further down.
assert len(texts) == len(labels), f"{len(texts)} texts but {len(labels)} labels"

# Preview a handful of examples only; printing every entry floods the output.
print(texts[:5])
print(labels[:5])


In [None]:
# Number of distinct categories; reused later as the classifier's output size.
length=len(id2label)
print(length)

In [None]:
def get_device():
    """Return the CUDA device when one is available, otherwise the CPU device."""
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")

device = get_device()

class CategoryClassificationDataset(Dataset):
    """Dataset that pairs each text with its integer class label.

    Texts are tokenized one at a time when an item is requested, not up front.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        """Store the raw texts, their labels, the tokenizer and the pad/truncate length."""
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its flattened tensors plus label.

        Returns:
            dict with keys ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
        """
        raw_text = self.texts[idx]
        target = self.labels[idx]
        encoded = self.tokenizer(
            raw_text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        # The tokenizer output carries a leading batch axis; flatten drops it.
        sample = {name: encoded[name].flatten() for name in ('input_ids', 'attention_mask')}
        sample['label'] = torch.tensor(target)
        return sample

In [None]:
class BERTClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and one linear output layer."""

    def __init__(self, bert_model_name, num_classes):
        """Load the pretrained encoder and create a ``num_classes``-way head.

        Args:
            bert_model_name: Identifier passed to ``BertModel.from_pretrained``.
            num_classes: Number of output logits.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        hidden_size = self.bert.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the logits computed from the encoder's pooled output."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.fc(self.dropout(encoder_out.pooler_output))


In [None]:
def train(model, data_loader, optimizer, scheduler, device):
    """Run one training epoch over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns class logits.
        data_loader: Iterable of batches, each a dict with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch, after the
            optimizer.
        device: Device the batch tensors are moved to.

    Returns:
        float: Mean cross-entropy loss over the batches seen, or ``nan`` when
        the loader yielded no batches. Callers may ignore the return value.
    """
    model.train()
    # Build the loss module once; it does not change between batches.
    criterion = nn.CrossEntropyLoss()
    running_loss = 0.0
    batches_seen = 0
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        running_loss += loss.item()
        batches_seen += 1
    return running_loss / batches_seen if batches_seen else float("nan")

In [None]:
def evaluate(model, data_loader, device):
    """Score ``model`` on every batch of ``data_loader`` without gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns class logits.
        data_loader: Iterable of batches, each a dict with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
        device: Device the model inputs are moved to.

    Returns:
        tuple: ``(accuracy, report)`` as produced by ``accuracy_score`` and
        ``classification_report``.
    """
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions.extend(outputs.argmax(dim=1).cpu().tolist())
            # Labels are only compared on the host, so skip the device round trip.
            actual_labels.extend(batch['label'].tolist())
    accuracy = accuracy_score(actual_labels, predictions)
    # zero_division=0 reports 0.0 for classes that were never predicted, as
    # before, but without one UndefinedMetricWarning per such class.
    report = classification_report(actual_labels, predictions, zero_division=0)
    return accuracy, report

In [None]:
# Set up parameters
# NOTE(review): the category labels printed earlier include accented
# Spanish/French text; confirm an uncased checkpoint is the intended choice.
bert_model_name = 'bert-base-uncased'
num_classes = length        # one output logit per distinct category
max_length = 128            # every example is padded/truncated to this many tokens
batch_size = 16
num_epochs = 4
learning_rate = 2e-5
test_size=0.2               # fraction of examples held out for validation
random_state=42

# Seed PyTorch's RNG so that DataLoader shuffling and the classifier head's
# initial weights are repeatable across runs, as far as PyTorch's RNG decides
# them. The trailing semicolon suppresses the returned Generator's repr.
torch.manual_seed(random_state);

In [None]:
# Hold out a `test_size` fraction of the examples for validation; the fixed
# `random_state` keeps the split the same from run to run.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=test_size,
    random_state=random_state,
)

In [None]:
from kaggle_secrets import UserSecretsClient
# Read the secret named "HF_TOKEN" through the Kaggle secrets client.
# NOTE(review): HF_TOKEN is not referenced by any cell visible here; confirm
# whether this lookup is still needed.
user_secrets = UserSecretsClient()
HF_TOKEN = user_secrets.get_secret("HF_TOKEN")

In [14]:
# Tokenizer matching the pretrained checkpoint used by the classifier.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

# Wrap each split in a dataset that tokenizes on access.
train_dataset = CategoryClassificationDataset(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
    max_length=max_length,
)
val_dataset = CategoryClassificationDataset(
    texts=val_texts,
    labels=val_labels,
    tokenizer=tokenizer,
    max_length=max_length,
)

# Only the training batches are shuffled; validation keeps its original order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [15]:
# Resolve the device through the shared helper rather than repeating the
# CUDA-availability check inline, then build the classifier on that device.
device = get_device()
model = BERTClassifier(bert_model_name, num_classes).to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [16]:
# Use PyTorch's AdamW: the `AdamW` class exported by transformers is deprecated.
# eps and weight_decay are pinned to the values the transformers class used by
# default, so the optimizer hyperparameters stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)

# The scheduler is stepped once per batch, so the schedule spans every batch
# of every epoch; with zero warmup steps the decay starts immediately.
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)




In [17]:
# Display the loader object itself (only its repr is shown; no batch is drawn).
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x79de50834610>

In [18]:
# One pass over the training data per epoch, followed by a validation report.
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}", report, sep="\n")

Epoch 1/4


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation Accuracy: 0.4266
              precision    recall  f1-score   support

           0       1.00      0.80      0.89        25
           1       0.38      0.80      0.51        41
           2       1.00      0.86      0.92        21
           3       0.52      0.84      0.64        69
           4       0.23      0.93      0.37        45
           5       0.74      0.64      0.68        39
           6       1.00      0.42      0.59        24
           7       0.65      0.89      0.75        99
           8       0.18      0.57      0.28        82
           9       1.00      0.58      0.73        19
          10       0.18      0.58      0.28        12
          11       0.77      0.92      0.83        96
          12       0.29      0.88      0.43        56
          13       0.00      0.00      0.00        14
          14       1.00      0.33      0.50        12
          15       0.31      0.90      0.47        30
          16       0.87      0.75      0.81        53

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation Accuracy: 0.6285
              precision    recall  f1-score   support

           0       0.95      0.80      0.87        25
           1       0.52      0.83      0.64        41
           2       1.00      0.90      0.95        21
           3       0.97      0.86      0.91        69
           4       0.50      0.96      0.66        45
           5       0.59      0.77      0.67        39
           6       0.95      0.83      0.89        24
           7       0.96      0.89      0.92        99
           8       0.27      0.65      0.38        82
           9       0.94      0.89      0.92        19
          10       0.86      0.50      0.63        12
          11       0.93      0.97      0.95        96
          12       0.44      0.96      0.60        56
          13       0.00      0.00      0.00        14
          14       0.92      0.92      0.92        12
          15       0.88      0.93      0.90        30
          16       0.84      0.77      0.80        53

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation Accuracy: 0.6820
              precision    recall  f1-score   support

           0       1.00      0.84      0.91        25
           1       0.81      0.85      0.83        41
           2       0.95      0.90      0.93        21
           3       0.92      0.87      0.90        69
           4       0.57      0.96      0.72        45
           5       0.48      0.82      0.61        39
           6       0.95      0.83      0.89        24
           7       0.94      0.90      0.92        99
           8       0.40      0.66      0.50        82
           9       0.89      0.84      0.86        19
          10       0.86      0.50      0.63        12
          11       0.94      0.97      0.95        96
          12       0.46      0.95      0.62        56
          13       0.00      0.00      0.00        14
          14       0.92      0.92      0.92        12
          15       0.85      0.93      0.89        30
          16       0.88      0.79      0.83        53

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
# Persist only the model's state dict (its weights), not the module object.
torch.save(model.state_dict(), "bert_classifier2.pth")

In [20]:
# Put the model in evaluation mode (its dropout layers become inactive);
# the cell output is the module's repr.
model.eval()

BERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [21]:
# Resolve the device (GPU if available) through the shared helper and make
# sure the model lives on it.
device = get_device()
model.to(device)

BERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

### Evaluating

In [22]:
# Rebuild the architecture with the same checkpoint name and class count that
# were used for training, so the saved weights fit.
model = BERTClassifier(bert_model_name, num_classes)

# Load the saved state dictionary into the model. map_location lets weights
# that were saved from a GPU be restored on whichever device is active now.
model.load_state_dict(torch.load('bert_classifier2.pth', map_location=device))

# A freshly built module lives on the CPU. predict_category sends its inputs
# to `device`, so the model must be moved there too or the forward pass fails
# with a device mismatch on GPU.
model.to(device)
model.eval()

BERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [23]:
def predict_category(text, model, tokenizer, device, id2label, max_length=128):
    """Predict the category label for a single piece of text.

    Args:
        text: Raw input string.
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns class logits.
        tokenizer: Callable that turns ``text`` into ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        device: Device the tokenized tensors are moved to.
        id2label: Mapping from predicted class id to label.
        max_length: Length the text is padded/truncated to.

    Returns:
        The ``id2label`` entry for the highest-scoring class.
    """
    model.eval()
    tokens = tokenizer(
        text,
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    ids = tokens['input_ids'].to(device)
    mask = tokens['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids=ids, attention_mask=mask)

    # Index of the largest logit in each row; a single text gives one row.
    best = logits.max(dim=1).indices
    return id2label[best.item()]

In [None]:
# Format the query the same way the training texts were built:
# "<product name> product_brand: <brand>" (space-separated, no comma).
text = "For Children Baby 3 Months+ Concentrated Liquid Strawberry Flavour 200mg/5ml 50ml product_brand: Nurofen"

# Load the tokenizer for the same checkpoint name the model was built with.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

predicted_label = predict_category(text, model, tokenizer, device, id2label)
print("Predicted label:", predicted_label)