In [2]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import XLMRobertaTokenizer, XLMRobertaModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

In [3]:
# clean data
df=pd.read_csv('/kaggle/input/dataset2/translated_dataframe.csv')
df.head()

Unnamed: 0,product_name,category,product_brand
0,suavizante,suavizante,perla
1,suavizante doypack 3 fresca primavera,suavizante,suavitel
2,leche tetrabrik chocolate o frutilla,leche,toni
3,lavadora semiautomatica doble tanque 15 kg,lavadoras,innova
4,"si el mundo es mejor, sabe mejor",galletas,colombina


In [4]:
df.shape

(35296, 3)

In [5]:
# Create id2label mapping
id2label = {i: label for i, label in enumerate(df['category'].unique())}
print(id2label)

{0: 'suavizante', 1: 'leche', 2: 'lavadoras', 3: 'galletas', 4: 'coffee', 5: 'jamón', 6: 'bombones', 7: 'queso', 8: 'drinks', 9: 'salchichas', 10: 'aceite de girasol', 11: 'cerveza', 12: 'chocolate', 13: 'aceite', 14: 'Margarine', 15: 'mozzarella', 16: 'yogurt', 17: 'broth', 18: 'mayonesa', 19: 'shampoo', 20: 'papel higiénico', 21: 'helados', 22: 'detergent', 23: 'cereales', 24: 'rice', 25: 'toallitas', 26: 'juegos', 27: 'deodorant', 28: 'compresas', 29: 'pan', 30: 'whisky', 31: 'smart tv', 32: 'couches', 33: 'eau', 34: 'poulet', 35: 'chocolats', 36: 'plats cuisinés', 37: 'desserts', 38: 'jouets', 39: 'vin', 40: 'jeux', 41: 'chips', 42: 'snacks', 43: 'Coke', 44: 'jambon', 45: 'livres', 46: 'boissons', 47: 'champagne', 48: 'fromage', 49: 'beurre', 50: 'saucisses', 51: 'alimentation', 52: 'lessive en capsules', 53: 'jus', 54: 'viande', 55: 'bonbons', 56: 'bière blonde', 57: 'saumon fumé', 58: 'yaourt', 59: 'biscuits', 60: 'foie gras de canard', 61: 'dentifrice', 62: 'Pizza', 63: 'vodka',

In [6]:
import pandas as pd

def load_data(data_file, id2label):
    texts = df[['product_name', 'product_brand']].apply(lambda x: ' product_brand: '.join(x.dropna()), axis=1).tolist()
    labels = [key for label in df['category'] for key, value in id2label.items() if value == label]
    return texts, labels

texts, labels = load_data(df, id2label)
print(texts)
print(labels)


['suavizante product_brand: perla', 'suavizante doypack 3 fresca primavera product_brand: suavitel', 'leche tetrabrik chocolate o frutilla product_brand: toni', 'lavadora semiautomatica doble tanque 15 kg product_brand: innova', 'si el mundo es mejor, sabe mejor product_brand: colombina', 'café gold doypack product_brand: nescafé', 'jamonada product_brand: juris', "bombones fei dun estuche product_brand: ta'riko", 'jamón sanduchero product_brand: e m butidos la vienesa', 'queso holandes 80g product_brand: el caserio', 'cola naranja product_brand: mas, cola gallito', 'salchichas perro caliente product_brand: la europea', 'bombones product_brand: noggy', 'atún lomitos en aceite girasol product_brand: don sancho', 'cerveza sixpack product_brand: coronita extra', "chocolate en polvo product_brand: ta'riko", 'galletas saladas product_brand: salticas', 'aceite product_brand: sabrosón', 'leche entera o semidescremada product_brand: vita', 'margarina 450g + margarina 225g product_brand: nature

In [7]:
length=len(id2label)
print(length)

320


In [8]:
def get_device():
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")
device=get_device()

class CategoryClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}


In [9]:
class XLMRobertaClassifier(nn.Module):
    def __init__(self, model_name, num_classes):
        super(XLMRobertaClassifier, self).__init__()
        self.roberta = XLMRobertaModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.roberta.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        logits = self.fc(x)
        return logits

In [10]:
def train(model, data_loader, optimizer, scheduler, device):
    model.train()
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

In [11]:
def evaluate(model, data_loader, device):
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
    return accuracy_score(actual_labels, predictions), classification_report(actual_labels, predictions)


In [12]:
# Set up parameters
model_name = 'xlm-roberta-base'
num_classes = length
max_length = 128
batch_size = 16
num_epochs = 4
learning_rate = 2e-5
test_size=0.2
random_state=42

In [13]:
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=test_size, random_state=random_state)

In [14]:
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
train_dataset = CategoryClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = CategoryClassificationDataset(val_texts, val_labels, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = XLMRobertaClassifier(model_name, num_classes).to(device)

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

In [16]:
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [17]:
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7a79bb2969e0>

In [18]:
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)


Epoch 1/4


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation Accuracy: 0.4528
              precision    recall  f1-score   support

           0       1.00      0.04      0.08        25
           1       0.50      0.78      0.61        41
           2       0.00      0.00      0.00        21
           3       0.45      0.84      0.59        69
           4       0.55      0.84      0.67        45
           5       0.36      0.69      0.48        39
           6       0.00      0.00      0.00        24
           7       0.64      0.90      0.75        99
           8       0.26      0.66      0.37        82
           9       0.80      0.84      0.82        19
          10       0.00      0.00      0.00        12
          11       0.80      0.89      0.84        96
          12       0.28      0.80      0.42        56
          13       0.00      0.00      0.00        14
          14       1.00      0.92      0.96        12
          15       0.55      0.93      0.69        30
          16       0.81      0.74      0.77        53

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation Accuracy: 0.6106
              precision    recall  f1-score   support

           0       0.48      0.88      0.62        25
           1       0.74      0.78      0.76        41
           2       0.95      0.86      0.90        21
           3       0.84      0.83      0.83        69
           4       0.57      0.89      0.70        45
           5       0.43      0.79      0.56        39
           6       0.95      0.83      0.89        24
           7       0.94      0.89      0.91        99
           8       0.35      0.70      0.47        82
           9       0.62      0.84      0.71        19
          10       0.00      0.00      0.00        12
          11       0.92      0.91      0.91        96
          12       0.34      0.93      0.50        56
          13       0.00      0.00      0.00        14
          14       1.00      0.92      0.96        12
          15       0.83      0.97      0.89        30
          16       0.78      0.72      0.75        53

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation Accuracy: 0.6643
              precision    recall  f1-score   support

           0       0.79      0.92      0.85        25
           1       0.84      0.78      0.81        41
           2       0.90      0.86      0.88        21
           3       0.89      0.83      0.86        69
           4       0.59      0.93      0.72        45
           5       0.41      0.77      0.53        39
           6       0.95      0.83      0.89        24
           7       0.95      0.89      0.92        99
           8       0.34      0.68      0.45        82
           9       0.73      0.84      0.78        19
          10       0.00      0.00      0.00        12
          11       0.93      0.93      0.93        96
          12       0.44      0.95      0.60        56
          13       0.00      0.00      0.00        14
          14       1.00      0.92      0.96        12
          15       0.94      0.97      0.95        30
          16       0.73      0.77      0.75        53

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
torch.save(model.state_dict(), "bert_classifier2.pth")

In [20]:
model.eval()

XLMRobertaClassifier(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
   

In [21]:
# Define the device (use GPU if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

XLMRobertaClassifier(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
   

In [None]:
# Instantiate your model
model = XLMRobertaTokenizer(model_name, 5)

# Load the saved state dictionary into the model
model.load_state_dict(torch.load('bert_classifier2.pth'))
model.eval()


In [None]:
def predict_category(text, model, tokenizer, device, id2label, max_length=128):
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)

        # Return the label corresponding to the predicted numerical ID from id2label
        return id2label[preds.item()]


In [None]:
text = "For Children Baby 3 Months+ Concentrated Liquid Strawberry Flavour 200mg/5ml 50ml,product_brand: Nurofen"
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)

predicted_label = predict_category(text, model, tokenizer, device, id2label)
print("Predicted label:", predicted_label)