In [1]:
# Common and Infer 
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import CamembertTokenizer, CamembertForSequenceClassification
import pickle
import warnings
warnings.simplefilter("ignore")

#Training Imports
from sklearn.metrics import classification_report
from transformers import AdamW, get_linear_schedule_with_warmup

#optional to be used for monitoring training
import wandb 
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
my_secret = user_secrets.get_secret("wandb_api_key")

class CFG:
    model_name = 'camembert-base'
    base_path = '/kaggle/input/ecommerce-recategorize/'
    dataset_name = 'category_data_unique_title'
    save_dir = '/kaggle/working/model/'
    encoder_name = 'label_encoder' #for saving the encoder
    batch_size = 8
    max_length = 40
    learning_rate = 2e-5
    epochs = 15
    do_monitoring = True

In [2]:
def load_data(base_path, dataset_name):
    df = pd.read_csv(f'{base_path}/{dataset_name}.csv')
    return df

def preprocess_data(df):
    le = LabelEncoder()
    df['Nature_encoded'] = le.fit_transform(df['Nature'])
    return df, le

def compute_class_weights(df):
    class_weights = compute_class_weight('balanced', classes=df['Nature_encoded'].unique(), y=df['Nature_encoded'])
    return torch.tensor(class_weights, dtype=torch.float)

def split_data(df, test_size=0.2, random_state=42):
    return train_test_split(df['Libellé produit'], df['Nature_encoded'], test_size=test_size, random_state=random_state)

def encode_texts(texts, tokenizer, max_length=CFG.max_length):
    return tokenizer.batch_encode_plus(
        texts.tolist(),
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_attention_mask=True
    )

In [3]:
# Trainer 
class CamembertTrainer:
    def __init__(self, model_name, base_path, dataset_name, batch_size=CFG.batch_size, max_length=CFG.max_length, do_monitoring=CFG.do_monitoring):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model_name = model_name
        self.batch_size = batch_size
        self.max_length = max_length
        
        self.df = load_data(base_path, dataset_name)
        self.df, self.le = preprocess_data(self.df)
        self.class_weights = compute_class_weights(self.df).to(self.device)
        
        self.X_train, self.X_test, self.y_train, self.y_test = split_data(self.df)
        
        self.tokenizer = CamembertTokenizer.from_pretrained(self.model_name)
        if self.tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            
        self.train_encodings = encode_texts(self.X_train, self.tokenizer, self.max_length)
        self.test_encodings = encode_texts(self.X_test, self.tokenizer, self.max_length)
        
        self.train_dataset = TensorDataset(
            torch.tensor(self.train_encodings['input_ids']),
            torch.tensor(self.train_encodings['attention_mask']),
            torch.tensor(self.y_train.tolist())
        )
        
        self.test_dataset = TensorDataset(
            torch.tensor(self.test_encodings['input_ids']),
            torch.tensor(self.test_encodings['attention_mask']),
            torch.tensor(self.y_test.tolist())
        )
        
        self.train_dataloader = DataLoader(self.train_dataset, sampler=RandomSampler(self.train_dataset), batch_size=self.batch_size)
        self.test_dataloader = DataLoader(self.test_dataset, sampler=SequentialSampler(self.test_dataset), batch_size=self.batch_size)
        
        self.model = CamembertForSequenceClassification.from_pretrained(self.model_name, num_labels=len(self.le.classes_))
        self.model.to(self.device)
        
        self.optimizer = AdamW(self.model.parameters(), lr=CFG.learning_rate)
        total_steps = len(self.train_dataloader) * CFG.epochs  # 15 epochs
        self.scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=int(total_steps * 0.1), num_training_steps=total_steps)
        
        self.do_monitoring = do_monitoring
        
        if self.do_monitoring:
            wandb.login(key=my_secret)
            wb_config = dict(max_length = CFG.max_length,learning_rate = CFG.learning_rate,epochs = CFG.epochs)
            wandb.init(project="camabert-product-categorization", entity="jztinchawda", config= wb_config)  
        
    def compute_loss(self, inputs, return_outputs=False):
        labels = inputs.pop("labels").to(self.device)
        outputs = self.model(**inputs)
        logits = outputs.logits.to(self.device)
        loss_fn = nn.CrossEntropyLoss(weight=self.class_weights)
        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss

    def evaluate(self, dataloader):
        self.model.eval()
        total_loss = 0
        total_correct = 0
        total_predictions = 0
        
        for batch in dataloader:
            batch = tuple(t.to(self.device) for t in batch)
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
            
            with torch.no_grad():
                loss, outputs = self.compute_loss(inputs, return_outputs=True)
            
            total_loss += loss.item()
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            total_correct += torch.sum(predictions == batch[2]).item()
            total_predictions += batch[2].size(0)
        
        avg_loss = total_loss / len(dataloader)
        accuracy = total_correct / total_predictions
        return avg_loss, accuracy
    
    def generate_classification_report(self):
        self.model.eval()
        predictions = []
        true_labels = []

        for batch in self.test_dataloader:
            batch = tuple(t.to(self.device) for t in batch)
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
            
            with torch.no_grad():
                outputs = self.model(**inputs)
            
            logits = outputs.logits
            predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
            true_labels.extend(batch[2].cpu().numpy())

        
        true_labels = np.array(true_labels)
        predictions = np.array(predictions)

        
        unique_labels = np.unique(true_labels)
        predicted_labels = np.unique(predictions)
        labels = np.unique(np.concatenate([unique_labels, predicted_labels]))

        
        print(classification_report(true_labels, predictions, labels=labels, target_names=self.le.classes_))
    
    def train(self, epochs=CFG.epochs, early_stopping_patience=3):
        best_val_loss = float('inf')
        patience_counter = 0
        
        for epoch in range(epochs):
            self.model.train()
            total_loss = 0
            total_correct = 0
            total_predictions = 0
            
            for batch in self.train_dataloader:
                batch = tuple(t.to(self.device) for t in batch)
                inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
                
                self.optimizer.zero_grad()
                loss, outputs = self.compute_loss(inputs, return_outputs=True)
                total_loss += loss.item()
                logits = outputs.logits
                predictions = torch.argmax(logits, dim=-1)
                total_correct += torch.sum(predictions == batch[2]).item()
                total_predictions += batch[2].size(0)
                
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                self.optimizer.step()
                self.scheduler.step()
            
            avg_train_loss = total_loss / len(self.train_dataloader)
            train_accuracy = total_correct / total_predictions
            
            val_loss, val_accuracy = self.evaluate(self.test_dataloader)
            
            print(f"Epoch {epoch+1}")
            print(f"Average training loss: {avg_train_loss:.4f} | Training accuracy: {train_accuracy:.4f}")
            print(f"Validation loss: {val_loss:.4f} | Validation accuracy: {val_accuracy:.4f}")
            
            if self.do_monitoring:
                wandb.log({
                'epoch': epoch + 1,
                'train_loss': avg_train_loss,
                'train_accuracy': train_accuracy,
                'val_loss': val_loss,
                'val_accuracy': val_accuracy
            })
            
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
                trainer.save_model(f'{CFG.save_dir}/{CFG.model_name}', f'{CFG.save_dir}/{CFG.model_name}', f'{CFG.save_dir}/{CFG.encoder_name}.pkl')
            else:
                patience_counter += 1
                print(f'Triggered Patience with {patience_counter}')
                if patience_counter >= early_stopping_patience:
                    print("Early stopping triggered.")
                    break
                    
        
        print("Training complete!")
        self.generate_classification_report()

    def save_model(self, model_path, tokenizer_path, label_encoder_path):
        self.model.save_pretrained(model_path)
        self.tokenizer.save_pretrained(tokenizer_path)
        
        with open(label_encoder_path, "wb") as f:
            pickle.dump(self.le, f)

In [4]:
trainer = CamembertTrainer(model_name=CFG.model_name, base_path=CFG.base_path, dataset_name=CFG.dataset_name)
trainer.train()


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjztinchawda[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.17.4
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240714_125231-v0iogrvi[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mwise-gorge-28[0m
[34m[1mwandb[0m: ⭐️ View project 

Epoch 1
Average training loss: 5.2849 | Training accuracy: 0.0388
Validation loss: 3.7981 | Validation accuracy: 0.1301
Epoch 2
Average training loss: 2.6259 | Training accuracy: 0.3126
Validation loss: 1.7688 | Validation accuracy: 0.5035
Epoch 3
Average training loss: 1.4124 | Training accuracy: 0.5658
Validation loss: 1.2068 | Validation accuracy: 0.6190
Epoch 4
Average training loss: 0.9721 | Training accuracy: 0.6510
Validation loss: 1.0115 | Validation accuracy: 0.6774
Epoch 5
Average training loss: 0.7547 | Training accuracy: 0.7023
Validation loss: 0.9188 | Validation accuracy: 0.7137
Epoch 6
Average training loss: 0.6155 | Training accuracy: 0.7427
Validation loss: 0.8691 | Validation accuracy: 0.7283
Epoch 7
Average training loss: 0.5035 | Training accuracy: 0.7750
Validation loss: 0.8502 | Validation accuracy: 0.7521
Epoch 8
Average training loss: 0.4171 | Training accuracy: 0.8014
Validation loss: 0.8502 | Validation accuracy: 0.7641
Triggered Patience with 1
Epoch 9
Averag

In [5]:
#Inference
class CamembertPredictor:
    def __init__(self, model_path, tokenizer_path, label_encoder_path, max_length=CFG.max_length):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.max_length = max_length
        
        self.model = CamembertForSequenceClassification.from_pretrained(model_path)
        self.tokenizer = CamembertTokenizer.from_pretrained(tokenizer_path)
        with open(label_encoder_path, "rb") as f:
            self.le = pickle.load(f)
        
        self.model.to(self.device)
        self.model.eval()

    def predict_category_single(self, text):
        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)
        
        with torch.no_grad():
            outputs = self.model(input_ids, attention_mask=attention_mask)
            prediction = torch.argmax(outputs.logits, dim=1).item()
        
        return self.le.inverse_transform([prediction])[0]

    def predict_category_dataset(self, df, text_column='Libellé produit'):
        df['Predicted_Nature'] = df[text_column].apply(self.predict_category_single)
        return df

In [6]:
predictor = CamembertPredictor(model_path=f'{CFG.save_dir}/{CFG.model_name}', tokenizer_path=f'{CFG.save_dir}/{CFG.model_name}', label_encoder_path=f'{CFG.save_dir}/{CFG.encoder_name}.pkl')


# Example usage
sample_text = "Table à manger verre quadro transparent blanc"
predicted_category = predictor.predict_category_single(sample_text)
print(f"Predicted category for single text: {predicted_category}")

# Example Dataset
test = pd.read_csv(f'{CFG.base_path}/category_data.csv') #takes more than 40min in GPU 5M Row
sample_test = test.sample(n=50, random_state=42)
predicted_df = predictor.predict_category_dataset(sample_test)
predicted_df.head(20)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Predicted category for single text: table


Unnamed: 0,Libellé produit,Nature,Predicted_Nature
178910,refrigerateur combine,refrigerateur,refrigerateur
64871,televiseur 80 cm hd led,tv ecran plat,tv ecran plat
384591,simpur relax matelas 140x190 therapy carbone a...,meuble a chaussures,matelas
3185,vidaxl tapis moderne design de cercles 120 x 1...,tapis de salon et ch,tapis de salon et ch
226294,simpur relax matelas 140x190 real confort epai...,matelas,matelas
161418,bureau coin angle en forme l table angle sello...,bureau,bureau
344816,vasagle coiffeuse avec miroir pliable table de...,coiffeuse,coiffeuse
146338,slide ensemble meubles tv unite murale style m...,meuble tv,meuble tv
476151,lave linge sechant ouverture hublot 8 5 kg,lave linge,lave linge
162177,matelas mousse 90x190 cm,matelas,matelas
