In [1]:
# Common and Infer 
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import CamembertTokenizer, CamembertForSequenceClassification
import pickle
import warnings
warnings.simplefilter("ignore")


# Raise pandas' display limits so wide/long frames are rendered without truncation.
_DISPLAY_OPTIONS = {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 2000,
}
for _option_name, _option_value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)



class CFG:
    """Static settings shared by the cells of this notebook."""

    # Relative directories for the input data and the saved model artefacts.
    base_path = "../../data/"
    save_dir = "../../models/cambert/"

    # File stem of the dataset (presumably the CSV read by `load_data` — TODO confirm).
    dataset_name = "category_data_unique_title"

    # Sub-directory of `save_dir` from which the model and tokenizer are loaded.
    model_name = "camembert-base"

    # File stem of the pickled label encoder (`<save_dir>/<encoder_name>.pkl`).
    encoder_name = "label_encoder"

    # Tokenizer sequence length: inputs are padded/truncated to this many tokens.
    max_length = 40

    # Settings left disabled in this notebook (they look like training
    # hyper-parameters — TODO confirm before re-enabling):
    # batch_size = 64
    # learning_rate = 2e-5
    # epochs = 15
    # do_monitoring = True

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_data(base_path, dataset_name):
    """Read ``<base_path>/<dataset_name>.csv`` into a DataFrame.

    Args:
        base_path: Directory that contains the CSV file.
        dataset_name: File name without the ``.csv`` extension.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with its default options.
    """
    csv_path = f'{base_path}/{dataset_name}.csv'
    return pd.read_csv(csv_path)

def preprocess_data(df):
    """Add an integer-encoded version of the ``Nature`` column.

    The frame is modified in place: a ``Nature_encoded`` column is written
    onto ``df`` itself, and that same object is returned.

    Args:
        df: DataFrame with a ``Nature`` column holding the category labels.

    Returns:
        tuple: ``(df, encoder)`` where ``encoder`` is the ``LabelEncoder``
        fitted on ``df['Nature']``.
    """
    encoder = LabelEncoder()
    encoded_labels = encoder.fit_transform(df['Nature'])
    df['Nature_encoded'] = encoded_labels
    return df, encoder

def compute_class_weights(df):
    """Compute 'balanced' class weights ordered by encoded class id.

    ``compute_class_weight`` returns one weight per entry of ``classes``, in
    the order in which ``classes`` is given. The classes are therefore passed
    in ascending order so that position ``i`` of the result is the weight of
    the ``i``-th smallest encoded label. When every encoded class ``0..n-1``
    occurs in ``df``, that means ``weights[i]`` is the weight of class ``i``.

    Args:
        df: DataFrame with a ``Nature_encoded`` column of integer class ids.

    Returns:
        torch.Tensor: 1-D float tensor with one weight per distinct value of
        ``df['Nature_encoded']``, sorted by class id.
    """
    encoded_labels = df['Nature_encoded']
    # np.unique returns the distinct labels sorted. Series.unique() would
    # return them in order of first appearance, which misaligns the weight
    # vector with the class indices.
    sorted_classes = np.unique(encoded_labels)
    class_weights = compute_class_weight('balanced', classes=sorted_classes, y=encoded_labels)
    return torch.tensor(class_weights, dtype=torch.float)

def split_data(df, test_size=0.2, random_state=42):
    """Split product titles and encoded labels into train and test parts.

    Args:
        df: DataFrame with the ``Libellé produit`` (text) and
            ``Nature_encoded`` (label) columns.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.

    Returns:
        Whatever ``train_test_split`` returns for the two columns, i.e. the
        train/test pieces of the texts followed by those of the labels.
    """
    texts = df['Libellé produit']
    labels = df['Nature_encoded']
    return train_test_split(
        texts,
        labels,
        test_size=test_size,
        random_state=random_state,
    )

def encode_texts(texts, tokenizer, max_length=CFG.max_length):
    """Tokenize a collection of texts to a fixed length.

    Args:
        texts: Object exposing ``.tolist()`` (e.g. a pandas Series of strings).
        tokenizer: Tokenizer exposing ``batch_encode_plus``.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        The encoding returned by ``tokenizer.batch_encode_plus``, with the
        attention mask requested.
    """
    encode_options = dict(
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
    )
    sentences = texts.tolist()
    return tokenizer.batch_encode_plus(sentences, **encode_options)

In [3]:
#Inference
class CamembertPredictor:
    """Predict category labels for product texts with a CamemBERT classifier.

    Loads a sequence-classification model, its tokenizer and a pickled label
    encoder, then offers prediction for a single text, a list of texts, or a
    DataFrame column.
    """

    def __init__(self, model_path, tokenizer_path, label_encoder_path, max_length=CFG.max_length):
        """Load the model, tokenizer and label encoder.

        Args:
            model_path: Passed to ``CamembertForSequenceClassification.from_pretrained``.
            tokenizer_path: Passed to ``CamembertTokenizer.from_pretrained``.
            label_encoder_path: Path of a pickle file holding an object with an
                ``inverse_transform`` method (a fitted label encoder is expected).
            max_length: Length every input is padded or truncated to.
        """
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.max_length = max_length

        self.model = CamembertForSequenceClassification.from_pretrained(model_path)
        self.tokenizer = CamembertTokenizer.from_pretrained(tokenizer_path)
        # SECURITY: pickle.load can execute arbitrary code contained in the
        # file. Only point label_encoder_path at files you produced yourself.
        with open(label_encoder_path, "rb") as f:
            self.le = pickle.load(f)

        self.model.to(self.device)
        # Inference only: put the module in evaluation mode.
        self.model.eval()

    def _predict_label_ids(self, encoding):
        """Run one forward pass and return the arg-max class id of each row as a list of ints."""
        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)

        with torch.no_grad():
            outputs = self.model(input_ids, attention_mask=attention_mask)
        return torch.argmax(outputs.logits, dim=1).tolist()

    def predict_category_single(self, text):
        """Return the decoded category label predicted for one text."""
        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        prediction = self._predict_label_ids(encoding)[0]
        return self.le.inverse_transform([prediction])[0]

    def predict_category_batch(self, texts, batch_size=32):
        """Return the decoded category label predicted for each text.

        Texts are tokenized and classified ``batch_size`` at a time, so a
        collection of ``n`` texts costs about ``n / batch_size`` forward passes
        instead of ``n``.

        Args:
            texts: Iterable of strings.
            batch_size: Number of texts per forward pass; must be >= 1.

        Returns:
            list: One decoded label per input text, in input order. An empty
            input gives an empty list.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        texts = list(texts)
        label_ids = []
        for start in range(0, len(texts), batch_size):
            # Same tokenizer settings as predict_category_single, so each row
            # gets the same padded input it would get on its own.
            encoding = self.tokenizer.batch_encode_plus(
                texts[start:start + batch_size],
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )
            label_ids.extend(self._predict_label_ids(encoding))

        if not label_ids:
            return []
        return list(self.le.inverse_transform(label_ids))

    def predict_category_dataset(self, df, text_column='Libellé produit', batch_size=32):
        """Add a ``Predicted_Nature`` column with the prediction for every row.

        The column is written onto ``df`` itself (the caller's frame is
        modified) and the same object is returned.

        Args:
            df: DataFrame containing ``text_column``.
            text_column: Name of the column holding the texts to classify.
            batch_size: Number of rows classified per forward pass.

        Returns:
            The input ``df``, now carrying the ``Predicted_Nature`` column.
        """
        df['Predicted_Nature'] = self.predict_category_batch(
            df[text_column].tolist(), batch_size=batch_size
        )
        return df

In [4]:
# The model and the tokenizer are loaded from the same directory under CFG.save_dir.
model_dir = f'{CFG.save_dir}/{CFG.model_name}'
encoder_file = f'{CFG.save_dir}/{CFG.encoder_name}.pkl'
predictor = CamembertPredictor(
    model_path=model_dir,
    tokenizer_path=model_dir,
    label_encoder_path=encoder_file,
)


# Example usage
sample_text = "Table à manger verre quadro transparent blanc"
predicted_category = predictor.predict_category_single(sample_text)
print(f"Predicted category for single text: {predicted_category}")

# Example Dataset
category_csv = f'{CFG.base_path}/category_data.csv'
test = pd.read_csv(category_csv)
sample_test = test.sample(n=1000, random_state=42)
# predict_category_dataset writes 'Predicted_Nature' onto the frame it is given
# and returns that frame, so predicted_df and sample_test are the same object.
predicted_df = predictor.predict_category_dataset(sample_test)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Predicted category for single text: table


In [5]:
# Flag the rows whose predicted label differs from the existing 'Nature' label.
original_labels = predicted_df['Nature']
model_labels = predicted_df['Predicted_Nature']
predicted_df['Differently_Categorized'] = original_labels.ne(model_labels)

# Recategorize the entire dataset: every row takes the model's prediction.
predicted_df['Recategorized_Nature'] = model_labels

In [6]:
# Print statistics
mismatch_flags = predicted_df['Differently_Categorized']
print("Number of differently categorized items:", mismatch_flags.sum())
print("Percentage of differently categorized items: {:.2f}%".format(mismatch_flags.mean() * 100))

# Display some examples of recategorized items
print("\nExamples of recategorized items:")
comparison_columns = ['Libellé produit', 'Nature', 'Recategorized_Nature']
predicted_df.loc[mismatch_flags, comparison_columns]



Number of differently categorized items: 99
Percentage of differently categorized items: 9.90%

Examples of recategorized items:


Unnamed: 0,Libellé produit,Nature,Recategorized_Nature
384591,simpur relax matelas 140x190 therapy carbone a...,meuble a chaussures,matelas
181838,lampadaire,lampadaire liseuse,lampadaire
125016,aspirateur balai rechargeable,aspirateur balai,mini aspirateur
14989,austin canape 3 places convertible ouverture ...,canape droit,friteuse
147427,destructeur d odeur citron,souffleur aspirateur,senteur
852,saladier avec couverts oiseaux blanc,rangement a poser,vaisselle
465748,tiroir de rangement,rangement enfant,boite de rangement
75659,canape fixe 2 places en tissu,friteuse,canape droit
64620,canape d angle reversible convertible 4 places,friteuse,canape d'angle
247947,loge tout,buffet de cuisine,buffet
