In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data/dataset.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 391 entries, 0 to 390
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   391 non-null    int64 
 1   url          391 non-null    object
 2   title        391 non-null    object
 3   description  390 non-null    object
 4   cat1         391 non-null    object
 5   cat2         318 non-null    object
 6   cat3         135 non-null    object
dtypes: int64(1), object(6)
memory usage: 21.5+ KB


In [4]:
df.head()

 # Remplacer les NaN par des chaînes vides
df = df.fillna('')

# Concaténer les colonnes en une seule
df['combined'] = df.apply(lambda row: '_'.join([row['cat1'], row['cat2'], row['cat3']]), axis=1)

# Appliquer le one-hot encoding sur la colonne combinée
one_hot_combined = df['combined'].str.get_dummies(sep='_')

y = one_hot_combined.to_numpy()

In [5]:
y = one_hot_combined.to_numpy()

#import train_test_split
from sklearn.model_selection import train_test_split

X = df['title'] + ' ' + df['description']

# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [6]:
import numpy as np
import tensorflow as tf
from transformers import TFCamembertModel, CamembertTokenizer
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

# Charger le tokenizer et le modèle Camembert
tokenizer = CamembertTokenizer.from_pretrained("jplu/tf-camembert-base")
camembert_model = TFCamembertModel.from_pretrained("jplu/tf-camembert-base")

for layer in camembert_model.layers:
    layer.trainable = False

# Définir les cibles (nb_targets) comme le nombre de colonnes dans y
nb_targets = y.shape[1]

# Définir la longueur maximale de la séquence
max_seq_length = 128

# Créer une classe de couche personnalisée pour encapsuler Camembert
class CamembertEmbeddingLayer(tf.keras.layers.Layer):
    def __init__(self, camembert_model, **kwargs):
        super(CamembertEmbeddingLayer, self).__init__(**kwargs)
        self.camembert_model = camembert_model

    def call(self, inputs):
        input_ids, attention_mask = inputs
        outputs = self.camembert_model(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.last_hidden_state

# Définir les entrées du modèle
input_ids = Input(shape=(max_seq_length,), dtype=tf.int32, name="input_ids")
attention_mask = Input(shape=(max_seq_length,), dtype=tf.int32, name="attention_mask")

# Utiliser la couche personnalisée pour obtenir les embeddings
embeddings = CamembertEmbeddingLayer(camembert_model)([input_ids, attention_mask])

# Utiliser le premier token (CLS) comme représentation de la séquence
cls_token = embeddings[:, 0, :]

# Ajouter une couche dense intermédiaire
intermediate_layer = Dense(32, activation="relu")(cls_token)

# Ajouter la couche de sortie avec une activation sigmoid pour multi-label classification
output = Dense(nb_targets, activation="sigmoid")(intermediate_layer)

# Créer le modèle
model = Model(inputs=[input_ids, attention_mask], outputs=output)

# Compiler le modèle avec la loss binary_crossentropy
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Fonction pour encoder les textes
def encode_texts(texts, tokenizer, max_seq_length):
    input_ids = []
    attention_masks = []
    for text in texts:
        encoded = tokenizer(text, return_tensors="tf", truncation=True, padding="max_length", max_length=max_seq_length)
        input_ids.append(encoded["input_ids"][0].numpy())
        attention_masks.append(encoded["attention_mask"][0].numpy())
    return np.array(input_ids), np.array(attention_masks)

# Encoder les données textuelles d'entraînement et de test
X_train_nlp_encoded, X_train_attention_masks = encode_texts(X_train, tokenizer, max_seq_length=max_seq_length)
X_test_nlp_encoded, X_test_attention_masks = encode_texts(X_test, tokenizer, max_seq_length=max_seq_length)

model.summary()

  from .autonotebook import tqdm as notebook_tqdm
Some layers from the model checkpoint at jplu/tf-camembert-base were not used when initializing TFCamembertModel: ['lm_head']
- This IS expected if you are initializing TFCamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFCamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFCamembertModel were initialized from the model checkpoint at jplu/tf-camembert-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFCamembertModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 128)]                0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 128)]                0         []                            
 )                                                                                                
                                                                                                  
 camembert_embedding_layer   (None, 128, 768)             1106219   ['input_ids[0][0]',           
 (CamembertEmbeddingLayer)                                52         'attention_mask[0][0]']      
                                                                                              

In [7]:
# Entraîner le modèle
model.fit([X_train_nlp_encoded, X_train_attention_masks], y_train, epochs=50, batch_size=32, validation_data=([X_test_nlp_encoded, X_test_attention_masks], y_test))

# Évaluer le modèle
model.evaluate([X_test_nlp_encoded, X_test_attention_masks], y_test)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


[0.24946370720863342, 0.16326530277729034]

In [8]:
test1 = 'Brocante de Valenciennes - Ceci est une brocante de meubles et de vêtements'


to_test = encode_texts(texts=test1, tokenizer=tokenizer, max_seq_length=max_seq_length)

proba = model.predict(to_test)



In [33]:
import numpy as np

# Fonction pour encoder un seul texte
def encode_single_text(text, tokenizer, max_seq_length):
    encoded = tokenizer(text, return_tensors="tf", truncation=True, padding="max_length", max_length=max_seq_length)
    input_ids = encoded["input_ids"]
    attention_mask = encoded["attention_mask"]
    return input_ids, attention_mask

# Exemple de texte à tester
test1 = 'balade au musée du Louvre - Venez découvrir les plus belles œuvres du Louvre'

# Encoder le texte
input_ids, attention_mask = encode_single_text(test1, tokenizer, max_seq_length)

# Faire la prédiction
proba = model.predict([input_ids, attention_mask])

# Convertir les probabilités en un tableau numpy
proba = np.squeeze(proba)  # Assurez-vous que la dimension est correcte

# Obtenir les indices des 3 plus grandes probabilités
top_3_indices = np.argsort(proba)[-3:][::-1]

# Afficher les indices des 3 classes avec les plus grandes probabilités
print("Indices des 3 plus grandes probabilités :", top_3_indices)

# Afficher les valeurs des 3 plus grandes probabilités
top_3_probabilities = proba[top_3_indices]
print("Valeurs des 3 plus grandes probabilités :", top_3_probabilities)


Indices des 3 plus grandes probabilités : [23  7 20]
Valeurs des 3 plus grandes probabilités : [0.38642192 0.23983611 0.14258905]


In [34]:
one_hot_combined.columns[top_3_indices]

Index(['Visite', 'Culture', 'Spectacle'], dtype='object')