In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m62.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m104.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m86.2 MB/s[0m eta [36m0:00:

In [90]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [91]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [92]:
# Load and preprocess your dataset (replace 'your_dataset.csv' with your data)
data = pd.read_csv('/content/data_with_predictions.csv')
data = data.dropna()
texts = data['commentaire'].values
labels = data['predicted_problems'].values  # Replace 'label' with the name of your label column
le = LabelEncoder()
labels = le.fit_transform(labels)


In [93]:
batch_size = 16  # or 32
MAX_LENGTH = 128  # Reduce the value


In [94]:
# Tokenize and pad the text data
input_ids = []
attention_masks = []
for text in texts:
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=MAX_LENGTH,
        padding='max_length',
        return_attention_mask=True,
        truncation=True
    )
    input_ids.append(encoded['input_ids'])
    attention_masks.append(encoded['attention_mask'])

input_ids = np.array(input_ids)
attention_masks = np.array(attention_masks)

In [95]:


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    input_ids,
    labels,
    test_size=0.2,
    random_state=42
)

In [96]:
# Define the model architecture
input_ids_input = tf.keras.layers.Input(shape=(MAX_LENGTH,), dtype=tf.int32, name='input_ids')
attention_masks_input = tf.keras.layers.Input(shape=(MAX_LENGTH,), dtype=tf.int32, name='attention_mask')
bert_output = model(input_ids_input, attention_mask=attention_masks_input)[0]
output = tf.keras.layers.Dense(6, activation='softmax')(bert_output)

model = tf.keras.Model(inputs=[input_ids_input, attention_masks_input], outputs=output)


In [97]:
# Compiler le modèle
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

In [None]:

# Train the model
history = model.fit(
    [X_train, X_train],  # Input data
    y_train,             # Labels
    epochs=4,       # Number of training epochs
    batch_size=16,  # Batch size
    validation_data=([X_test, X_test], y_test),  # Validation data
    verbose=1            # Verbosity mode
)


Epoch 1/4

In [88]:
# Evaluate the model
loss, accuracy = model.evaluate([X_test, X_test], y_test, verbose=0)
print(f'Test loss: {loss:.4f}')
print(f'Test accuracy: {accuracy:.4f}')

Test loss: nan
Test accuracy: 0.0000


In [89]:
# Define your comment
new_comment = "Trop mauvais service,trop de bruit."

# Tokenize and pad the comment
encoded_comment = tokenizer.encode_plus(
    new_comment,
    add_special_tokens=True,
    max_length=MAX_LENGTH,
    padding='max_length',
    return_attention_mask=True,
    truncation=True
)

input_ids_comment = np.array([encoded_comment['input_ids']])
attention_masks_comment = np.array([encoded_comment['attention_mask']])
# Make predictions
predictions = model.predict([input_ids_comment, attention_masks_comment])

# Convert the predictions to class indices
predicted_class_index = np.argmax(predictions, axis=1)
predicted_class_label = le.inverse_transform(predicted_class_index)[0]
print("Comment:", new_comment)
print("Predicted Class Label:", predicted_class_label)


Comment: Trop mauvais service,trop de bruit.
Predicted Class Label: ["Problèmes d'Équipements et d'Installations", 'Bruits', 'Problèmes de Prix', 'Retard']
