In [1]:
import os
import gc
import tensorflow as tf
import transformers
from transformers import TFBertForSequenceClassification, BertTokenizer

from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
import numpy as np
import pandas as pd
import pickle

import mlflow

2024-11-08 10:39:10.176190: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-08 10:39:10.178670: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-08 10:39:10.186171: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731062350.199020  132273 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731062350.202741  132273 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-08 10:39:10.215019: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

In [None]:
# Point the MLflow client at the tracking server on this machine (port 5000).
MLFLOW_TRACKING_URI = "http://localhost:5000"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [3]:
# Show the library versions so the environment can be checked at a glance
version_report = (
    ("TensorFlow version:", tf.__version__),
    ("Transformers version:", transformers.__version__),
)
for version_label, version_value in version_report:
    print(version_label, version_value)

TensorFlow version: 2.18.0
Transformers version: 4.46.2


In [4]:
# Reload the DataFrame from its pickle file.
# NOTE: unpickling can execute arbitrary code; only load pickle files from a
# trusted source.
sample_pickle_path = 'df_sample.pkl'
df_sample = pd.read_pickle(sample_pickle_path)
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 786516 entries, 367103 to 1380859
Data columns (total 6 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   target  786516 non-null  int64 
 1   ids     786516 non-null  int64 
 2   date    786516 non-null  object
 3   flag    786516 non-null  object
 4   user    786516 non-null  object
 5   text    786516 non-null  object
dtypes: int64(2), object(4)
memory usage: 42.0+ MB


In [5]:
# Check that the DataFrame has at least 16,000 rows
if len(df_sample) < 16000:
    raise ValueError("Le DataFrame contient moins de 16 000 lignes.")

# Fraction of the frame that 16,000 rows represent. Kept for reference only:
# the split below is driven by the exact row count instead.
sample_size = 16000 / len(df_sample)

if len(df_sample) == 16000:
    # Already at the target size (for example when this cell is re-run after
    # df_sample has been replaced at the bottom of the cell). train_test_split
    # refuses a split that leaves one side empty, so keep the frame as is.
    df_16000 = df_sample
else:
    # Stratified draw of exactly 16,000 rows. train_size is given as an integer
    # row count: a float fraction is multiplied back by the number of rows and
    # floored by scikit-learn, so rounding error could return 15,999 rows.
    df_16000, _ = train_test_split(
        df_sample,
        train_size=16000,
        stratify=df_sample['target'],
        random_state=42,
    )

# Report the number of rows kept and the class balance
print(f"Nombre d'échantillons conservés: {len(df_16000)}")
print(df_16000['target'].value_counts(normalize=True))  # class proportions

# Downstream cells read the reduced sample through the name df_sample
df_sample = df_16000

Nombre d'échantillons conservés: 16000
target
1    0.501062
0    0.498937
Name: proportion, dtype: float64


In [6]:

# Load the tokenizer and the sequence-classification model from the same
# pretrained checkpoint; the model is created with two output labels.
pretrained_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_name)
model = TFBertForSequenceClassification.from_pretrained(pretrained_name, num_labels=2)


2024-11-08 10:39:13.661566: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Model inputs and targets, taken from the sampled frame:
#   documents -> the 'text' column (texts to analyse)
#   labels    -> the 'target' column (0 or 1, binary classification)
documents, labels = df_sample['text'], df_sample['target']


In [8]:
# Tokenization of the input texts
def tokenize_data(documents, max_length=128):
    """Tokenize a collection of texts with the notebook-level ``tokenizer``.

    Args:
        documents: Texts to encode. Either an object exposing ``.tolist()``
            (such as the pandas Series used in this notebook), any other
            iterable of strings, or a single string.
        max_length: Value forwarded to the tokenizer as ``max_length``
            (together with ``truncation=True``). Defaults to 128.

    Returns:
        The object returned by ``tokenizer`` for the whole batch, requested
        with ``padding=True`` and ``return_tensors='tf'``.
    """
    # The tokenizer is always handed a plain Python list of strings.
    if isinstance(documents, str):
        texts = [documents]  # a lone string is one document, not characters
    elif hasattr(documents, "tolist"):
        texts = documents.tolist()
    else:
        texts = list(documents)

    return tokenizer(
        texts,
        max_length=max_length, padding=True, truncation=True, return_tensors='tf'
    )
# Encode every document of the sample in one call
tokens = tokenize_data(documents=documents)

In [9]:
# Convert the tokenizer outputs and the labels to NumPy arrays so they can be
# passed to train_test_split
input_ids_np, attention_masks_np = (
    tokens[field].numpy() for field in ('input_ids', 'attention_mask')
)
labels_np = np.array(labels)

In [10]:
# Split token ids, attention masks and labels into train / validation sets.
# All three arrays go through ONE train_test_split call so they are shuffled
# with the same permutation by construction, instead of relying on two
# separate calls happening to reproduce the same shuffle.
# NOTE(review): test_size=0.95 leaves only 5% of the rows for training --
# presumably deliberate to keep CPU training short; confirm.
(
    train_input_ids, val_input_ids,
    train_attention_masks, val_attention_masks,
    train_labels, val_labels,
) = train_test_split(
    input_ids_np, attention_masks_np, labels_np,
    test_size=0.95, random_state=42,
)


In [11]:
# Optimizer and loss function.
# from_logits=True because the training loop feeds the model's raw logits
# to the loss.
adam_learning_rate = 1e-5
optimizer = Adam(learning_rate=adam_learning_rate)
loss_fn = SparseCategoricalCrossentropy(from_logits=True)

In [12]:
# Training schedule: number of epochs and mini-batch size
epochs, batch_size = 5, 16

# Running accuracy metric, updated batch by batch in the training loop
train_accuracy_metric = tf.keras.metrics.SparseCategoricalAccuracy()

In [14]:
# MLflow configuration
experiment = mlflow.set_experiment("BERT")

# Number of optimisation steps in one epoch (ceiling division: the last batch
# may be shorter than batch_size).
steps_per_epoch = -(-len(train_input_ids) // batch_size)

with mlflow.start_run(run_name="BERT"):
    # Log the hyperparameters
    mlflow.log_param("model_name", "bert-base-uncased")
    mlflow.log_param("num_epochs", epochs)
    mlflow.log_param("batch_size", batch_size)
    # Must be kept in sync with the learning rate given to the optimizer.
    mlflow.log_param("learning_rate", 1e-5)

    # Training loop
    for epoch in range(epochs):
        print(f"\nEpoch {epoch + 1}/{epochs}")
        train_accuracy_metric.reset_state()  # accuracy restarts at every epoch
        epoch_loss = []  # per-batch losses of the current epoch

        for batch_index, i in enumerate(range(0, len(train_input_ids), batch_size)):
            # Slice out the current mini-batch
            batch_input_ids = train_input_ids[i:i + batch_size]
            batch_attention_masks = train_attention_masks[i:i + batch_size]
            batch_labels = train_labels[i:i + batch_size]

            with tf.GradientTape() as tape:
                # Forward pass
                outputs = model(
                    input_ids=batch_input_ids,
                    attention_mask=batch_attention_masks,
                    training=True
                )
                logits = outputs.logits

                # Loss on the raw logits
                loss = loss_fn(batch_labels, logits)

            # Compute and apply the gradients
            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

            # Update the running accuracy and accumulate the loss.
            # Plain Python floats are used for printing and MLflow logging.
            train_accuracy_metric.update_state(batch_labels, logits)
            batch_loss = float(loss.numpy())
            epoch_loss.append(batch_loss)

            # Report the loss and accuracy every 10 batches
            if batch_index % 10 == 0:
                train_accuracy = float(train_accuracy_metric.result().numpy())
                print(f"Batch {batch_index} - Loss: {batch_loss:.4f} - Accuracy: {train_accuracy:.4f}")
                # The step counts batches across ALL epochs; a per-epoch index
                # would make every epoch log onto the same step values.
                global_step = epoch * steps_per_epoch + batch_index
                mlflow.log_metric("batch_train_loss", batch_loss, step=global_step)
                mlflow.log_metric("batch_train_accuracy", train_accuracy, step=global_step)

        # Log the epoch-level metrics
        epoch_accuracy = float(train_accuracy_metric.result().numpy())
        epoch_loss_avg = float(np.mean(epoch_loss))
        print(f"Epoch {epoch + 1} - Loss: {epoch_loss_avg:.4f} - Accuracy: {epoch_accuracy:.4f}")
        mlflow.log_metric("epoch_train_loss", epoch_loss_avg, step=epoch)
        mlflow.log_metric("epoch_train_accuracy", epoch_accuracy, step=epoch)

    # Intended to release unused memory before the evaluation step
    gc.collect()
    tf.keras.backend.clear_session()

MlflowException: API request to http://mlflow-server:5000/api/2.0/mlflow/experiments/get-by-name failed with exception HTTPConnectionPool(host='mlflow-server', port=5000): Max retries exceeded with url: /api/2.0/mlflow/experiments/get-by-name?experiment_name=BERT (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7fc984461950>: Failed to resolve 'mlflow-server' ([Errno -2] Name or service not known)"))

In [None]:
# Evaluate the model on the validation set in mini-batches.
# A dedicated name is used so the training `batch_size` is not overwritten
# (re-running the training cell afterwards would otherwise train with 8).
eval_batch_size = 8  # smaller batches for evaluation
# Ceiling division so a trailing partial batch is evaluated too.
num_batches = -(-len(val_input_ids) // eval_batch_size)
all_predictions = []

for i in range(num_batches):
    # Slice out the current validation batch
    batch_start = i * eval_batch_size
    batch_input_ids = val_input_ids[batch_start:batch_start + eval_batch_size]
    batch_attention_masks = val_attention_masks[batch_start:batch_start + eval_batch_size]

    # Logits for the batch (inference mode)
    batch_logits = model(
        input_ids=batch_input_ids,
        attention_mask=batch_attention_masks,
        training=False
    ).logits

    # Predicted class = index of the largest logit
    batch_predictions = tf.argmax(batch_logits, axis=1)
    all_predictions.append(batch_predictions)

# Concatenate the per-batch predictions
all_predictions = tf.concat(all_predictions, axis=0)

# Accuracy over the whole validation set (every row now has a prediction)
accuracy = tf.reduce_mean(tf.cast(all_predictions == val_labels, dtype=tf.float32))
print(f"Validation Accuracy: {accuracy.numpy():.4f}")


def _log_validation_results():
    """Log the validation accuracy and the model to the active MLflow run."""
    mlflow.log_metric("val_accuracy", float(accuracy.numpy()))
    # NOTE(review): `model` is a transformers TF model; confirm that the MLflow
    # keras flavor can serialize it in this environment.
    mlflow.keras.log_model(model, "bert_model")


# The training run is closed when its `with` block exits, so logging here
# without an active run would make MLflow open a new, unrelated run. Resume
# the most recent run so these results land next to the training metrics.
if mlflow.active_run() is not None:
    _log_validation_results()
else:
    previous_run = mlflow.last_active_run()
    if previous_run is not None:
        resumed_run = mlflow.start_run(run_id=previous_run.info.run_id)
    else:
        # No earlier run in this session: fall back to a fresh one
        resumed_run = mlflow.start_run(run_name="BERT")
    with resumed_run:
        _log_validation_results()