# Notebook pour l'approche supervisée
Ce notebook contient la génération de trois modèles, BoW, Word2Vec et use sentence encoder


In [30]:
import os
import joblib
import logging
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import mlflow
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
import tensorflow_hub as hub
from dotenv import load_dotenv


In [31]:

# Variables d'environnement et configuration MLflow
load_dotenv()
tracking_uri = "https://mlflowp51-975919512217.us-central1.run.app"
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment("Text_Processing_Experiment")

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "f8bc1d91ca98.json"


2024/10/13 00:31:36 INFO mlflow.tracking.fluent: Experiment with name 'Text_Processing_Experiment' does not exist. Creating a new experiment.


In [32]:

def load_mlflow_artifact(artifact_path):
    local_path = mlflow.artifacts.download_artifacts(artifact_path)
    with open(local_path, 'rb') as f:
        return joblib.load(f)

# X et y sauvegardés dans MLflow
X_reduced = load_mlflow_artifact("mlflow_artifacts/X_reduced.pkl")
X_word2vec = load_mlflow_artifact("mlflow_artifacts/X_word2vec.pkl")
X_use_np = load_mlflow_artifact("mlflow_artifacts/X_use_embeddings.pkl")
y = load_mlflow_artifact("mlflow_artifacts/y.pkl")

print("Les artefacts X et y ont été chargés avec succès depuis MLflow.")


Les artefacts X et y ont été chargés avec succès depuis MLflow.


In [33]:

def load_model_objects():
    with open('mlflow_artifacts/vectorizer.pkl', 'rb') as f:
        vectorizer = joblib.load(f)
    with open('mlflow_artifacts/svd.pkl', 'rb') as f:
        svd = joblib.load(f)
    with open('mlflow_artifacts/top_tags.pkl', 'rb') as f:
        top_tags = joblib.load(f)
    return vectorizer, svd, top_tags

vectorizer, svd, top_tags = load_model_objects()


In [34]:

def build_model(input_shape, output_shape):
    model = Sequential([
    Dense(256, activation='relu', input_shape=(input_shape,)),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(output_shape, activation='sigmoid') 
    ])
    model.compile(loss='binary_crossentropy',
                  optimizer='adam', metrics=['accuracy'])
    return model


In [35]:

def train_model(model, X_train, y_train, X_val, y_val, epochs=10):
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    history = model.fit(X_train, y_train, epochs=epochs, batch_size=32,
                        validation_data=(X_val, y_val), callbacks=[early_stopping])
    return history


In [36]:

def evaluate_model(model, X_test, y_test):
    loss, accuracy = model.evaluate(X_test, y_test)
    print(f"Loss: {loss}, Accuracy: {accuracy}")
    return loss, accuracy


In [37]:

def transform_text_to_bow(text, vectorizer, svd):
    X_bow = vectorizer.transform([text])
    return svd.transform(X_bow)


In [38]:

def transform_text_to_word2vec(text, word2vec_model):
    tokens = text.split()
    word_vectors = [word2vec_model.wv[word] for word in tokens if word in word2vec_model.wv]
    if word_vectors:
        return np.mean(word_vectors, axis=0).reshape(1, -1)  
    else:
        return np.zeros((1, 300)) 
# Réenregistrer le modèle Word2Vec pour garantir la compatibilité
def reload_and_save_word2vec_model(model_path, new_model_path):
    # Charge l'ancien modèle
    word2vec_model = Word2Vec.load(model_path)

    # Sauvegarde le modèle avec la version actuelle de gensim
    word2vec_model.save(new_model_path)

    print(f"Le modèle Word2Vec a été rechargé et sauvegardé sous: {new_model_path}")
    return word2vec_model

# Appele la fonction pour réenregistrer le modèle
new_word2vec_model_path = "mlflow_artifacts/new_word2vec_model.model"
word2vec_model = reload_and_save_word2vec_model("mlflow_artifacts/word2vec_model.model", new_word2vec_model_path)


Le modèle Word2Vec a été rechargé et sauvegardé sous: mlflow_artifacts/new_word2vec_model.model


In [39]:

def transform_text_to_use(text):
    try:
        use_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4", trainable=False)
        input_tensor = tf.convert_to_tensor([text], dtype=tf.string)
        embeddings = use_layer(input_tensor)
        embeddings_np = embeddings.numpy()
        logging.info(f"Embeddings générés pour le texte '{text}': {embeddings_np}")
        return embeddings_np
    except Exception as e:
        logging.error(f"Erreur lors de la transformation du texte en embeddings USE : {str(e)}")
        raise e



### Model BOW avec entrée données traitées par BoW + SVD

In [40]:

# Modèle BoW + SVD
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)

model_bow_svd = build_model(X_train.shape[1], y_train.shape[1])

# Entraînement et évaluation du modèle BoW + SVD
history_bow_svd = train_model(model_bow_svd, X_train, y_train, X_test, y_test)
loss_bow_svd, accuracy_bow_svd = evaluate_model(model_bow_svd, X_test, y_test)


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.1140 - loss: 0.4680 - val_accuracy: 0.3597 - val_loss: 0.2335
Epoch 2/10
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.2987 - loss: 0.2538 - val_accuracy: 0.4494 - val_loss: 0.1936
Epoch 3/10
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.3873 - loss: 0.2094 - val_accuracy: 0.4654 - val_loss: 0.1770
Epoch 4/10
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4216 - loss: 0.1965 - val_accuracy: 0.4859 - val_loss: 0.1687
Epoch 5/10
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4431 - loss: 0.1804 - val_accuracy: 0.5125 - val_loss: 0.1616
Epoch 6/10
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4689 - loss: 0.1740 - val_accuracy: 0.5171 - val_loss: 0.1581
Epoch 7/10
[1m165/165[0m [32m━━━━━━━

In [41]:

# Suivi dans MLflow
with mlflow.start_run(run_name="BoW+SVD Model"):
    mlflow.log_param("mlflow_artifacts", "BoW+SVD")
    mlflow.log_metric("loss", loss_bow_svd)
    mlflow.log_metric("accuracy", accuracy_bow_svd)

    # Sauvegarde du modèle en .h5
    model_bow_svd_path = os.path.join("mlflow_artifacts", 'bow_svd_model.h5')
    model_bow_svd.save(model_bow_svd_path)
    
    # Suivi dans MLflow
    mlflow.keras.log_model(model=model_bow_svd, artifact_path="bow_svd_model")
    mlflow.log_artifact(model_bow_svd_path)  # Enregistre le fichier .h5 dans MLflow

Exception: Run with UUID 632c8cb2004f4f76aac4701f30b3e6d0 is already active. To start a new run, first end the current run with mlflow.end_run(). To start a nested run, call start_run with nested=True

In [24]:

save_path = "mlflow_artifacts"

# Charge le modèle Word2Vec correctement
word2vec_model = Word2Vec.load("mlflow_artifacts/word2vec_model.model")

# Modèle Word2Vec
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(X_word2vec, y, test_size=0.2, random_state=42)

model_word2vec = build_model(input_shape=300, output_shape= y_train_w2v.shape[1])

# Entraînement et évaluation du modèle Word2Vec
history_word2vec = train_model(model_word2vec, X_train_w2v, y_train_w2v, X_test_w2v, y_test_w2v)
loss_word2vec, accuracy_word2vec = evaluate_model(model_word2vec, X_test_w2v, y_test_w2v)


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m412/412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.2342 - loss: 0.2832 - val_accuracy: 0.5204 - val_loss: 0.1491
Epoch 2/10
[1m412/412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.4461 - loss: 0.1661 - val_accuracy: 0.5410 - val_loss: 0.1374
Epoch 3/10
[1m412/412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.4958 - loss: 0.1531 - val_accuracy: 0.5395 - val_loss: 0.1333
Epoch 4/10
[1m412/412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5022 - loss: 0.1487 - val_accuracy: 0.5519 - val_loss: 0.1311
Epoch 5/10
[1m412/412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5160 - loss: 0.1432 - val_accuracy: 0.5559 - val_loss: 0.1279
Epoch 6/10
[1m412/412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5340 - loss: 0.1400 - val_accuracy: 0.5565 - val_loss: 0.1285
Epoch 7/10
[1m412/412[0m [32m━━━━━━━

In [25]:
with mlflow.start_run(run_name="Word2Vec Model"):
    mlflow.log_param("mlflow_artifacts", "Word2Vec")
    mlflow.log_metric("loss", loss_word2vec)
    mlflow.log_metric("accuracy", accuracy_word2vec)

    # Sauvegarde du modèle en .h5
    model_word2vec_path = os.path.join("mlflow_artifacts", 'word2vec_model.h5')
    model_word2vec.save(model_word2vec_path)
    
    # Suivi dans MLflow
    mlflow.keras.log_model(model=model_word2vec, artifact_path="word2vec_model")
    mlflow.log_artifact(model_word2vec_path)


2024/10/12 19:51:48 INFO mlflow.tracking._tracking_service.client: 🏃 View run Word2Vec Model at: https://mlflowp51-975919512217.us-central1.run.app/#/experiments/1/runs/973cbf58b07a4f5bba9262a8cf6cdd62.
2024/10/12 19:51:48 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflowp51-975919512217.us-central1.run.app/#/experiments/1.


In [26]:

# Modèle Universal Sentence Encoder (USE)
X_train_use, X_test_use, y_train_use, y_test_use = train_test_split(X_use_np, y, test_size=0.2, random_state=42)

# Charge le modèle USE
use_model = tf.keras.models.load_model("mlflow_artifacts/use_model.h5", compile=False)

# Recompile le modèle avec un nouvel optimiseur
use_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Évaluation du modèle USE (sans réentraîner)
loss_use, accuracy_use = evaluate_model(use_model, X_test_use, y_test_use)


[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 588us/step - accuracy: 0.2050 - loss: 0.4720
Loss: 0.4719560742378235, Accuracy: 0.20413123071193695


In [27]:

# Suivi dans MLflow
with mlflow.start_run(run_name="USE Model"):
    mlflow.log_param("model", "USE (Chargé et réévalué)")
    mlflow.log_metric("loss", loss_use)
    mlflow.log_metric("accuracy", accuracy_use)

    # Sauvegarde du modèle en .h5
    model_use_path = os.path.join("mlflow_artifacts", 'use_model.h5')
    use_model.save(model_use_path)
    
    # Suivi dans MLflow
    mlflow.keras.log_model(model=use_model, artifact_path="use_model")
    mlflow.log_artifact(model_use_path)


2024/10/12 19:51:55 INFO mlflow.tracking._tracking_service.client: 🏃 View run USE Model at: https://mlflowp51-975919512217.us-central1.run.app/#/experiments/1/runs/2987612a9e9e4347ae90b4b3ba26d355.
2024/10/12 19:51:55 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflowp51-975919512217.us-central1.run.app/#/experiments/1.


In [28]:

def predict_tags(model, vector, top_tags, threshold=0.01):
    predictions = model.predict(vector)

    # Associe les tags avec les probabilités
    tag_probabilities = [(top_tags[i], float(predictions[0][i])) for i in range(len(top_tags))]
    
    # Trie les tags par probabilité décroissante
    sorted_tag_probabilities = sorted(tag_probabilities, key=lambda x: x[1], reverse=True)
    
    # Applique le seuil de probabilité pour filtrer les tags
    predicted_tags = [tag for tag, prob in sorted_tag_probabilities if prob >= threshold]
    
    logging.info(f"Tags triés par probabilité décroissante: {sorted_tag_probabilities}")
    
    return predicted_tags


In [29]:

# Prédiction avec BoW + SVD
new_question_text = "python"
new_question_vector_bow = transform_text_to_bow(new_question_text, vectorizer, svd)
predicted_tags_bow_svd = predict_tags(model_bow_svd, new_question_vector_bow, top_tags)
print("BoW+SVD Model - Suggested Tags:", predicted_tags_bow_svd)

# Prédiction avec Word2Vec
new_question_text_w2v = "javascript tutorial"
new_question_vector_w2v = transform_text_to_word2vec(new_question_text_w2v, word2vec_model)
predicted_tags_w2v = predict_tags(model_word2vec, new_question_vector_w2v, top_tags)
print("Word2Vec Model - Suggested Tags:", predicted_tags_w2v)

# Prédiction avec USE
new_question_text_use = "how to learn machine learning"
new_question_vector_use = transform_text_to_use(new_question_text_use)
predicted_tags_use = predict_tags(use_model, new_question_vector_use, top_tags)
print("USE Model - Suggested Tags:", predicted_tags_use)

# Chemin global pour sauvegarder les artefacts des modèles
save_path = '../mlruns/artifacts'
os.makedirs(save_path, exist_ok=True)

# Sauvegarde des modèles et des artefacts
with open(os.path.join(save_path, 'X_use_embeddings.pkl'), 'wb') as f:
    joblib.dump(X_use_np, f)
mlflow.log_artifact(os.path.join(save_path, 'X_use_embeddings.pkl'))

with open(os.path.join(save_path, 'X_word2vec.pkl'), 'wb') as f:
    joblib.dump(X_word2vec, f)
mlflow.log_artifact(os.path.join(save_path, 'X_word2vec.pkl'))

with open(os.path.join(save_path, 'X_reduced.pkl'), 'wb') as f:
    joblib.dump(X_reduced, f)
mlflow.log_artifact(os.path.join(save_path, 'X_reduced.pkl'))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
BoW+SVD Model - Suggested Tags: ['python', 'python-3.x', 'amazon-web-services', 'javascript', 'azure', 'java', 'android']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
Word2Vec Model - Suggested Tags: ['c#', 'javascript', 'next.js', 'reactjs', 'node.js', 'angular', 'typescript']


















[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
USE Model - Suggested Tags: ['python', 'azure', 'react-native', 'node.js', 'docker', 'reactjs', 'c#', 'java', 'next.js', 'angular', 'flutter', 'css', 'typescript', 'kotlin', 'javascript', 'amazon-web-services', 'ios', 'android', 'spring-boot', 'python-3.x']
