In [None]:
import mlflow.sklearn
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import mlflow

# Load the pre-cleaned tweets; the file is read with the latin-1 codec.
data = pd.read_csv(
    filepath_or_buffer='data/tweets_cleaned.csv',
    encoding='latin-1',
)

In [None]:
# Preview of the first rows. Note: Jupyter only renders the value of the
# LAST expression of a cell, so this preview is not displayed here.
data.head()

# Show which raw label values are present before normalising the dtype.
label_values = data['target'].unique()
print(label_values)

# Store the labels as integers, then confirm the resulting column dtypes.
data['target'] = data['target'].astype(int)
print(data.dtypes)

# Class balance (rendered because it is the cell's last expression).
data['target'].value_counts()

[0 1]
target     int64
text      object
dtype: object


target
0    800000
1    800000
Name: count, dtype: int64

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(y_true, y_pred, model_name, run_id):
    """
    Build, save and display a confusion-matrix heatmap.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        model_name: Label used in the figure title and in the output filename.
        run_id: Identifier appended to the output filename.

    Returns:
        str: Name of the PNG file written to the current working directory.
    """
    cm = confusion_matrix(y_true, y_pred)

    # Explicit figure/axes so saving and closing target this figure only.
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=["Négatif", "Positif"],
        yticklabels=["Négatif", "Positif"],
        ax=ax,
    )
    ax.set_xlabel("Prédictions")
    ax.set_ylabel("Vraies valeurs")
    ax.set_title(f"Matrice de Confusion - {model_name}")

    # Save BEFORE showing: with the inline backend plt.show() releases the
    # current figure, so a later savefig() would write an empty image.
    filename = f"confusion_matrix_{model_name}_run{run_id}.png"
    fig.savefig(filename)

    # Display in the notebook, then free the figure (this is called in loops).
    plt.show()
    plt.close(fig)

    return filename


# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score,  roc_auc_score



# Hold out 20% of the tweets for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['target'], test_size=0.2, random_state=42)

# Hyperparameters to explore
param_grid = {
    "max_features": [5000, 10000, 20000],
    "ngram_range": [(1, 1), (1, 2), (1, 3)]
}


mlflow.set_experiment("TF-IDF + Logistic Regression")


for max_features in param_grid["max_features"]:
    for ngram_range in param_grid["ngram_range"]:
        # TF-IDF vectorizer: fitted on the training split only, then applied
        # unchanged to the test split.
        vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
        X_train_tfidf = vectorizer.fit_transform(X_train)
        X_test_tfidf = vectorizer.transform(X_test)

        # Logistic regression on the TF-IDF features
        model = LogisticRegression(max_iter=1000)
        model.fit(X_train_tfidf, y_train)

        y_pred = model.predict(X_test_tfidf)
        # Positive-class probability for ROC AUC. It must be computed on the
        # vectorized test matrix, not on the raw text series.
        y_prob = model.predict_proba(X_test_tfidf)[:, 1]

        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_prob)

        # One MLflow run per hyperparameter combination
        with mlflow.start_run(run_name=f"max_features={max_features}, ngram_range={ngram_range}"):
            mlflow.log_param("max_features", max_features)
            mlflow.log_param("ngram_range", ngram_range)
            mlflow.log_metric("accuracy", accuracy)
            mlflow.log_metric("f1_score", f1)
            mlflow.log_metric("roc_auc", roc_auc)

            # Build the confusion matrix and attach it to the run
            print(f"\n Matrice de Confusion pour max_features={max_features}, ngram_range={ngram_range}:")
            cm_filename = plot_confusion_matrix(y_test, y_pred, "TF-IDF_GridSearch", f"{max_features}_{ngram_range}")
            mlflow.log_artifact(cm_filename)

            mlflow.sklearn.log_model(model, "logistic_regression_model")

            print(f"Run enregistré : max_features={max_features}, ngram_range={ngram_range}, ROC AUC: {roc_auc}")


2025/01/06 17:10:24 INFO mlflow.tracking.fluent: Experiment with name 'TF-IDF + Logistic Regression' does not exist. Creating a new experiment.


Run enregistré : max_features=5000, ngram_range=(1, 1)




Run enregistré : max_features=5000, ngram_range=(1, 2)




Run enregistré : max_features=5000, ngram_range=(1, 3)




Run enregistré : max_features=10000, ngram_range=(1, 1)




Run enregistré : max_features=10000, ngram_range=(1, 2)




Run enregistré : max_features=10000, ngram_range=(1, 3)




Run enregistré : max_features=20000, ngram_range=(1, 1)




Run enregistré : max_features=20000, ngram_range=(1, 2)




Run enregistré : max_features=20000, ngram_range=(1, 3)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import mlflow
import mlflow.sklearn


# TF-IDF + logistic regression pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('logreg', LogisticRegression(max_iter=1000))
])


param_grid = {
    'tfidf__max_features': [5000, 10000, 20000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__max_df': [0.95, 0.90],
    'logreg__C': [0.01, 0.1, 1, 10]
}


mlflow.set_experiment("GridSearch TF-IDF + Logistic Regression")

# Grid search with 3-fold cross-validation, ranked by ROC AUC
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='roc_auc', verbose=2)


grid_search.fit(X_train, y_train)


# One MLflow run per candidate, carrying that candidate's OWN cross-validated
# score. Only the best estimator is refit by GridSearchCV, so held-out test
# metrics are meaningful for that single model and are logged separately below.
cv_results = grid_search.cv_results_
for i, params in enumerate(cv_results['params']):
    with mlflow.start_run(run_name=f"Run_{i+1}"):
        # Log the hyperparameters of this candidate
        mlflow.log_param("max_features", params['tfidf__max_features'])
        mlflow.log_param("ngram_range", params['tfidf__ngram_range'])
        mlflow.log_param("max_df", params['tfidf__max_df'])
        mlflow.log_param("C", params['logreg__C'])

        # Cross-validated ROC AUC (mean and spread over the folds)
        mlflow.log_metric("cv_mean_roc_auc", float(cv_results['mean_test_score'][i]))
        mlflow.log_metric("cv_std_roc_auc", float(cv_results['std_test_score'][i]))

        print(f"Run {i+1} enregistré avec les hyperparamètres : {params}")


# Evaluate the refit best estimator once on the held-out test split.
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)[:, 1]  # positive-class probability for ROC AUC

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)

with mlflow.start_run(run_name="Best_Estimator"):
    mlflow.log_param("max_features", best_params['tfidf__max_features'])
    mlflow.log_param("ngram_range", best_params['tfidf__ngram_range'])
    mlflow.log_param("max_df", best_params['tfidf__max_df'])
    mlflow.log_param("C", best_params['logreg__C'])

    mlflow.log_metric("cv_mean_roc_auc", float(grid_search.best_score_))
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("roc_auc", roc_auc)

    # Confusion matrix of the best model, attached to the run
    print("\n Matrice de Confusion pour le meilleur modèle :")
    cm_filename = plot_confusion_matrix(y_test, y_pred, "GridSearch", "best")
    mlflow.log_artifact(cm_filename)

    mlflow.sklearn.log_model(best_model, "best_tfidf_logreg_pipeline")

    print(f"Meilleurs hyperparamètres : {best_params}")
    print(f"Accuracy: {accuracy}, F1 Score: {f1}, ROC AUC: {roc_auc}")

Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] END logreg__C=0.01, tfidf__max_df=0.95, tfidf__max_features=5000, tfidf__ngram_range=(1, 1); total time=  10.7s
[CV] END logreg__C=0.01, tfidf__max_df=0.95, tfidf__max_features=5000, tfidf__ngram_range=(1, 1); total time=  10.7s
[CV] END logreg__C=0.01, tfidf__max_df=0.95, tfidf__max_features=5000, tfidf__ngram_range=(1, 1); total time=  11.3s
[CV] END logreg__C=0.01, tfidf__max_df=0.95, tfidf__max_features=5000, tfidf__ngram_range=(1, 2); total time=  24.7s
[CV] END logreg__C=0.01, tfidf__max_df=0.95, tfidf__max_features=5000, tfidf__ngram_range=(1, 2); total time=  22.8s
[CV] END logreg__C=0.01, tfidf__max_df=0.95, tfidf__max_features=5000, tfidf__ngram_range=(1, 2); total time=  22.6s
[CV] END logreg__C=0.01, tfidf__max_df=0.95, tfidf__max_features=10000, tfidf__ngram_range=(1, 1); total time=   9.9s
[CV] END logreg__C=0.01, tfidf__max_df=0.95, tfidf__max_features=10000, tfidf__ngram_range=(1, 1); total time=   9.9s


# Word embedding

In [None]:
from gensim.models import Word2Vec, FastText
# Entraîner les modèles Word2Vec et FastText 
w2v_model = Word2Vec(sentences=X_train, vector_size=100, window=5, min_count=2, workers=4)
ft_model = FastText(sentences=X_train, vector_size=100, window=5, min_count=2, workers=4)


In [None]:
import numpy as np


def vectorize_text(tokens, model):
    vector_size = model.vector_size
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    
    if len(vectors) > 0:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(vector_size)

# Transformer les tweets avec les embeddings
X_train_w2v = np.array([vectorize_text(tweet, w2v_model) for tweet in X_train])
X_test_w2v = np.array([vectorize_text(tweet, w2v_model) for tweet in X_test])

X_train_ft = np.array([vectorize_text(tweet, ft_model) for tweet in X_train])
X_test_ft = np.array([vectorize_text(tweet, ft_model) for tweet in X_test])


In [None]:
# StandardScaler's public home is sklearn.preprocessing; it is only reachable
# from sklearn.discriminant_analysis as an incidental internal import.
from sklearn.preprocessing import StandardScaler


# One scaler per embedding space, each fitted on its own training features,
# so the result does not depend on the order in which the scalers are refit.
w2v_scaler = StandardScaler()
X_train_w2v = w2v_scaler.fit_transform(X_train_w2v)
X_test_w2v = w2v_scaler.transform(X_test_w2v)

ft_scaler = StandardScaler()
X_train_ft = ft_scaler.fit_transform(X_train_ft)
X_test_ft = ft_scaler.transform(X_test_ft)

# Backward-compatible alias: previously a single `scaler` ended up fitted on
# the FastText features.
scaler = ft_scaler


In [None]:
def train_and_log_model(X_train, X_test, y_train, y_test, model_name="Word2Vec"):
    """
    Fit a logistic regression on embedding features and log it to MLflow.

    Logs accuracy, F1 and ROC AUC on the test split, a confusion-matrix
    image and the fitted model inside a run named
    ``LogisticRegression_<model_name>``.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.
        model_name: Name of the embedding; used for the run name, the
            ``embedding`` param and the artifact names.
    """
    run_label = f"LogisticRegression_{model_name}"

    with mlflow.start_run(run_name=run_label) as run:
        model = LogisticRegression(max_iter=1000)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1]

        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_prob)

        mlflow.log_param("embedding", model_name)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("f1_score", f1)
        mlflow.log_metric("roc_auc", roc_auc)

        # Label the confusion matrix with this function's own arguments and
        # the MLflow run id. The previous version read a global loop index
        # `i` left over from another cell, which fails on a fresh kernel and
        # made every embedding overwrite the same image file.
        print(f"\n Matrice de Confusion pour {model_name} :")
        cm_filename = plot_confusion_matrix(y_test, y_pred, run_label, run.info.run_id)
        mlflow.log_artifact(cm_filename)

        mlflow.sklearn.log_model(model, run_label)

        print(f"{model_name} - Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}, ROC AUC: {roc_auc:.4f}")



# Train and log one logistic regression per embedding family, in the same
# order as before (Word2Vec first, then FastText).
embedding_features = {
    "Word2Vec": (X_train_w2v, X_test_w2v),
    "FastText": (X_train_ft, X_test_ft),
}
for embedding_name, (train_features, test_features) in embedding_features.items():
    train_and_log_model(train_features, test_features, y_train, y_test, model_name=embedding_name)


# 2 modèles de deep learning


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Parameters
MAX_NB_WORDS = 20000  # Maximum vocabulary size kept by the tokenizer
MAX_SEQUENCE_LENGTH = 50  # Sequences are padded/truncated to this length


# Fit the vocabulary on the TRAINING texts only. Fitting on data["text"] would
# also include the test tweets and leak test-set vocabulary into the model
# input; unseen test words are mapped to the OOV token instead.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Convert texts to index sequences, then pad at the end to a fixed length
X_train_pad = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=MAX_SEQUENCE_LENGTH, padding='post')
X_test_pad = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=MAX_SEQUENCE_LENGTH, padding='post')

# Convert the labels to NumPy arrays
y_train = np.array(y_train)
y_test = np.array(y_test)

print("Données préparées pour LSTM avec padding")


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Bidirectional

def create_lstm_model():
    """
    Assemble and compile the bidirectional LSTM binary classifier.

    Layers, in order: embedding (vocabulary of MAX_NB_WORDS, 128 dimensions,
    inputs of length MAX_SEQUENCE_LENGTH), spatial dropout, bidirectional
    LSTM with 64 units, a 64-unit ReLU dense layer and a single sigmoid
    output unit.

    Returns:
        The model compiled with binary cross-entropy, the Adam optimizer and
        accuracy as metric.
    """
    network = Sequential()
    network.add(Embedding(input_dim=MAX_NB_WORDS, output_dim=128, input_length=MAX_SEQUENCE_LENGTH))
    network.add(SpatialDropout1D(0.2))
    network.add(Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)))
    network.add(Dense(64, activation='relu'))
    network.add(Dense(1, activation='sigmoid'))

    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network


# Instantiate the network and print its layer summary.
lstm_model = create_lstm_model()
lstm_model.summary()


In [None]:
import mlflow
import mlflow.tensorflow
import numpy as np

# Training settings, named so they can be logged alongside the metrics.
LSTM_EPOCHS = 5
LSTM_BATCH_SIZE = 64

with mlflow.start_run(run_name="LSTM_Model"):
    mlflow.log_param("epochs", LSTM_EPOCHS)
    mlflow.log_param("batch_size", LSTM_BATCH_SIZE)

    # NOTE(review): the test split is also used as validation data here, so
    # the reported test metrics are not fully independent of training
    # monitoring — consider a dedicated validation split.
    history = lstm_model.fit(
        X_train_pad, y_train,
        validation_data=(X_test_pad, y_test),
        epochs=LSTM_EPOCHS,
        batch_size=LSTM_BATCH_SIZE,
        verbose=1
    )

    # Single inference pass: the class labels are derived from the same
    # probabilities instead of running predict() twice over the test set.
    y_prob = lstm_model.predict(X_test_pad)[:, 0]  # positive-class probability for ROC AUC
    y_pred = (y_prob > 0.5).astype("int32")

    # Compute the metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_prob)

    # Log the metrics to MLflow
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("roc_auc", roc_auc)

    # Build the confusion matrix for the LSTM and attach it to the run
    cm_filename = plot_confusion_matrix(y_test, y_pred, "LSTM", "1")
    mlflow.log_artifact(cm_filename)

    print(f"LSTM Model enregistré : Accuracy={accuracy:.4f}, F1={f1:.4f}, ROC AUC={roc_auc:.4f}")

