Notebook : entraînement d'un modèle à partir des données MongoDB et enregistrement dans MLflow

In [None]:
import os
import json
import pymongo
import mlflow
import mlflow.sklearn
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from pyspark.sql.types import StructType, StructField, StringType, MapType, TimestampType


In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session configured with the MongoDB Spark
# Connector package.
spark = (
    SparkSession.builder
    .appName("myApp")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1")
    .getOrCreate()
)

# Explicit schema applied on read; every field is declared nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("user", StringType()),
        ("repo", StringType()),
        ("mainLanguage", StringType()),
        ("languages", MapType(StringType(), StringType())),
        ("readme", StringType()),
        ("processed_readme", StringType()),
        ("last_updated", TimestampType()),
    ]
])

# Read the `raw_data` collection of the `dev` database through the connector.
df = (
    spark.read
    .format("mongo")
    .option("database", "dev")
    .option("collection", "raw_data")
    .option("uri", "mongodb://mongo:27017/")
    .schema(schema)
    .load()
)

# Preview the first rows.
df.show()

+--------------------+--------------------+--------------------+--------------------+----------------+------------+
|                user|                repo|           languages|              readme|processed_readme|last_updated|
+--------------------+--------------------+--------------------+--------------------+----------------+------------+
|               folke|        trouble.nvim|{Shell -> 117, Lu...|# 🚦 Trouble\n\nA...|            NULL|        NULL|
|               folke|      which-key.nvim|{Shell -> 119, Lu...|# 💥 Which Key\n\...|            NULL|        NULL|
|              lsyncd|              lsyncd|{Shell -> 1275, L...|Lsyncd -- Live Sy...|            NULL|        NULL|
|           lewis6991|       gitsigns.nvim|{Makefile -> 2663...|# gitsigns.nvim\n...|            NULL|        NULL|
|            LunarVim| Neovim-from-scratch|      {Lua -> 58070}|# Neovim from scr...|            NULL|        NULL|
|             lite-xl|             lite-xl|{Shell -> 18333, ...|# Lite XL\

In [19]:
# Count the rows of the Spark DataFrame and report the total.
num_lines = df.count()
print(f"Number of lines in the dataframe: {num_lines}")

Number of lines in the dataframe: 9615


In [25]:
# Connect to MongoDB and grab handles on the `dev` database and its
# `raw_data` collection.
client = pymongo.MongoClient("mongodb://mongo:27017/")
db = client["dev"]
collection = db["raw_data"]

In [26]:
# Check whether the `dev` database is listed on the server and say so.
db_list = client.list_database_names()
print(
    "La base de données existe."
    if "dev" in db_list
    else "La base de données n'existe pas."
)

La base de données existe.


In [None]:
# Load the training texts and labels from MongoDB
def load_data_from_mongo(source_collection=None, text_field="readme_clean", label_field="mainLanguage"):
    """Fetch parallel lists of texts and labels from a MongoDB collection.

    Args:
        source_collection: Object exposing a pymongo-style
            ``find(filter, projection)`` method. When ``None`` (the default),
            the notebook-level ``collection`` handle is used.
        text_field: Document key holding the text of each sample.
            NOTE(review): the Spark schema earlier in this notebook declares
            ``readme`` and ``processed_readme`` rather than ``readme_clean``;
            confirm which key the collection really contains.
        label_field: Document key holding the label of each sample.

    Returns:
        A ``(texts, labels)`` tuple of equally long lists, one entry per
        document. A missing key or a stored ``None`` becomes ``""`` so that
        downstream tokenisation and label encoding only ever see strings.
    """
    if source_collection is None:
        source_collection = collection

    # Only pull the two fields that are needed instead of whole documents.
    projection = {text_field: 1, label_field: 1, "_id": 0}

    texts, labels = [], []
    for item in source_collection.find({}, projection):
        text = item.get(text_field)
        label = item.get(label_field)
        # `.get(key, "")` would let an explicit None through; normalise it here.
        texts.append("" if text is None else text)
        labels.append("" if label is None else label)
    return texts, labels


In [None]:
# Load the raw texts and labels, then turn the labels into integer codes.
texts, labels = load_data_from_mongo()
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Tokenise every text.
# NOTE(review): `word_tokenize` is not imported in any cell shown in this
# notebook (presumably nltk.tokenize.word_tokenize) — add the import to the
# imports cell, otherwise this line raises NameError on a fresh kernel.
tokenized_texts = list(map(word_tokenize, texts))



In [None]:
# Train the Word2Vec embedding model on the tokenised texts.
# NOTE(review): `Word2Vec` is not imported in any cell shown in this notebook
# (presumably gensim.models.Word2Vec) — add the import to the imports cell.
word2vec_model = Word2Vec(
    sentences=tokenized_texts,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)



In [None]:
# Turn tokenised documents into fixed-size vectors
def vectorize_documents(tokenized_texts, model):
    """Represent each document by the mean of its known word vectors.

    Args:
        tokenized_texts: Iterable of token sequences, one per document.
        model: Object exposing ``wv`` (supports ``in`` and item lookup by
            token) and ``vector_size``.

    Returns:
        A list with one vector per document, in input order. A document with
        no token present in ``model.wv`` gets a zero vector of length
        ``model.vector_size``.
    """
    def _embed(tokens):
        # Keep only the tokens the embedding model knows about.
        known = [model.wv[token] for token in tokens if token in model.wv]
        return np.mean(known, axis=0) if known else np.zeros(model.vector_size)

    return [_embed(tokens) for tokens in tokenized_texts]

# Build one vector per tokenised document using the trained Word2Vec model.
document_vectors = vectorize_documents(tokenized_texts, word2vec_model)



In [None]:
# Split into train and test sets: 20% held out, fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    document_vectors,
    encoded_labels,
    test_size=0.2,
    random_state=42,
)



In [None]:
# Fit a 100-tree random forest on the training split (seeded).
classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
classifier.fit(X_train, y_train)

In [None]:
# Point the MLflow client at the tracking server.
mlflow.set_tracking_uri("http://localhost:8090")

# Score the model on the held-out split *before* opening the run, so a
# failure here does not leave a half-empty run on the tracking server.
# Without this the test split is never used and the logged model carries no
# quality signal at all.
test_accuracy = classifier.score(X_test, y_test)

# Record parameters, the evaluation metric and the fitted model in one run.
# NOTE(review): only the classifier is logged; the Word2Vec model and the
# label encoder needed to use it on new text are not stored — confirm whether
# they should be logged as artifacts too.
with mlflow.start_run():
    mlflow.log_param("model_type", "RandomForestClassifier")
    mlflow.log_param("n_estimators", classifier.n_estimators)
    mlflow.log_metric("test_accuracy", test_accuracy)
    mlflow.sklearn.log_model(classifier, "random_forest_model")

print("Modèle entraîné et enregistré dans MLflow !")