Notebook : entraînement d'un modèle à partir des données MongoDB et enregistrement dans MLflow

In [18]:
import os
import json
import pymongo
import mlflow
import mlflow.sklearn
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from pyspark.sql.types import StructType, StructField, StringType, MapType, TimestampType
from transformers import DistilBertTokenizer
from transformers import DistilBertModel
import torch
import torch.nn as nn

In [27]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, FloatType

# Convert the string representation of an array (e.g. "[0.1, -0.2]") into a list of floats.
def parse_array(col):
    """Parse a stringified float array into a list of floats.

    Args:
        col: Text such as "[0.1, -0.2, 0.3]", or None for a missing value.

    Returns:
        The parsed values as a list of floats. When the input is None or
        contains a token that is not a number (including an empty "[]"),
        a zero vector of length 768 is returned instead.
    """
    # NOTE(review): 768 presumably matches the embedding width stored in the
    # collection — confirm against the producer of `processed_readme`.
    if col is None:
        # Null cells arrive as None; without this guard `.strip` raises
        # AttributeError, which the ValueError handler below would not catch.
        return [0.0] * 768
    try:
        # Split on bare commas: float() ignores surrounding whitespace, so both
        # "1.0, 2.0" and "1.0,2.0" parse to the same result.
        return [float(x) for x in col.strip().strip('[]').split(',')]
    except ValueError:
        return [0.0] * 768

# Wrap parse_array as a Spark UDF whose declared return type is array<float>.
parse_array_udf = udf(f=parse_array, returnType=ArrayType(FloatType()))


# Build (or reuse) the Spark session; the MongoDB Spark Connector is requested
# through the spark.jars.packages setting.
spark = (
    SparkSession.builder
    .appName("myApp")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1")
    .getOrCreate()
)


# Explicit read schema for the collection; every field is declared nullable.
# NOTE(review): the sample output shows NULL values inside the `languages`
# map, so the StringType value type may not match the stored data — confirm.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("user", StringType()),
        ("repo", StringType()),
        ("mainLanguage", StringType()),
        ("languages", MapType(StringType(), StringType())),
        ("readme", StringType()),
        ("processed_readme", StringType()),
        ("last_updated", TimestampType()),
    )
])

# Read the `raw_data` collection of the `dev` database through the Mongo
# connector, applying the explicit schema defined above.
df = (
    spark.read
    .format("mongo")
    .options(
        database="dev",
        collection="raw_data",
        uri="mongodb://mongo:27017/",
    )
    .schema(schema)
    .load()
)

# Replace the stringified `processed_readme` column with its parsed array form.
df = df.withColumn("processed_readme", parse_array_udf(df["processed_readme"]))

# Preview the first rows.
df.show()

+--------------+-------------+------------+--------------------+--------------------+--------------------+--------------------+
|          user|         repo|mainLanguage|           languages|              readme|    processed_readme|        last_updated|
+--------------+-------------+------------+--------------------+--------------------+--------------------+--------------------+
| julienschmidt|   httprouter|          Go|        {Go -> NULL}|# HttpRouter [![C...|[-0.5059451, -0.3...|2025-03-06 17:27:...|
|     emirpasic|         gods|          Go|        {Go -> NULL}|[![GoDoc](https:/...|[-0.035384115, -0...|2025-03-06 17:27:...|
|       jmoiron|         sqlx|          Go|{Makefile -> NULL...|# sqlx\n\n[![Circ...|[-0.44789732, -0....|2025-03-06 17:27:...|
| TheAlgorithms|           Go|          Go|{Dockerfile -> NU...|# The Algorithms ...|[-0.32194772, -0....|2025-03-06 17:27:...|
|       nats-io|  nats-server|          Go|{Shell -> NULL, P...|<p align="center"...|[-0.52504015, -0...

In [28]:
# Count the rows of the loaded DataFrame and report the total.
num_lines = df.count()
print(f"Number of lines in the dataframe: {num_lines}")

Number of lines in the dataframe: 2482


In [29]:
# Collect the features and labels from the MongoDB-backed DataFrame
def load_data_from_mongo(dataframe=None):
    """Collect embeddings and labels into local memory.

    Args:
        dataframe: Object exposing ``select(...).collect()`` whose rows can be
            indexed by ``"processed_readme"`` (a sequence of floats) and
            ``"mainLanguage"``. Defaults to the notebook-level ``df``.

    Returns:
        A ``(features, labels)`` tuple: a float32 tensor with one row per kept
        record, and the list of the matching ``mainLanguage`` values.

    Raises:
        ValueError: Propagated from ``torch.tensor`` when the kept embeddings
            do not all have the same length.

    Rows whose embedding or label is None are skipped: ``torch.tensor`` cannot
    build a tensor from None entries, and an unlabeled row cannot be used to
    train a classifier.
    """
    source = df if dataframe is None else dataframe
    rows = source.select("processed_readme", "mainLanguage").collect()

    # Keep features and labels aligned by filtering whole rows, not columns.
    complete_rows = [
        row for row in rows
        if row["processed_readme"] is not None and row["mainLanguage"] is not None
    ]
    texts = [row["processed_readme"] for row in complete_rows]
    labels = [row["mainLanguage"] for row in complete_rows]
    return torch.tensor(texts, dtype=torch.float32), labels


In [30]:
# Load the DistilBERT tokenizer and model.
# NOTE(review): in the visible cells these two objects are only referenced by
# commented-out code — confirm whether this load is still needed.
BERT_CHECKPOINT = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = DistilBertModel.from_pretrained(BERT_CHECKPOINT)


# Load the data, then fit the label encoder and encode the string labels.
texts, labels = load_data_from_mongo()
label_encoder = LabelEncoder().fit(labels)
encoded_labels = label_encoder.transform(labels)



In [31]:
# Split the data into train and test sets (20% held out, fixed seed).
# The values returned by load_data_from_mongo() are used directly as features;
# the alternative path that would compute DistilBERT embeddings with
# encode_texts_with_bert(texts, tokenizer, bert_model) is currently disabled.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    encoded_labels,
    test_size=0.2,
    random_state=42,
)


In [32]:
# Train the classifier: a 100-tree random forest with a fixed seed, fitted on the training split.
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train)



In [33]:
# Evaluate the model: score the fitted classifier on both splits.
train_score = classifier.score(X_train, y_train)
test_score = classifier.score(X_test, y_test)

# Report both scores, one per line, with four decimals.
print(
    f"Train Accuracy: {train_score:.4f}",
    f"Test Accuracy: {test_score:.4f}",
    sep="\n",
)



Train Accuracy: 1.0000
Test Accuracy: 0.4044


In [34]:
# Record the trained model, its parameter and its metrics in MLflow
mlflow.set_tracking_uri("http://mlflow:8080")

with mlflow.start_run():
    mlflow.sklearn.log_model(classifier, "random_forest_model")
    mlflow.log_param("model_type", "RandomForestClassifier")
    for metric_name, metric_value in (
        ("train_accuracy", train_score),
        ("test_accuracy", test_score),
    ):
        mlflow.log_metric(metric_name, metric_value)

    # Write the encoder's class names to a JSON file and attach it to the run,
    # so encoded predictions can be mapped back to label names.
    with open("label_mapping.json", "w") as mapping_file:
        mapping_file.write(json.dumps(label_encoder.classes_.tolist()))
    mlflow.log_artifact("label_mapping.json")

print("Modèle entraîné et enregistré dans MLflow")



🏃 View run redolent-chimp-260 at: http://mlflow:8080/#/experiments/0/runs/4e8d10acf61a48a6aa902113e2da04d2
🧪 View experiment at: http://mlflow:8080/#/experiments/0
Modèle entraîné et enregistré dans MLflow
