Première approche avec du Word2Vec et RandomForestClassifier

1. Nettoyage des données 

In [1]:
import os
import re
import json

def make_clean_text(text):
    """Nettoie un texte brut pour le prétraitement."""
    # Supprimer les balises HTML, les URL, et tout caractère non alphabétique
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    text = re.sub(r"<.*?>", "", text)  # Supprimer les balises HTML
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Garder uniquement les lettres et espaces
    text = text.lower().strip()  # Convertir en minuscules et supprimer les espaces inutiles
    return text

def clean_json_files(directory_path):
    """Parcourt tous les fichiers JSON dans un répertoire, nettoie les données, et les met à jour."""
    for file_name in os.listdir(directory_path):
        if file_name.endswith('.json'):
            file_path = os.path.join(directory_path, file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)

            # Ajouter un champ `readme_clean` nettoyé pour chaque entrée
            for repo, content in data.items():
                if 'readme' in content:
                    content['readme_clean'] = make_clean_text(content['readme'])

            # Réécrire le fichier nettoyé
            with open(file_path, 'w', encoding='utf-8') as file:
                json.dump(data, file, ensure_ascii=False, indent=4)

# Exemple d'utilisation
directory_path = 'data_json'
clean_json_files(directory_path)


2. Chargement et préparation des données

In [2]:
from sklearn.preprocessing import LabelEncoder

def load_and_prepare_data(directory_path):
    """Charge les fichiers JSON et prépare les données pour la vectorisation."""
    combined_data = {}
    for file_name in os.listdir(directory_path):
        if file_name.endswith('.json'):
            file_path = os.path.join(directory_path, file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
                combined_data.update(data)

    # Extraire les champs pertinents
    texts = [content.get('readme_clean', '') for content in combined_data.values()]
    labels = [content['mainLanguage'] for content in combined_data.values()]

    # Encodage des labels
    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(labels)

    return texts, encoded_labels, label_encoder

# Charger et préparer les données
texts, labels, label_encoder = load_and_prepare_data(directory_path)


3. Entraîner un Modèle Word2Vec

In [3]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Tokeniser les textes
tokenized_texts = [word_tokenize(text) for text in texts]

# Entraîner le modèle Word2Vec
word2vec_model = Word2Vec(sentences=tokenized_texts, vector_size=100, window=5, min_count=1, workers=4)

# Fonction pour obtenir un vecteur moyen par document
def vectorize_documents(tokenized_texts, model):
    """Vectorise une liste de documents tokenisés en utilisant Word2Vec."""
    document_vectors = []
    for tokens in tokenized_texts:
        vectors = [model.wv[word] for word in tokens if word in model.wv]
        if vectors:
            document_vectors.append(sum(vectors) / len(vectors))
        else:
            document_vectors.append([0] * model.vector_size)  # Vecteur nul si aucun mot n'est trouvé
    return document_vectors

# Vectoriser les documents
document_vectors = vectorize_documents(tokenized_texts, word2vec_model)


4. Diviser en Enseignement et Test

In [4]:
from sklearn.model_selection import train_test_split

# Diviser les données
X_train, X_test, y_train, y_test = train_test_split(document_vectors, labels, test_size=0.2, random_state=42)


5. Entraîner un Classifieur

In [5]:
from sklearn.ensemble import RandomForestClassifier

# Entraîner le classifieur
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train)


6. Évaluer le Modèle

In [6]:
from sklearn.metrics import accuracy_score, classification_report

# Prédire les labels
y_pred = classifier.predict(X_test)

# Afficher les résultats
print("Accuracy:", accuracy_score(y_test, y_pred))
# Afficher le rapport de classification
print("\nClassification Report:\n", classification_report(y_test, y_pred, labels=label_encoder.classes_[:len(set(y_test))]))


Accuracy: 0.20454545454545456

Classification Report:
               precision    recall  f1-score   support

           C       0.00      0.00      0.00         0
        Dart       0.00      0.00      0.00         0
          Go       0.00      0.00      0.00         0
        Java       0.00      0.00      0.00         0
  JavaScript       0.00      0.00      0.00         0
      Kotlin       0.00      0.00      0.00         0
         Lua       0.00      0.00      0.00         0
      MATLAB       0.00      0.00      0.00         0
         PHP       0.00      0.00      0.00         0
        Perl       0.00      0.00      0.00         0
      Python       0.00      0.00      0.00         0
           R       0.00      0.00      0.00         0
        Ruby       0.00      0.00      0.00         0
        Rust       0.00      0.00      0.00         0
       Scala       0.00      0.00      0.00         0
       Shell       0.00      0.00      0.00         0
       Swift       0.00   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
from sklearn.linear_model import LogisticRegression

# Entraîner le classifieur Logistic Regression
log_reg_classifier = LogisticRegression(max_iter=1000, random_state=42)
log_reg_classifier.fit(X_train, y_train)

# Prédire les labels
y_pred = log_reg_classifier.predict(X_test)

# Afficher les résultats
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Accuracy: 0.21212121212121213

Classification Report:
               precision    recall  f1-score   support

           C       0.27      0.44      0.33         9
        Dart       0.31      0.40      0.35        10
          Go       0.17      0.27      0.21        11
        Java       0.36      0.40      0.38        10
  JavaScript       0.08      0.25      0.12         8
      Kotlin       0.00      0.00      0.00        10
         Lua       0.14      0.12      0.13         8
      MATLAB       0.00      0.00      0.00         6
         PHP       0.00      0.00      0.00         7
        Perl       0.00      0.00      0.00         2
      Python       0.00      0.00      0.00         8
           R       0.00      0.00      0.00         5
        Ruby       0.00      0.00      0.00         8
        Rust       0.33      0.33      0.33         3
       Scala       0.20      0.33      0.25         3
       Shell       0.30      0.38      0.33        16
       Swift       0.00   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
import mlflow
import mlflow.sklearn

# Configurer MLflow pour suivre les expériences dans le conteneur Docker
mlflow.set_tracking_uri("http://localhost:8090")

# Enregistrer le modèle RandomForestClassifier
with mlflow.start_run():
    mlflow.sklearn.log_model(classifier, "random_forest_model")
    mlflow.log_param("model_type", "RandomForestClassifier")

# Enregistrer le modèle LogisticRegression
with mlflow.start_run():
    mlflow.sklearn.log_model(log_reg_classifier, "log_reg_model")
    mlflow.log_param("model_type", "LogisticRegression")




In [11]:
import mlflow
import mlflow.sklearn

mlflow.set_tracking_uri("http://localhost:8090")

# Enregistrer les modèles avec un run parent
with mlflow.start_run(run_name="model_training") as parent_run:

    # RandomForestClassifier
    with mlflow.start_run(run_name="random_forest", nested=True):
        mlflow.sklearn.log_model(classifier, "random_forest_model")
        mlflow.log_param("model_type", "RandomForestClassifier")

    # LogisticRegression
    with mlflow.start_run(run_name="log_reg", nested=True):
        mlflow.sklearn.log_model(log_reg_classifier, "log_reg_model")
        mlflow.log_param("model_type", "LogisticRegression")


In [12]:
import mlflow
from sklearn.ensemble import RandomForestClassifier

# Train your model
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Log the model
mlflow.sklearn.log_model(clf, 'model')

<mlflow.models.model.ModelInfo at 0x19fb0ad3790>

In [15]:
mlflow.sklearn.log_model(clf, "oui")


<mlflow.models.model.ModelInfo at 0x19f9f6abac0>

In [17]:
import mlflow.sklearn

# Remplace <RUN_ID> par l'ID du run où "oui" est enregistré
model = mlflow.sklearn.load_model(f"runs:/5fefbe6797a24f0fafc544605ae42970/oui")

import numpy as np

test_input = np.random.rand(1, model.n_features_in_)  # Crée un input avec le bon nombre de colonnes
prediction = model.predict(test_input)

print("Prédiction :", prediction)


Prédiction : [1]


In [9]:
import mlflow

# List all runs in the default experiment
experiment_id = mlflow.get_experiment_by_name("Default").experiment_id
runs = mlflow.search_runs(experiment_ids=[experiment_id])

# Affiche les détails des runs
print(runs[['run_id', 'status', 'start_time', 'end_time']])


                             run_id    status  \
0  488f4afcbc1b46729f39b792c87bb08d  FINISHED   
1  892579609c7d4e88be59f91b01c4a0bd  FINISHED   

                        start_time                         end_time  
0 2025-02-13 10:04:12.932000+00:00 2025-02-13 10:04:18.035000+00:00  
1 2025-02-13 10:04:01.964000+00:00 2025-02-13 10:04:12.907000+00:00  


In [10]:
# Exemple pour charger un modèle depuis un run spécifique
run_id = '488f4afcbc1b46729f39b792c87bb08d'  # Remplacez par l'ID du run que vous souhaitez inspecter
logged_model = f'runs:/{run_id}/random_forest_model'

# Charger le modèle
model = mlflow.sklearn.load_model(logged_model)

# Vérifier si le modèle est chargé correctement
print(model)


OSError: No such file or directory: '\mlflow\artifacts\0\488f4afcbc1b46729f39b792c87bb08d\artifacts\random_forest_model'

In [17]:
import joblib

# Enregistrer le modèle LogisticRegression
joblib.dump(log_reg_classifier, 'logistic_regression_model.pkl')


['logistic_regression_model.pkl']

In [18]:
# Enregistrer le modèle Word2Vec
word2vec_model.save('word2vec_model.bin')


In [8]:
import joblib

joblib.dump(label_encoder, 'label_encoder.pkl')

['label_encoder.pkl']