In [1]:
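# Prerequisite (presumably): execute preprocess.ipynb before this notebook.
# A notebook is JSON, so `python preprocess.ipynb` cannot run it; use e.g.
# !jupyter nbconvert --to notebook --execute preprocess.ipynb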

In [1]:
import os
import configparser
import pickle
import mlflow
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [2]:
# Load the notebook settings from the INI file (path is relative to the
# current working directory).
CONFIG_FILE = 'config3.ini'
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Fail fast here rather than hitting a confusing
# NoSectionError on the first lookup below.
if not config.read(CONFIG_FILE):
    raise FileNotFoundError(f"Configuration file not found or unreadable: {CONFIG_FILE!r}")

# Seed value shared by the stochastic estimators.
RANDOM_STATE = config.getint('Default', 'RANDOM_STATE')
# Locations of the preprocessed data pickle, its schema file, and the MLflow
# tracking directory.
PATH_PROCESSED_DATA = config.get('Paths', 'PATH_PROCESSED_DATA')
PATH_PROCESSED_SCHEMA = config.get('Paths', 'PATH_PROCESSED_SCHEMA')
DIR_MLRUNS = config.get('Paths', 'DIR_MLRUNS')
# Name of the MLflow experiment that groups the runs of this notebook.
experiment_name = "F4"

In [3]:
# Load the preprocessed train/test split from disk.
# NOTE(review): pickle.load can execute arbitrary code, so PATH_PROCESSED_DATA
# must only ever point at a file produced by a trusted pipeline.
with open(PATH_PROCESSED_DATA, 'rb') as processed_file:
    X_train, X_test, y_train, y_test = pickle.load(processed_file)

In [4]:
# MLflow setup: track into the local directory from the config and turn off
# scikit-learn autologging, since everything is logged explicitly in model_run.
tracking_dir = os.path.abspath(DIR_MLRUNS)
mlflow.set_tracking_uri(f"file:{tracking_dir}")
mlflow.sklearn.autolog(disable=True)

In [5]:
# Resolve the experiment id: reuse the experiment when it already exists,
# otherwise create it. Looking it up first avoids a bare `except:`, which
# would swallow every error (KeyboardInterrupt, an unreachable tracking
# store, ...) and then fail later with an unrelated AttributeError on None.
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(experiment_name)
else:
    experiment_id = experiment.experiment_id

In [6]:
def model_run(model,run_name,param_grid,step,X_train,X_test, y_train, y_test,
              cv=10, scoring="accuracy", average="macro"):
    """Grid-search a classifier, evaluate it on the test split and log it to MLflow.

    Logging goes through MLflow's module-level API, so the caller is expected
    to have opened the run that should receive the results.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator handed to ``GridSearchCV``.
    run_name : str
        Suffix of the registered model name
        (``vehicule_classifier_<run_name>``).
    param_grid : dict
        Hyper-parameter grid explored by ``GridSearchCV``.
    step : int
        Step value attached to the logged metrics.
    X_train, y_train
        Training data used for the cross-validated search.
    X_test, y_test
        Held-out data used to compute the ``test_*`` metrics.
    cv : int, default 10
        Cross-validation setting forwarded to ``GridSearchCV``.
    scoring : str, default "accuracy"
        Scoring used to rank the grid candidates.
    average : str, default "macro"
        Averaging mode for the F1, precision and recall scores.

    Returns
    -------
    None
        Results are only logged: the best estimator (also registered in the
        model registry), its parameters, the metrics, and the file at the
        module-level ``PATH_PROCESSED_SCHEMA`` stored next to the model.
    """
    # Train: cross-validated search over the parameter grid.
    grid = GridSearchCV(model,param_grid,cv=cv,verbose=2,scoring=scoring)
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_

    # Predict on the test data with the best candidate found by the search.
    y_pred = best_model.predict(X_test)

    # Compute the metrics: best cross-validation score plus test-set scores.
    metrics = {
        "best_score": grid.best_score_,
        "test_accuracy_score": accuracy_score(y_test, y_pred),
        "test_f1_score": f1_score(y_test, y_pred, average=average),
        "test_precision_score": precision_score(y_test, y_pred, average=average),
        "test_recall_score": recall_score(y_test, y_pred, average=average)
    }

    # Log the model, its parameters, the metrics and the schema artifact.
    model_path = "model"
    mlflow.sklearn.log_model(best_model,model_path,registered_model_name=f"vehicule_classifier_{run_name}")
    mlflow.log_params(best_model.get_params())
    mlflow.log_metrics(metrics,step=step)
    # The schema file is stored under the same artifact path as the model.
    mlflow.log_artifact(PATH_PROCESSED_SCHEMA, model_path)

In [11]:
# Catalogue of candidate models. Each entry provides:
#   "model"      - the unfitted estimator,
#   "run_name"   - the MLflow run name / registered-model suffix,
#   "param_grid" - the grid explored by GridSearchCV (an empty dict means a
#                  single candidate using the estimator's default parameters).
# The seeded estimators take their seed from RANDOM_STATE (read from the
# config file) instead of a hard-coded literal, so the seed is set in one place.
runs_config = [
    {
        "model": LogisticRegression(),
        "run_name":"logr",
        "param_grid": {
            # Candidate values, currently disabled:
            #"solver": ["liblinear","newton-cholesky"],
            #"C":[0.2,0.5],
            #"random_state":[0,1]
        }
    },
    {
        "model": DecisionTreeClassifier(),
        "run_name":"decision_tree",
        "param_grid":  {
            "criterion": ["gini","entropy"],
            "max_depth": [4,8],
            "random_state":[RANDOM_STATE]
        }
    },
    {
        "model": RandomForestClassifier(),
        "run_name":"random_forest",
        "param_grid":   {
            "n_estimators": [50,100],
            "max_depth": [4,8],
            "random_state":[RANDOM_STATE]
        }
    },
    {
        "model": KNeighborsClassifier(),
        "run_name":"knn",
        "param_grid":   {
            "n_neighbors": [4,5],
            "algorithm": ['auto','ball_tree','kd_tree']
        }
    },
    {
        "model": SVC(),
        "run_name":"SVC",
        "param_grid":   {}
    }
]



In [12]:
# Select the configurations to train in this session: entries 1 and 2 of
# runs_config (decision tree and random forest).
runs = runs_config[1:3]
runs

[{'model': DecisionTreeClassifier(),
  'run_name': 'decision_tree',
  'param_grid': {'criterion': ['gini', 'entropy'],
   'max_depth': [4, 8],
   'random_state': [42]}},
 {'model': RandomForestClassifier(),
  'run_name': 'random_forest',
  'param_grid': {'n_estimators': [50, 100],
   'max_depth': [4, 8],
   'random_state': [42]}}]

In [13]:
# Train and log every selected configuration, each inside its own MLflow run.
# The loop variable is deliberately not named `config`: that name holds the
# ConfigParser created in the configuration cell, and rebinding it to a dict
# here would break any later config.get(...) call made without re-running
# that cell.
for step, run_config in enumerate(runs):
    print("Train :",run_config["run_name"])
    with mlflow.start_run(run_name=run_config["run_name"],experiment_id=experiment_id):
        model_run(run_config["model"],run_config["run_name"],run_config["param_grid"],step,X_train,X_test, y_train, y_test)

Train : decision_tree
Fitting 10 folds for each of 4 candidates, totalling 40 fits
[CV] END .......criterion=gini, max_depth=4, random_state=42; total time=   0.2s
[CV] END .......criterion=gini, max_depth=4, random_state=42; total time=   0.2s
[CV] END .......criterion=gini, max_depth=4, random_state=42; total time=   0.3s
[CV] END .......criterion=gini, max_depth=4, random_state=42; total time=   0.2s
[CV] END .......criterion=gini, max_depth=4, random_state=42; total time=   0.3s
[CV] END .......criterion=gini, max_depth=4, random_state=42; total time=   0.3s
[CV] END .......criterion=gini, max_depth=4, random_state=42; total time=   0.3s
[CV] END .......criterion=gini, max_depth=4, random_state=42; total time=   0.3s
[CV] END .......criterion=gini, max_depth=4, random_state=42; total time=   0.3s
[CV] END .......criterion=gini, max_depth=4, random_state=42; total time=   0.3s
[CV] END .......criterion=gini, max_depth=8, random_state=42; total time=   0.5s
[CV] END .......criterion=

Registered model 'vehicule_classifier_decision_tree' already exists. Creating a new version of this model...
2023/04/07 20:22:03 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: vehicule_classifier_decision_tree, version 14
Created version '14' of model 'vehicule_classifier_decision_tree'.


Train : random_forest
Fitting 10 folds for each of 4 candidates, totalling 40 fits
[CV] END ......max_depth=4, n_estimators=50, random_state=42; total time=   9.6s
[CV] END ......max_depth=4, n_estimators=50, random_state=42; total time=   8.5s
[CV] END ......max_depth=4, n_estimators=50, random_state=42; total time=   8.4s
[CV] END ......max_depth=4, n_estimators=50, random_state=42; total time=  10.8s
[CV] END ......max_depth=4, n_estimators=50, random_state=42; total time=   9.4s
[CV] END ......max_depth=4, n_estimators=50, random_state=42; total time=   8.4s
[CV] END ......max_depth=4, n_estimators=50, random_state=42; total time=   9.5s
[CV] END ......max_depth=4, n_estimators=50, random_state=42; total time=   9.7s
[CV] END ......max_depth=4, n_estimators=50, random_state=42; total time=  11.7s
[CV] END ......max_depth=4, n_estimators=50, random_state=42; total time=   8.6s
[CV] END .....max_depth=4, n_estimators=100, random_state=42; total time=  15.1s
[CV] END .....max_depth=4,

Registered model 'vehicule_classifier_random_forest' already exists. Creating a new version of this model...
2023/04/07 20:31:40 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: vehicule_classifier_random_forest, version 20
Created version '20' of model 'vehicule_classifier_random_forest'.
