In [None]:
def train_model(model:sklearn_utils.ScikitModel, X:pd.DataFrame, y, train_indices:np.ndarray, test_indices):
    """Fit ``model`` on one train/test split and return its test-set metrics.

    Args:
        model: Estimator exposing ``fit``; it is fitted in place on the
            training rows.
        X: Feature table. Rows are selected by position (``.iloc``).
        y: Targets aligned row-for-row with ``X``. May be a NumPy array or a
            pandas object; either way rows are selected by position.
        train_indices: Positional row indices used for fitting.
        test_indices: Positional row indices used for evaluation.

    Returns:
        dict: ``calculate_clf_metrics_averaged(model, X_test, y_test)``
        converted with ``_asdict()``.
    """
    X_train, X_test = X.iloc[train_indices, :], X.iloc[test_indices, :]

    # X is sliced positionally, so y must be too. Plain ``y[indices]`` on a
    # pandas Series is label-based and would pick different rows (or raise
    # KeyError) whenever the index is not the default 0..n-1 range.
    y_rows = y.iloc if hasattr(y, 'iloc') else y
    y_train, y_test = y_rows[train_indices], y_rows[test_indices]

    model.fit(X_train, y_train)
    return calculate_clf_metrics_averaged(model, X_test, y_test)._asdict()

In [None]:
def custom_cross_val(model:sklearn_utils.ScikitModel, X:pd.DataFrame, y:np.ndarray, cv:int=5):
    """Evaluate ``model`` with stratified k-fold cross-validation.

    Each fold is handed to ``train_model`` through joblib's
    ``Parallel``/``delayed`` machinery.

    Args:
        model: Estimator passed to ``train_model`` for every fold.
        X: Feature table.
        y: Class labels aligned with ``X``; also used to stratify the folds.
        cv: Number of folds.

    Returns:
        The per-fold results of ``train_model``, as collected by ``Parallel``.
    """
    # StratifiedKFold takes ``n_splits`` (there is no ``cv`` argument), and
    # the index pairs come from ``split(X, y)`` -- the splitter object itself
    # is not iterable.
    skf = StratifiedKFold(n_splits=cv)
    # NOTE(review): the same ``model`` instance is handed to every fold;
    # whether folds share state depends on the joblib backend in use.
    output = Parallel()(
        delayed(train_model)(model, X, y, train_indices, test_indices)
        for train_indices, test_indices in skf.split(X, y)
    )
    return output

In [29]:
def run_experiment(experiment_name:str, X_train:pd.DataFrame, y_train:np.ndarray,
                   X_test:pd.DataFrame, y_test:np.ndarray, 
                   models:dict[str, sklearn_utils.ScikitModel], 
                   artifacts_folder:Path=None):
    """Train every model and log its metrics, model and plots to MLflow.

    The training data is written once to ``artifacts_folder``. Then, for each
    entry of ``models``, one MLflow run named ``"<model_name>_run"`` is
    opened in which the model is fitted, its metrics and the fitted model are
    logged, and a confusion-matrix and a ROC figure are saved and attached as
    artifacts.

    Args:
        experiment_name: MLflow experiment to log into; also used to build
            the default artifacts directory name.
        X_train: Training features; saved as ``X_train.csv``.
        y_train: Training labels; saved as ``y_train.txt``.
        X_test: Features used for prediction and metric calculation.
        y_test: Labels used for metric calculation and the plots.
        models: Mapping of model name to estimator. Each estimator must
            provide ``fit``, ``predict`` and ``predict_proba``.
        artifacts_folder: Existing directory for the generated files (a
            ``Path`` or a path string). When ``None``, the directory
            ``exp_<experiment_name>_artifacts`` is created via
            ``files_utils.create_dir(..., replace_existing=True)``.

    Raises:
        OSError: If ``artifacts_folder`` is given but is not a directory.
    """
    if artifacts_folder is None:
        experiment_dirname = f'exp_{experiment_name}_artifacts'
        artifacts_folder = files_utils.create_dir(experiment_dirname, replace_existing=True)
    else:
        # Accept plain strings as well: both the directory check and the
        # ``/`` joins below need a Path instance.
        artifacts_folder = Path(artifacts_folder)
        if not artifacts_folder.is_dir():
            raise OSError(f'Directory "{artifacts_folder}" doesn\'t exist, you may want to create it first.')

    # Persist the training data next to the other artifacts.
    X_train_path = artifacts_folder/'X_train.csv'
    y_train_path = artifacts_folder/'y_train.txt'
    X_train.to_csv(X_train_path, index=False)
    np.savetxt(y_train_path, y_train, delimiter=',')

    mlflow.set_experiment(experiment_name)

    # Loop-invariant settings for the figures produced per model.
    extension = 'png'
    class_labels = ['Poor', 'Standard', 'Good']
    # Figures are only written to disk, so interactive display is turned off.
    plt.ioff()

    progress_bar_models = tqdm(models.items())
    for model_name, model in progress_bar_models:
        progress_bar_models.set_description(f'Logging model [ {model_name} ]')

        with mlflow.start_run(run_name=f'{model_name}_run'):
            # NOTE: logging the training data as MLflow dataset inputs
            # (mlflow.data / mlflow.log_input) is currently not done here.

            # Model fitting and calculating metrics
            model.fit(X_train, y_train)
            pred = model.predict(X_test)
            pred_probas = model.predict_proba(X_test)
            metrics_dict = calculate_clf_metrics_averaged(model, X_test, y_test)._asdict()

            # logging the metrics
            for metric_name, metric_value in metrics_dict.items():
                mlflow.log_metric(metric_name, metric_value)

            # logging the model
            mlflow.sklearn.log_model(model, model_name)

            # logging metrics visualizations
            cm = visualize_confusion_matrix(y_test, pred, labels=class_labels)
            _save_and_log_figure(cm, artifacts_folder,
                                 f'confusion_matrix_{model_name}', extension)

            roc_auc_display = visualize_roc(y_test, pred_probas, class_labels)
            _save_and_log_figure(roc_auc_display, artifacts_folder,
                                 f'roc_auc_{model_name}', extension)


def _save_and_log_figure(figure, artifacts_folder:Path, filename:str, extension:str='png'):
    """Save ``figure`` into ``artifacts_folder`` and attach it to the active MLflow run."""
    figure_path = artifacts_folder/f"{filename}.{extension}"
    save_visualization(figure, figure_path)
    # ``filename`` is passed as the second (artifact path) argument, exactly
    # as the inline code did before extraction.
    mlflow.log_artifact(figure_path, filename)