In [None]:
import mlflow
import json
import matplotlib.pyplot as plt
import pandas as pd
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier

# ============ LOAD LABELLED DATA ============
# The cached object is indexed by split name; unpack the four train/test
# splits in one step, in the same order the keys are listed.
# NOTE(review): unpickling can execute arbitrary code — only load cache files
# produced by this project.
labelled_data_cache = pd.read_pickle('data_cache/labelled_data.pkl')
X_train, X_test, y_train, y_test = (
    labelled_data_cache[split_name]
    for split_name in ('X_train', 'X_test', 'y_train', 'y_test')
)

In [None]:
# =========== MLFLOW SETUP ============
mlflow.set_experiment("feature_selection")

# ============ BORUTA FEATURE SELECTION ============
# Single source of truth for the hyper-parameters: the same dictionaries are
# used to build the estimators AND to populate the MLflow params, so what is
# logged can never drift from what was actually run.
forest_params = {
    "n_jobs": -1,
    "class_weight": "balanced",
    "max_depth": 5,
    "random_state": 42,
}
boruta_params = {
    "n_estimators": "auto",
    "max_iter": 100,
    "alpha": 0.05,
    "perc": 100,
    "two_step": True,  # BorutaPy's default, passed explicitly so the logged value is the one used
    "random_state": 42,
}

with mlflow.start_run(run_name="boruta_Label_7day"):
    # Run configuration: selector settings, the forest settings that shape the
    # importances, and the size of the training matrix.
    mlflow.log_params({
        "method": "BorutaPy",
        "estimator": "RandomForestClassifier",
        "n_estimators": boruta_params["n_estimators"],
        "max_iter": boruta_params["max_iter"],
        "alpha": boruta_params["alpha"],
        "perc": boruta_params["perc"],
        "two_step": boruta_params["two_step"],
        "random_state": boruta_params["random_state"],
        "rf_max_depth": forest_params["max_depth"],
        "rf_class_weight": forest_params["class_weight"],
        "target": "Label_7day",
        "n_input_features": X_train.shape[1],
        "train_size": len(X_train),
    })

    borutaRandomForest = RandomForestClassifier(**forest_params)
    featureSelector = BorutaPy(borutaRandomForest, verbose=2, **boruta_params)

    # The selector is fitted on the underlying arrays rather than the pandas
    # objects; column names are recovered below through the boolean masks.
    featureSelector.fit(X_train.values, y_train.values)

    confirmed_mask = featureSelector.support_       # True only for confirmed features
    tentative_mask = featureSelector.support_weak_  # True only for tentative features

    confirmedFeatures = X_train.columns[confirmed_mask].tolist()
    tentativeFeatures = X_train.columns[tentative_mask].tolist()
    # Rejected = neither confirmed nor tentative.
    rejectedFeatures = X_train.columns[~(confirmed_mask | tentative_mask)].tolist()

    # Log metrics
    mlflow.log_metrics({
        "n_confirmed": len(confirmedFeatures),
        "n_tentative": len(tentativeFeatures),
        "n_rejected":  len(rejectedFeatures),
    })

    # Log every feature's outcome as a run parameter (one param per feature).
    # NOTE(review): MLflow restricts the characters allowed in param keys;
    # confirm the column names are valid keys.
    features_by_status = {
        "Confirmed": confirmedFeatures,
        "Tentative": tentativeFeatures,
        "Rejected": rejectedFeatures,
    }
    for status, features in features_by_status.items():
        for feature in features:
            mlflow.log_param(f"status_{feature}", status)

    # Log selected (confirmed + tentative) features as an artifact.
    selected_features_df = pd.DataFrame({
        "Feature": confirmedFeatures + tentativeFeatures})

    # `index=False` cannot be combined with the default "columns" orient:
    # depending on the pandas version it raises ValueError or is ignored.
    # orient="records" never writes the index and works on every version,
    # yielding [{"Feature": ...}, ...].
    selected_features_path = "data_cache/selected_features.json"
    selected_features_df.to_json(selected_features_path, orient="records")
    mlflow.log_artifact(selected_features_path)