In [138]:
import sys

sys.path.append('../src')

from swfilter import sw_outlier_detector

In [139]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
from scipy.io import arff
import mlflow


First, run this from terminal :

mlflow server --host 127.0.0.1 --port 8081

In [148]:
mlflow.set_tracking_uri(uri="http://127.0.0.1:8081")
# Create a new MLflow Experiment
experiment_id = "swfilter_test"
experiment = mlflow.set_experiment(experiment_id)

# Get Experiment Details
print("Experiment_id: {}".format(experiment.experiment_id))
print("Artifact Location: {}".format(experiment.artifact_location))
print("Name: {}".format(experiment.name))
print("Tags: {}".format(experiment.tags))
print("Lifecycle_stage: {}".format(experiment.lifecycle_stage))

Experiment_id: 791358303679491057
Artifact Location: mlflow-artifacts:/791358303679491057
Name: swfilter_test
Tags: {}
Lifecycle_stage: active


In [142]:
def import_dataset(folder_name:str, dataset_name:str)->pd.DataFrame:
    arff_file = arff.loadarff(f'../datasets/{folder_name}/{dataset_name}.arff')
    df_file = pd.DataFrame(arff_file[0])
    features = df_file.drop(columns=['outlier', 'id'])
    label = df_file['outlier']
    return features, label, df_file
    

In [174]:
#features, label, data = import_dataset('Lymphography', 'Lymphography_withoutdupl_norm_idf')
dataset_name = 'Ionosphere'
dataset_precise_name = 'Ionosphere_withoutdupl_norm'
features, label, data = import_dataset(dataset_name, dataset_precise_name)
X = features.values
y = label.values == b'yes'
y = np.where(y, -1, 1)
display(y)



array([ 1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1,
       -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,
        1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1,
       -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,
        1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1,
       -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1,  1, -1,  1, -1,  1,
       -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,
        1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1,
       -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,
        1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1,
       -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,
        1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1,
       -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,
        1, -1,  1, -1,  1

## Comparateurs

In [164]:
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest


# Local outlier factor
n_neighbors = 20
contamination = 0.1
leaf_size = 30
clf_lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=contamination, algorithm='auto', n_jobs=-1, leaf_size=leaf_size)
y_lof = clf_lof.fit_predict(X)



In [165]:
# Isolation forest
max_samples = 'auto'
contamination = 'auto'
max_features = X.shape[1]
clf_forest = IsolationForest(max_samples=max_samples, contamination=contamination, max_features=max_features, random_state=42, n_jobs=-1)
y_forest = clf_forest.fit_predict(X)

In [167]:
eps = 0.01
n = 30
n_projections = 100
p = 0.9
# SW filter original
y_swo = sw_outlier_detector(data = X, eps = eps, n = n, n_projections=n_projections, seed = 42, p = p, n_jobs=-1, swtype='original')



In [168]:
# SW filter
eps_spherical = 0.002
y_sws = sw_outlier_detector(data = X, eps = eps_spherical, n = 3, n_projections=10, seed = 42,  p = p, n_jobs=-1, swtype='spherical')


In [169]:
models = ['outlier_bool', 'lof', 'forest', 'swo', 'sws']
data['lof'] = y_lof < 0
data['forest'] = y_forest < 0
data['swo'] = y_swo
data['sws'] = y_sws
data['outlier_bool'] = data['outlier'] == b'yes'

In [170]:
dict_results = {}
for model in models:
    true_positives = (data.loc[(data['outlier_bool'] == True) & (data[model] == True)]).count()[0]
    false_positives =(data.loc[(data['outlier_bool'] == False) & (data[model] == True)]).count()[0]
    true_negatives = (data.loc[(data['outlier_bool'] == False) & (data[model] == False)]).count()[0]
    false_negatives = (data.loc[(data['outlier_bool'] == True) & (data[model] == False)]).count()[0]
    precision = (true_positives + true_negatives) / (true_positives + true_negatives + false_positives + false_negatives)
    dict_results[model] = {'true_positives': true_positives, 'false_positives': false_positives, 'true_negatives': true_negatives, 'false_negatives': false_negatives, 'precision': precision}
    

df_results = pd.DataFrame(dict_results)

display(df_results)

Unnamed: 0,outlier_bool,lof,forest,swo,sws
true_positives,126.0,33.0,76.0,0.0,17.0
false_positives,0.0,2.0,22.0,0.0,0.0
true_negatives,225.0,223.0,203.0,225.0,225.0
false_negatives,0.0,93.0,50.0,126.0,109.0
precision,1.0,0.729345,0.794872,0.641026,0.689459


## hyperopt

In [171]:
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn.metrics import precision_score, make_scorer
from functools import partial
import mlflow

# Define search spaces
space_lof = {
    'n_neighbors': hp.quniform('n_neighbors', 5, 50,1),
    'algorithm': hp.choice('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute']),
    'leaf_size': hp.quniform('leaf_size', 5, 50, 1),
    'metric': hp.choice('metric', ['euclidean', 'manhattan', 'chebyshev', 'minkowski'])
}

space_forest = {
    'n_estimators': hp.quniform('n_estimators', 100, 200, 1),
    'max_samples': hp.quniform('max_samples', 1, X.shape[1],1),
    'contamination': hp.loguniform('contamination', -7, -0.70),
    'max_features': hp.quniform('max_features', 1, 10, 1)
}


# Adjusting the objective functions to include cross-validation on training data and evaluation on testing data

def objective_lof(space, X_train, Y_train, X_test, Y_test, scoring, experiment_id, dataset_name):
    mlflow.set_experiment(experiment_id)
    with mlflow.start_run():
        n_neighbors = int(space['n_neighbors'])
        algorithm = space['algorithm']
        leaf_size = int(space['leaf_size'])
        metric = space['metric']
        
        model = LocalOutlierFactor(
            n_neighbors=n_neighbors,
            algorithm=algorithm,
            leaf_size=leaf_size,
            metric=metric,
            novelty=True
        )
        
        # Cross-validation on training data
        cv_score = cross_val_score(model, X_train, Y_train, cv=5, scoring=scoring).mean()
        
        # Fit the model on the entire training dataset
        model.fit(X_train)
        
        # Evaluate on the testing data
        Y_pred = model.predict(X_test)
        test_score = scoring(model, X_test, Y_test)
        
        # Log parameters, cross-validation score, and testing score
        mlflow.log_param("n_neighbors", n_neighbors)
        mlflow.log_param("algorithm", algorithm)
        mlflow.log_param("leaf_size", leaf_size)
        mlflow.log_param("metric", metric)
        mlflow.log_param("dataset_name", dataset_name)
        mlflow.log_metric("cv_score", cv_score)
        mlflow.log_metric("test_score", test_score)
        mlflow.set_tag("model", "LocalOutlierFactor")
        
        return {'loss': -cv_score, 'status': STATUS_OK}

def objective_forest(space, X_train, Y_train, X_test, Y_test, scoring, experiment_id, dataset_name):
    mlflow.set_experiment(experiment_id)
    with mlflow.start_run():
        n_estimators = int(space['n_estimators'])
        max_samples = int(space['max_samples'])
        contamination = space['contamination']
        max_features = int(space['max_features'])
        
        model = IsolationForest(
            n_estimators=n_estimators,
            max_samples=max_samples,
            contamination=contamination,
            max_features=max_features,
            random_state=42
        )
        
        # Cross-validation on training data
        cv_score = cross_val_score(model, X_train, Y_train, cv=5, scoring=scoring).mean()
        
        # Fit the model on the entire training dataset
        model.fit(X_train)
        
        # Evaluate on the testing data
        Y_pred = model.predict(X_test)
        test_score = scoring(model, X_test, Y_test)
        
        # Log parameters, cross-validation score, and testing score
        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("max_samples", max_samples)
        mlflow.log_param("contamination", contamination)
        mlflow.log_param("max_features", max_features)
        mlflow.log_param("dataset_name", dataset_name)
        mlflow.log_metric("cv_score", cv_score)
        mlflow.log_metric("test_score", test_score)
        mlflow.set_tag("model", "IsolationForest")
        
        return {'loss': -cv_score, 'status': STATUS_OK}
    



In [172]:
X_train, X_test, Y_train, Y_test = sk.model_selection.train_test_split(X, y, test_size=0.4, random_state=42)

In [175]:
# Example of using partial to create a function with pre-filled parameters
# Create a precision scorer object
scoring = make_scorer(precision_score, zero_division=0)
objective_lof_partial = partial(objective_lof, X_train=X_train, Y_train=Y_train, X_test=X_test, Y_test=Y_test, scoring=scoring, experiment_id=experiment_id, dataset_name=dataset_name)
objective_forest_partial = partial(objective_forest, X_train=X_train, Y_train=Y_train, X_test=X_test, Y_test=Y_test, scoring=scoring, experiment_id=experiment_id, dataset_name=dataset_name)
max_evals = 20
# Example of running a trial for LOF with the partial function
trials_lof = Trials()
best_lof = fmin(fn=objective_lof_partial,
                space=space_lof,
                algo=tpe.suggest,
                max_evals=max_evals,
                trials=trials_lof)

# Example of running a trial for Isolation Forest with the partial function
trials_iforest = Trials()
best_iforest = fmin(fn=objective_forest_partial,
                    space=space_forest,
                    algo=tpe.suggest,
                    max_evals=max_evals,
                    trials=trials_iforest)

100%|██████████| 20/20 [00:01<00:00, 10.12trial/s, best loss: -0.9275252877793712]
100%|██████████| 20/20 [00:13<00:00,  1.50trial/s, best loss: -0.7999245852187029]
