In [1]:
import sys

sys.path.append('../src')

from swfilter import SlicedWassersteinFilter, FastSlicedWassersteinFilter, SplitSlicedWassersteinFilter, SmartSplitSlicedWassersteinFilter

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
from scipy.io import arff
import mlflow
import seaborn as sns

In [3]:
dataset_list = [
                ['Shuttle','Shuttle_withoutdupl_norm_v01'],
                #['KDDCup99','KDDCup99_withoutdupl_norm_catremoved'],
                #['ALOI','ALOI_withoutdupl_norm'],
                ]

In [4]:
def import_dataset(folder_name:str, dataset_name:str)->pd.DataFrame:
    arff_file = arff.loadarff(f'../datasets/{folder_name}/{dataset_name}.arff')
    df_file = pd.DataFrame(arff_file[0])
    features = df_file.drop(columns=['outlier', 'id'])
    label = df_file['outlier']
    return features, label, df_file

def generate_data(folder_name:str, dataset_name:str, test_size:float=0.5, seed:int=42):
    features, label, df_file = import_dataset(folder_name, dataset_name)
    X = np.array(features.values)
    y = np.array(label.values == b'yes')
    y= np.where(y, -1, 1)

    if test_size == 1.0:
        X_test = X
        Y_test = y
        X_train = None 
        Y_train = None
    else:
        X_train, X_test, Y_train, Y_test = sk.model_selection.train_test_split(X, y, test_size=test_size, random_state=seed)
    data = {'X_train':X_train, 'X_test':X_test, 'Y_train':Y_train, 'Y_test':Y_test, 'X':X, 'y':y}
    return data


In [5]:
dataset = dataset_list[0]
dataset_name = dataset[0]
data = generate_data(dataset[0], dataset[1], test_size=1.0, seed=42)
X_train = data['X_train']
X_test = data['X_test']
Y_train = data['Y_train']
Y_test = data['Y_test']
X = data['X']
y = data['y']

print(X_test.shape)

(1013, 9)


In [6]:
import time
start_time = time.time()
eps = 0.02
n = 15
n_projections = 20
p = 0.7

model = SmartSplitSlicedWassersteinFilter(eps=eps, n=n, n_projections=n_projections, p=p, seed=42, n_jobs=1, swtype='original', n_clusters=5, n_splits=5)

In [7]:
res, mean = model.fit_predict(X_test)

In [9]:
print(res.shape)

(1013,)


In [46]:

"""
model = SplitSlicedWassersteinFilter(
    eps=eps,
    n=n,
    n_projections=n_projections,
    p=p,
    seed=42,
    n_jobs=-1,
    swtype='original', 
    n_clusters=100
)
"""

if X_train is not None:
    # Fit the model on the entire training dataset
    Y_pred, vote = model.fit_predict(X_train)
    train_accuracy = sk.metrics.accuracy_score(Y_train, Y_pred)
    train_precision = sk.metrics.precision_score(Y_train, Y_pred, zero_division=0)
    


# Fit the model on the entire training dataset
Y_pred, vote = model.fit_predict(X_test)
test_precision = sk.metrics.precision_score(Y_test, Y_pred, zero_division=0)
test_accuracy = sk.metrics.accuracy_score(Y_test, Y_pred)
test_recall_score = sk.metrics.recall_score(Y_test, Y_pred, zero_division=0)

true_positive = np.sum((Y_test == -1) & (Y_pred == -1))
false_positive = np.sum((Y_test == 1) & (Y_pred == -1))
true_negative = np.sum((Y_test == 1) & (Y_pred == 1))
false_negative = np.sum((Y_test == -1) & (Y_pred == 1))
end_time = time.time()

print(f"total time: {end_time - start_time}")
print(f"test_accuracy: {test_accuracy}")
print(f"test_precision: {test_precision}")
print(f"test_recall_score: {test_recall_score}")
print(f"true_positive: {true_positive}")
print(f"false_positive: {false_positive}")
print(f"true_negative: {true_negative}")
print(f"false_negative: {false_negative}")
       
        

total time: 305.28289818763733
test_accuracy: 0.7382807768401503
test_precision: 0.9715169445938677
test_recall_score: 0.7521134385541165
true_positive: 449
false_positive: 11905
true_negative: 36121
false_negative: 1059
