In [1]:
# Constants
Z_LAYER_SIZE = 32
TEST_PERCENT = 0.2
MODEL_NAME = ""

In [2]:
# Crash if AE didn't run

try:
    open("failed", "r")
    crash()
except FileNotFoundError:
    pass

NameError: name 'crash' is not defined

In [3]:
# Load model
import pandas as pd
from statistics import mean

try:
    MODEL_NAME = pd.read_csv("ae_trials.csv").to_dict("records")[-1]["name"]
    Z_LAYER_SIZE = pd.read_csv("ae_trials.csv").to_dict("records")[-1]["input_size"]//pd.read_csv("ae_trials.csv").to_dict("records")[-1]["z_layer_divisor"]
except (FileNotFoundError, pd.errors.EmptyDataError):
    pass

def save_results(name, hyper_params, y_test, y_predict_probas, no_proba=False):
    try:
        previous_trials = pd.read_csv("latent_trials.csv").to_dict("records")
    except (FileNotFoundError, pd.errors.EmptyDataError):
        previous_trials = []

    model_final_stats = {"Classifier": name, "Based on AE": MODEL_NAME}
    model_final_stats.update(hyper_params)
    
    y_final_pred = [round(v) for v in y_predict_probas]
    f_score = precision_recall_fscore_support(y_test, y_final_pred, average="binary")
    
    try:
        model_final_stats["precision"] = f_score[0]
        model_final_stats["recall"] = f_score[1]
        model_final_stats["f-score"] = f_score[2]
    except ValueError:
        model_final_stats["precision"] = None
        model_final_stats["recall"] = None
        model_final_stats["f-score"] = None
    
    if not no_proba:
        try:
            model_final_stats["auc"] = roc_auc_score(y_test, y_predict_probas)
        except ValueError:
            model_final_stats["auc"] = None
    else:
        model_final_stats["auc"] = None

    if not (model_final_stats["recall"] == 1.0 and model_final_stats["precision"] < 0.6):
        previous_trials.append(model_final_stats)
        pd.DataFrame(previous_trials).to_csv("latent_trials.csv", index=None)
    
    print(model_final_stats)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [4]:
# Load datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score

folds = []

split = pd.read_pickle("z_layers/%s.pickle.gzip" % MODEL_NAME).values.tolist()
split = [(d[2:], d[0]) for d in split]

x_train, x_test, y_train, y_test = train_test_split([s[0] for s in split], [s[1] for s in split], test_size=TEST_PERCENT)

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV

hyper_params = {"max_iter": [100]}
clf = LogisticRegression()

scaler = MinMaxScaler()
grid_search = GridSearchCV(clf, hyper_params, refit=True, n_jobs=-1, scoring="f1")

train_x = scaler.fit_transform(x_train)
test_x = scaler.transform(x_test)

grid_search.fit(x_train, y_train)
save_results("LogisticRegression", grid_search.best_params_, y_test, grid_search.predict_proba(x_test)[:, 1].tolist())

{'Classifier': 'LogisticRegression', 'Based on AE': 20240128151358, 'max_iter': 100, 'precision': 0.3870967741935484, 'recall': 0.5217391304347826, 'f-score': 0.4444444444444444, 'auc': 0.4571805006587615}


In [6]:
from sklearn.ensemble import ExtraTreesClassifier

hyper_params = {"n_estimators": [10, 100, 1000]}
clf = ExtraTreesClassifier()

scaler = MinMaxScaler()
grid_search = GridSearchCV(clf, hyper_params, refit=True, n_jobs=-1, scoring="f1")

train_x = scaler.fit_transform(x_train)
test_x = scaler.transform(x_test)

grid_search.fit(x_train, y_train)
save_results("ExtraTreesClassifier", grid_search.best_params_, y_test, grid_search.predict_proba(x_test)[:, 1].tolist())

{'Classifier': 'ExtraTreesClassifier', 'Based on AE': 20231117064744, 'n_estimators': 10, 'precision': 1.0, 'recall': 1.0, 'f-score': 1.0, 'auc': 1.0}


In [7]:
from sklearn.ensemble import RandomForestClassifier

hyper_params = {"n_estimators": [10, 100, 1000]}
clf = RandomForestClassifier()

scaler = MinMaxScaler()
grid_search = GridSearchCV(clf, hyper_params, refit=True, n_jobs=-1, scoring="f1")

train_x = scaler.fit_transform(x_train)
test_x = scaler.transform(x_test)

grid_search.fit(x_train, y_train)
save_results("RandomForestClassifier", grid_search.best_params_, y_test, grid_search.predict_proba(x_test)[:, 1].tolist())

{'Classifier': 'RandomForestClassifier', 'Based on AE': 20231117064744, 'n_estimators': 100, 'precision': 1.0, 'recall': 1.0, 'f-score': 1.0, 'auc': 1.0}


In [8]:
from sklearn.ensemble import GradientBoostingClassifier

hyper_params = {"max_depth": [5], "min_samples_leaf": [1]}
clf = GradientBoostingClassifier()

scaler = MinMaxScaler()
grid_search = GridSearchCV(clf, hyper_params, refit=True, n_jobs=-1, scoring="f1")

train_x = scaler.fit_transform(x_train)
test_x = scaler.transform(x_test)

grid_search.fit(x_train, y_train)
save_results("GradientBoostingClassifier", grid_search.best_params_, y_test, grid_search.predict_proba(x_test)[:, 1].tolist())

{'Classifier': 'GradientBoostingClassifier', 'Based on AE': 20231117064744, 'max_depth': 5, 'min_samples_leaf': 1, 'precision': 1.0, 'recall': 1.0, 'f-score': 1.0, 'auc': 1.0}


In [9]:
# XGB
from xgboost import XGBClassifier

hyper_params = {"numEstimators": [1000]}
clf = XGBClassifier()

scaler = MinMaxScaler()
grid_search = GridSearchCV(clf, hyper_params, refit=True, n_jobs=-1, scoring="f1")

train_x = scaler.fit_transform(x_train)
test_x = scaler.transform(x_test)

grid_search.fit(x_train, y_train)
save_results("XGBClassifier", grid_search.best_params_, y_test, grid_search.predict_proba(x_test)[:, 1].tolist())

Parameters: { "numEstimators" } are not used.

Parameters: { "numEstimators" } are not used.

Parameters: { "numEstimators" } are not used.

Parameters: { "numEstimators" } are not used.

Parameters: { "numEstimators" } are not used.

Parameters: { "numEstimators" } are not used.



{'Classifier': 'XGBClassifier', 'Based on AE': 20231117064744, 'numEstimators': 1000, 'precision': 0.0, 'recall': 0.0, 'f-score': 0.0, 'auc': 0.5}


  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
from sklearn.svm import SVC

hyper_params = {"kernel": ["rbf"], "C": [1, 10, 100], "gamma": [0.0001, 0.001, 0.01, 0.1], "probability": [True]}
clf = SVC()

scaler = MinMaxScaler()
grid_search = GridSearchCV(clf, hyper_params, refit=True, n_jobs=-1, scoring="f1")

train_x = scaler.fit_transform(x_train)
test_x = scaler.transform(x_test)

grid_search.fit(x_train, y_train)
save_results("SVC", grid_search.best_params_, y_test, grid_search.predict_proba(x_test)[:, 1].tolist())

{'Classifier': 'SVC', 'Based on AE': 20231117064744, 'C': 1, 'gamma': 0.1, 'kernel': 'rbf', 'probability': True, 'precision': 1.0, 'recall': 1.0, 'f-score': 1.0, 'auc': 1.0}


In [11]:
from sklearn.neighbors import KNeighborsClassifier

hyper_params = {"algorithm": ["kd_tree"], "leaf_size": list(range(1, 50 + 1, 5)), "n_neighbors": list(range(1, 20 + 1, 5))}
clf = KNeighborsClassifier()

scaler = MinMaxScaler()
grid_search = GridSearchCV(clf, hyper_params, refit=True, n_jobs=-1, scoring="f1")

train_x = scaler.fit_transform(x_train)
test_x = scaler.transform(x_test)

grid_search.fit(x_train, y_train)
save_results("KNeighborsClassifier", grid_search.best_params_, y_test, grid_search.predict_proba(x_test)[:, 1].tolist())

{'Classifier': 'KNeighborsClassifier', 'Based on AE': 20231117064744, 'algorithm': 'kd_tree', 'leaf_size': 1, 'n_neighbors': 1, 'precision': 1.0, 'recall': 1.0, 'f-score': 1.0, 'auc': 1.0}


In [12]:
# # Run a t-SNE for some epic graphs
# from sklearn.manifold import TSNE
# from matplotlib import pyplot as plt
# import numpy as np

# t_sne = TSNE()
# test_x = t_sne.fit_transform(np.array(x_test))
# test_x_pos = [(x[0], x[1]) for x, y in zip(x_test, y_test) if y == 1]
# test_x_neg = [(x[0], x[1]) for x, y in zip(x_test, y_test) if y == 0]
# plt.scatter([v[0] for v in test_x_pos], [v[1] for v in test_x_pos], label="Signal")
# plt.scatter([v[0] for v in test_x_neg], [v[1] for v in test_x_neg], label="Noise")
# plt.legend()
# plt.title("t-SNE: %s" % MODEL_NAME)
# plt.show()

In [12]:
# One class GaussianMixture
from sklearn.mixture import GaussianMixture
import numpy as np

x_train, y_train = np.array(x_train), np.array(y_train)

class GaussianOutlierClassifier(GaussianMixture):
    def __init__(self, sigma_base=0, sigma_mult=1):
        super().__init__()
        self.sigma_base = sigma_base
        self.sigma_mult = sigma_mult
        self.sigma = sigma_base * sigma_mult

    def fit(self, X, y, **fit_params):
        self.set_params(**fit_params)
        return super().fit(X, y)
    
    def predict(self, X):
        return (self.score_samples(X) < self.sigma).astype(float)

    def set_params(self, **params):
        self.sigma_base = params.get("sigma_base", self.sigma_base)
        self.sigma_mult = params.get("sigma_mult", self.sigma_mult)
        self.sigma = self.sigma_base * self.sigma_mult
        super().set_params(**params)
        return self
        
hyper_params = {"sigma_base": np.arange(-10, 10, 0.5), "sigma_mult": [0.01, 0.1, 0, 1, 10, 100, 1000]}
clf = GaussianOutlierClassifier()

grid_search = GridSearchCV(clf, hyper_params, refit=True, n_jobs=-1, scoring="f1", error_score="raise")
scaler = MinMaxScaler()

train_x = scaler.fit_transform(x_train[y_train==0])
test_x = scaler.transform(x_test)

grid_search.fit(train_x, y_train[y_train==0])
save_results("GMM", grid_search.best_params_, y_test, grid_search.predict(test_x).tolist())

# pd.DataFrame({"Real Y": np.array(y_test).astype(int), "Pred Y": grid_search.predict(test_x).astype(int)}).to_csv("%d.csv" % MODEL_NAME)

In [14]:
# from sklearn.ensemble import IsolationForest

# class IFOutlierClassifier(IsolationForest):
#     def __init__(self, n_estimators=100, sigma_base=0, sigma_mult=1):
#         super().__init__(n_estimators=n_estimators)
#         self.sigma_base = sigma_base
#         self.sigma_mult = sigma_mult
#         self.sigma = sigma_base * sigma_mult

#     def fit(self, X, y, **fit_params):
#         self.set_params(**fit_params)
#         return super().fit(X, y)
    
#     def predict(self, X):
#         return (self.score_samples(X) < self.sigma).astype(float)

#     def set_params(self, **params):
#         self.sigma_base = params.get("sigma_base", self.sigma_base)
#         self.sigma_mult = params.get("sigma_mult", self.sigma_mult)
#         self.sigma = self.sigma_base * self.sigma_mult
#         super().set_params(**params)
#         return self

# hyper_params = {"n_estimators": [10, 100, 1000], "sigma_base": np.arange(-10, 10, 0.5), "sigma_mult": [0.01, 0.1, 0, 1, 10, 100, 1000]}
# clf = IFOutlierClassifier()

# scaler = MinMaxScaler()
# grid_search = GridSearchCV(clf, hyper_params, refit=True, n_jobs=-1, scoring="f1")

# train_x = scaler.fit_transform(x_train[y_train==0])
# test_x = scaler.transform(x_test)

# grid_search.fit(train_x, y_train[y_train==0])
# save_results("IsolationForrestClassifier", grid_search.best_params_, y_test, grid_search.predict(x_test).tolist())

In [15]:
# from sklearn.neighbors import LocalOutlierFactor

# class LocalOutlierClassifier(LocalOutlierFactor):
#     def __init__(self, algorithm="kd_tree", leaf_size=50, n_neighbors=30, sigma_base=0, sigma_mult=1):
#         super().__init__(algorithm=algorithm, leaf_size=leaf_size, n_neighbors=n_neighbors, novelty=True)
#         self.sigma_base = sigma_base
#         self.sigma_mult = sigma_mult
#         self.sigma = sigma_base * sigma_mult

#     def fit(self, X, y, **fit_params):
#         self.set_params(**fit_params)
#         return super().fit(X, y)
    
#     def predict(self, X):
#         return (self.score_samples(X) < self.sigma).astype(float)

#     def set_params(self, **params):
#         self.sigma_base = params.get("sigma_base", self.sigma_base)
#         self.sigma_mult = params.get("sigma_mult", self.sigma_mult)
#         self.sigma = self.sigma_base * self.sigma_mult
#         super().set_params(**params)
#         return self

# hyper_params = {"algorithm": ["kd_tree"], "leaf_size": list(range(1, 50 + 1, 5)), "n_neighbors": list(range(1, 30 + 1, 5)), "sigma_base": np.arange(-10, 10, 0.5), "sigma_mult": [0.01, 0.1, 0, 1, 10, 100, 1000]}
# clf = LocalOutlierClassifier()

# scaler = MinMaxScaler()
# grid_search = GridSearchCV(clf, hyper_params, refit=True, n_jobs=-1, scoring="f1")

# train_x = scaler.fit_transform(x_train[y_train==0])
# test_x = scaler.transform(x_test)

# grid_search.fit(train_x, y_train[y_train==0])
# save_results("LocalOutlierClassifier", grid_search.best_params_, y_test, grid_search.predict(x_test)[:, 1].tolist())

In [16]:
# ABOD
from pyod.models.abod import ABOD

hyper_params = {"contamination": [0.0000001], "n_neighbors": list(range(1, 20 + 1, 5))}
clf = ABOD()

scaler = MinMaxScaler()
grid_search = GridSearchCV(clf, hyper_params, refit=True, n_jobs=-1, scoring="f1")

train_x = scaler.fit_transform(x_train[y_train==0])
test_x = scaler.transform(x_test)

grid_search.fit(train_x, y_train[y_train==0])
save_results("ABOD", grid_search.best_params_, y_test, grid_search.predict(test_x).tolist())

  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
Traceback (most recent call last):
  File "/home/ian/miniconda3/envs/tf/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 811, in _score
    scores = scorer(estimator, X_test)
TypeError: _BaseScorer.__call__() missing 1 required positional argument: 'y_true'

Traceback (most recent call last):
  File "/home/ian/miniconda3/envs/tf/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 811, in _score
    scores = scorer(estimator, X_test)
TypeError: _BaseScorer.__call__() missing 1 required positional argument: 'y_true'

  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, di

{'Classifier': 'ABOD', 'Based on AE': 20231117064744, 'contamination': 1e-07, 'n_neighbors': 1, 'precision': 0.0, 'recall': 0.0, 'f-score': 0.0, 'auc': 0.5}


Traceback (most recent call last):
  File "/home/ian/miniconda3/envs/tf/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 811, in _score
    scores = scorer(estimator, X_test)
TypeError: _BaseScorer.__call__() missing 1 required positional argument: 'y_true'

Traceback (most recent call last):
  File "/home/ian/miniconda3/envs/tf/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 811, in _score
    scores = scorer(estimator, X_test)
TypeError: _BaseScorer.__call__() missing 1 required positional argument: 'y_true'

  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
# COPOD
from pyod.models.copod import COPOD

hyper_params = {"contamination": [0.0000001]}
clf = ABOD()

scaler = MinMaxScaler()
grid_search = GridSearchCV(clf, hyper_params, refit=True, n_jobs=-1, scoring="f1")

train_x = scaler.fit_transform(x_train[y_train==0])
test_x = scaler.transform(x_test)

grid_search.fit(train_x, y_train[y_train==0])
save_results("COPOD", grid_search.best_params_, y_test, grid_search.predict(test_x).tolist())

Traceback (most recent call last):
  File "/home/ian/miniconda3/envs/tf/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 811, in _score
    scores = scorer(estimator, X_test)
TypeError: _BaseScorer.__call__() missing 1 required positional argument: 'y_true'

Traceback (most recent call last):
  File "/home/ian/miniconda3/envs/tf/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 811, in _score
    scores = scorer(estimator, X_test)
TypeError: _BaseScorer.__call__() missing 1 required positional argument: 'y_true'

Traceback (most recent call last):
  File "/home/ian/miniconda3/envs/tf/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 811, in _score
    scores = scorer(estimator, X_test)
TypeError: _BaseScorer.__call__() missing 1 required positional argument: 'y_true'

Traceback (most recent call last):
  File "/home/ian/miniconda3/envs/tf/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", lin

{'Classifier': 'COPOD', 'Based on AE': 20231117064744, 'contamination': 1e-07, 'precision': 1.0, 'recall': 1.0, 'f-score': 1.0, 'auc': 1.0}


In [18]:
# COPOD
from pyod.models.hbos import HBOS

hyper_params = {"contamination": [0.0000001], "n_bins": list(range(3, 20 + 1, 5))}
clf = HBOS()

scaler = MinMaxScaler()
grid_search = GridSearchCV(clf, hyper_params, refit=True, n_jobs=-1, scoring="f1")

train_x = scaler.fit_transform(x_train[y_train==0])
test_x = scaler.transform(x_test)

grid_search.fit(train_x, y_train[y_train==0])
save_results("HBOS", grid_search.best_params_, y_test, grid_search.predict(test_x).tolist())

Traceback (most recent call last):
  File "/home/ian/miniconda3/envs/tf/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 811, in _score
    scores = scorer(estimator, X_test)
TypeError: _BaseScorer.__call__() missing 1 required positional argument: 'y_true'

Traceback (most recent call last):
  File "/home/ian/miniconda3/envs/tf/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 811, in _score
    scores = scorer(estimator, X_test)
TypeError: _BaseScorer.__call__() missing 1 required positional argument: 'y_true'

Traceback (most recent call last):
  File "/home/ian/miniconda3/envs/tf/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 811, in _score
    scores = scorer(estimator, X_test)
TypeError: _BaseScorer.__call__() missing 1 required positional argument: 'y_true'

Traceback (most recent call last):
  File "/home/ian/miniconda3/envs/tf/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", lin

{'Classifier': 'HBOS', 'Based on AE': 20231117064744, 'contamination': 1e-07, 'n_bins': 3, 'precision': 1.0, 'recall': 1.0, 'f-score': 1.0, 'auc': 1.0}
