In [1]:
import requests
import time
import pandas as pd
import numpy as np
import json
from scipy.stats import uniform
from statistics import mean
from joblib import dump, load
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, balanced_accuracy_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [4]:
def refit(result):
    """Select the best candidate index from a search's cross-validation results.

    Intended to be passed as the ``refit`` callable of a scikit-learn search
    object. Candidates are ranked by, in order of priority: accuracy, balanced
    accuracy, macro recall, macro F1, macro precision and ROC AUC (all
    descending), with the mean fit time (ascending) as the final tie-breaker.

    The ranking is done entirely in memory; no temporary file is written, so
    the function is portable and safe to call from concurrent runs.

    Parameters
    ----------
    result : dict
        Mapping from column name to one value per candidate. It must contain
        every column listed in ``ranking_columns`` below.

    Returns
    -------
    int
        Zero-based position of the winning candidate.

    Raises
    ------
    KeyError
        If one of the ranking columns is missing from ``result``.
    """
    ranking_columns = [
        'mean_test_accuracy',
        'mean_test_balanced_accuracy',
        'mean_test_recall_macro',
        'mean_test_f1_macro',
        'mean_test_precision_macro',
        'mean_test_roc_auc',
        'mean_fit_time',
    ]
    # Higher is better for every score; lower is better for the fit time.
    ascending = [False, False, False, False, False, False, True]

    # Only the ranking columns are needed; the default RangeIndex of the frame
    # is the candidate position.
    candidates = pd.DataFrame({column: result[column] for column in ranking_columns})
    ranked = candidates.sort_values(by=ranking_columns, ascending=ascending)
    return int(ranked.index[0])

# Metrics evaluated for every candidate during cross-validation.
scoring = [
    'accuracy',
    'balanced_accuracy',
    'recall_macro',
    'f1_macro',
    'precision_macro',
    'roc_auc',
]

# Input CSV files, one entry per dataset variant. Uncomment an entry to
# include that dataset in the run.
datasets = [
    dict(dataset="10", path="/media/kmdr7/Seagate/TA/DATASETS/Preparation/Feature Importance/Dataset10.csv"),
#     dict(dataset="13", path="/media/kmdr7/Seagate/TA/DATASETS/Preparation/Feature Importance/Dataset13.csv"),
#     dict(dataset="15", path="/media/kmdr7/Seagate/TA/DATASETS/Preparation/Feature Importance/Dataset15.csv"),
#     dict(dataset="17", path="/media/kmdr7/Seagate/TA/DATASETS/Preparation/Feature Importance/Dataset17.csv"),
#     dict(dataset="20", path="/media/kmdr7/Seagate/TA/DATASETS/Preparation/Feature Importance/Dataset20.csv"),
]

# Estimators to tune. Each entry pairs an unfitted classifier ("clf") with the
# name ("model") under which it is referred to elsewhere in the notebook.
# Uncomment a pair to include that estimator in the run.
mls = [
    {"clf": classifier, "model": model_name}
    for model_name, classifier in (
#         ("DecisionTree", DecisionTreeClassifier()),
#         ("LogisticRegression", LogisticRegression()),
        ("RandomForest", RandomForestClassifier()),
        ("XGBoost", XGBClassifier(use_label_encoder=False)),
        ("Bagging", BaggingClassifier()),
    )
]

# Hyperparameter search spaces, keyed by model name. Lists are sampled as
# discrete choices; scipy distributions are sampled continuously.
params = {
    "DecisionTree": {
        "criterion": ['gini', 'entropy'],
        "splitter": ['best', 'random'],
        # 'auto' was an alias of 'sqrt' for classifiers (a duplicate choice)
        # and is rejected by scikit-learn >= 1.3, so 'log2' is offered instead.
        "max_features": ['sqrt', 'log2'],
        "max_depth": [None, 1, 3, 5, 10, 15, 20],
        "min_samples_split": [5, 10],
        "min_samples_leaf": [5, 10],
    },
    "RandomForest": {
        "criterion": ['gini', 'entropy'],
        # See the DecisionTree note: 'auto' duplicated 'sqrt' and is removed
        # in recent scikit-learn releases.
        "max_features": ['sqrt', 'log2'],
        "max_depth": [None, 1, 3, 5, 10],
        # An integer min_samples_split must be at least 2; a value of 1 makes
        # every fit of that candidate fail, wasting search iterations.
        "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10],
        "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        "bootstrap": [True, False],
    },
    "XGBoost": {
        "booster": ['gbtree', 'dart'],
        "max_depth": [None, 1, 3, 5, 10],
        "n_estimators": [50, 100, 150, 200, 250],
        "eta": np.linspace(0.01, 0.03, 50),
    },
    "Bagging": {
        "n_estimators": [50, 100, 150, 200, 250],
        "bootstrap": [True, False],
    },
    # NOTE(review): not every solver/penalty pair below is supported by
    # LogisticRegression (e.g. 'l1' with 'lbfgs', or 'elasticnet' without an
    # l1_ratio), so some sampled candidates will fail to fit. Confirm the
    # intended combinations before enabling this model.
    "LogisticRegression": {
        "solver": ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        "tol": np.linspace(0.01, 0.002, 50),
        "C": uniform(loc=0, scale=4),
        "max_iter": [100, 150, 200, 250, 300],
        "penalty": ['l2', 'l1', 'elasticnet'],
    },
}

# Search budget: number of sampled candidates per search and number of
# parallel workers.
N_ITER, N_JOBS = 10, 4

# 10-fold stratified cross-validation splitter.
cv = StratifiedKFold(n_splits=10)

# Feature scaler instance (unfitted).
scaler = StandardScaler()

In [None]:
# For every dataset, tune each configured estimator with a randomized search
# and report the selected hyperparameters and elapsed time to the results API.
for dts in datasets:

    # Load the dataset and separate the features from the "Label" target.
    dataset = pd.read_csv(dts["path"])
    X = dataset.drop(columns=["Label"])
    y = dataset["Label"]
    # Hold out 1/7 of the rows; only the training part is used by the search.
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=1 / 7.0, random_state=1
    )

    for ml in mls:
        estimator = ml["clf"]
        # perf_counter is monotonic, so the measured duration cannot be skewed
        # by system clock adjustments during a long search.
        start = time.perf_counter()
        search = RandomizedSearchCV(
            estimator,
            params[ml["model"]],
            n_iter=N_ITER,
            refit=refit,
            scoring=scoring,
            cv=cv,
            n_jobs=N_JOBS,
        )
        search.fit(x_train, y_train)
        end = time.perf_counter()

        payload = {
            "code": "001-WithoutScaler",
            "algorithm": ml["model"],
            "dataset": "IoT-23 Mirai 48-1 + IoT Traffic Traces " + dts["dataset"],
            "params": json.dumps(search.best_params_),
            "time": float(end - start),
        }

        # The search is expensive, so a reporting failure must neither go
        # unnoticed nor abort the remaining searches: bound the request with a
        # timeout, treat HTTP error statuses as failures, and on any failure
        # print the payload so the result can be recorded manually.
        try:
            response = requests.post(
                "http://localhost:8000/api/v1/hyperparameter",
                json=payload,
                timeout=30,
            )
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"Could not record {ml['model']} result for dataset {dts['dataset']}: {exc}")
            print(payload)

    # Release the full dataset before loading the next one.
    del dataset
    del X
    del y