In [1]:
import pandas as pd
import numpy as np
from joblib import dump, load
from scipy.stats import uniform
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, balanced_accuracy_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, RepeatedStratifiedKFold
from sklearn.preprocessing import MinMaxScaler

In [2]:
# One entry per feature-importance dataset: "dataset" holds the tag as a string
# and "path" holds the CSV location, whose file name embeds that same tag.
datasets = [
    {
        "dataset": tag,
        "path": f"/media/kmdr7/Seagate/TA/DATASETS/Preparation/Feature Importance/Dataset{tag}.csv",
    }
    for tag in ("10", "13", "15", "17", "20")
]

# Candidate classifiers, each stored under "clf" next to a short "model" label.
# NOTE(review): apart from LogisticRegression, none of the estimator classes
# used here is imported by the import cell visible at the top of this
# notebook — confirm they are brought into scope elsewhere, otherwise this
# cell raises NameError on a fresh kernel.
mls = [
    {"clf": classifier, "model": label}
    for label, classifier in (
        ("AdaBoost", AdaBoostClassifier()),
        ("DecisionTree", DecisionTreeClassifier(criterion="entropy")),
        ("SVM", LinearSVC()),
        ("ANN", MLPClassifier(hidden_layer_sizes=(10, 10, 10), max_iter=1000)),
        ("LogisticRegression", LogisticRegression(solver='newton-cg')),
        ("RandomForest", RandomForestClassifier()),
        ("XGBoost", XGBClassifier(use_label_encoder=False)),
        ("Bagging", BaggingClassifier()),
    )
]

In [3]:
# Read the CSV at the path below and index the resulting frame with `cols`.
# NOTE(review): `cols` is not defined in any cell visible in this part of the
# notebook — confirm it is assigned before this cell runs on a fresh kernel.
# The following cell reads a "Label" column, so presumably `cols` includes it.
dataset = pd.read_csv("/media/kmdr7/Seagate/TA/DATASETS/newDatasetSampledEncoded.csv")[cols]

In [4]:
# Separate the target from the predictors: `y` is the "Label" column and
# `X` is every remaining column of `dataset`.
y = dataset["Label"]
X = dataset.drop("Label", axis=1)

In [5]:
# Fit a min-max scaler on the feature frame, then replace `X` with the scaled
# values wrapped back into a DataFrame that keeps the original column names.
# NOTE(review): the scaler is fitted on every row of `X`, and the
# cross-validated search in a later cell runs on this already-scaled `X`, so
# the validation folds contribute to the fitted min/max — consider whether the
# scaling should instead happen inside each CV split.
scaler = MinMaxScaler().fit(X)
X = pd.DataFrame(scaler.transform(X), columns=X.columns)

In [6]:
# Base model for the hyper-parameter search: logistic regression fitted with
# the saga solver, a tolerance of 1e-2, at most 200 iterations and a fixed seed.
estimator = LogisticRegression(
    random_state=0,
    solver='saga',
    max_iter=200,
    tol=1e-2,
)

# Search space: C is drawn from a uniform distribution with loc=0 and scale=4;
# the penalty is chosen from the two listed options.
distributions = {
    "C": uniform(loc=0, scale=4),
    "penalty": ['l2', 'l1'],
}

In [7]:
# Randomized hyper-parameter search over `distributions`, seeded for
# repeatable sampling and run with 6 parallel jobs; the number of sampled
# candidates and the CV scheme are left at the library defaults.
clf = RandomizedSearchCV(
    estimator=estimator,
    param_distributions=distributions,
    n_jobs=6,
    random_state=0,
)

# Run the search on the feature frame and target; `search` is the value
# returned by `fit`.
search = clf.fit(X, y)

In [8]:
# Display the parameter setting that the search selected as best.
search.best_params_

{'C': 0.22685190926977272, 'penalty': 'l2'}