In [None]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pandas as pd
from tqdm import tqdm
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from itertools import product
import random
import warnings
import os
import numpy as np
from sklearn.neural_network import MLPClassifier
warnings.filterwarnings('ignore')

In [None]:
def _sample_param_combinations(param_grid, n_samples):
    """Draw distinct positions from the Cartesian product of a parameter grid.

    The product is never materialized: flat indices are sampled from
    ``range(total)`` and decoded into one value per key. This draws uniformly
    without replacement over the same ordered product that
    ``itertools.product(*param_grid.values())`` would enumerate.

    Parameters
    ----------
    param_grid : dict
        Maps a parameter name to a sequence of candidate values.
    n_samples : int
        Requested number of combinations. Clamped to the size of the grid.

    Returns
    -------
    list of dict
        One ``{parameter name: value}`` dict per sampled combination.

    Raises
    ------
    ValueError
        If any parameter has no candidate values.
    """
    keys = list(param_grid)
    value_lists = [list(param_grid[key]) for key in keys]

    total = 1
    for key, values in zip(keys, value_lists):
        if not values:
            raise ValueError(f"Parameter '{key}' has no candidate values")
        total *= len(values)

    combinations = []
    for flat_index in random.sample(range(total), min(n_samples, total)):
        picked = {}
        # Mixed-radix decoding; the last key varies fastest, matching the
        # ordering of itertools.product.
        for key, values in zip(reversed(keys), reversed(value_lists)):
            flat_index, position = divmod(flat_index, len(values))
            picked[key] = values[position]
        # Restore the grid's key order for readability of the stored params.
        combinations.append({key: picked[key] for key in keys})
    return combinations


def modeling(train_set, test_set, features, label, fea_name):
    """Random-search eight classifier families and report the best run of each.

    The label column is integer-encoded and the selected features are
    standardized (both fitted on the training set only). For every model
    family a fixed number of hyper-parameter combinations is sampled, each
    candidate is fitted on the training set and scored on the test set, and
    the candidate with the highest test accuracy is kept.

    Sampling uses the global ``random`` and ``np.random`` state; seed both
    beforehand for reproducible runs.

    NOTE(review): the winning configuration is chosen by accuracy on the test
    set, so the reported test metrics are optimistically biased. A separate
    validation split would be needed for an unbiased estimate.

    Parameters
    ----------
    train_set, test_set : pandas.DataFrame
        Frames containing at least the ``features`` columns and ``label``.
        They are copied, not modified.
    features : list of str
        Names of the feature columns to use.
    label : str
        Name of the target column.
    fea_name : str
        Free-text tag stored in the "Feature" column of the result.

    Returns
    -------
    pandas.DataFrame
        One row per model family with columns "Model", "Feature",
        "Precision Test", "Recall Test", "F1-score Test", "Accuracy Test"
        and "Best Param" (the estimator's ``get_params()`` dict).
    """
    train = train_set.copy()
    test = test_set.copy()

    # Encode the target; the encoder is fitted on the training labels only.
    label_encoder = LabelEncoder()
    train[label] = label_encoder.fit_transform(train[label])
    test[label] = label_encoder.transform(test[label])

    x_train, y_train = train[features], train[label]
    x_test, y_test = test[features], test[label]

    features_name = list(x_train.columns)

    # Standardize using training-set statistics, then restore column names.
    scaler_ = StandardScaler()
    x_train = pd.DataFrame(scaler_.fit_transform(x_train), columns=features_name)
    x_test = pd.DataFrame(scaler_.transform(x_test), columns=features_name)

    # Randomly drawn integer candidates are converted to plain Python ints so
    # the recorded "Best Param" dicts serialize cleanly.
    param_RF = {"n_estimators": np.random.randint(10, 501, size=300).tolist(),
                "max_depth": np.random.randint(2, 50, size=25).tolist(),
                "criterion": ["gini", "entropy"]}

    param_SVC = {"kernel": ["linear", "poly", "rbf"],
                 "C": [0.01, 0.1, 1, 10, 100],
                 'gamma': ['scale', 'auto'],
                 'coef0': [0.0, 0.01, 0.1]
                 }

    param_KNN = {"n_neighbors": [3, 5, 7, 9, 11, 15, 17, 19, 21, 23],
                 "weights": ["uniform", "distance"],
                 "metric": ['euclidean', 'manhattan', 'chebyshev', "minkowski"],
                 "algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute']
                 }

    param_LGBM = {"boosting_type": ["gbdt", "dart"],
                  "num_leaves": np.random.randint(15, 50, size=25).tolist(),
                  "max_depth": np.random.randint(2, 50, size=25).tolist(),
                  "learning_rate": [0.001, 0.01, 0.1, 1, 10],
                  "n_estimators": np.random.randint(10, 501, size=300).tolist(),
                  "class_weight": ["balanced", None],
                  }

    param_ET = {"n_estimators": np.random.randint(10, 501, size=300).tolist(),
                "max_depth": [None, 10, 15, 20, 25, 30, 35, 40, 50],
                "criterion": ["gini", "entropy"],
                "min_samples_split": [2, 3, 5, 7, 9, 11],
                "min_samples_leaf": [1, 3, 5, 8, 9, 11],
                "bootstrap": [True, False],
                "max_leaf_nodes": [None, 2, 3, 4, 5, 6, 8, 9, 10, 11]}

    param_CB = {'iterations': np.random.randint(300, 1000, size=25).tolist(),
                'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.5],
                'depth': [4, 6, 8, 10],
                'l2_leaf_reg': [1, 3, 5, 7, 9],
                'bagging_temperature': [0, 0.5, 1, 2],
                'random_strength': [1, 5, 10],
                'border_count': [32, 64, 128],
                'grow_policy': ["SymmetricTree", "Depthwise"],
                'loss_function': ["MultiClass"],
                'eval_metric': ["Accuracy"],
                'task_type': ["CPU"],
                'devices': ['0'],
                'verbose': [False]
                }

    param_LR = {"penalty": ["l1", "l2"],
                "C": [0.01, 0.1, 1.0, 10, 100],
                "solver": ["liblinear", "saga"],
                "max_iter": np.random.randint(300, 800, size=25).tolist()}

    param_ANN = {"hidden_layer_sizes": [(64,), (100,), (128,), (256,)],
                 "activation": ["logistic", "tanh", "relu"],
                 "solver": ["adam", "sgd", "lbfgs"],
                 "alpha": [0.0001, 0.001, 0.01, 0.1],
                 "max_iter": [800, 1000, 1500, 1700],
                 "learning_rate": ["constant", "adaptive", "invscaling"],
                 "early_stopping": [True], }

    # name -> (base estimator, parameter grid, number of sampled candidates)
    models = {
        "CatBoost": (CatBoostClassifier(), param_CB, 50),
        "KNN": (KNeighborsClassifier(), param_KNN, 50),
        "LR": (LogisticRegression(), param_LR, 50),
        "ANN": (MLPClassifier(), param_ANN, 50),
        "LGBM": (LGBMClassifier(objective="multiclass", force_col_wise=True, verbose=-1), param_LGBM, 50),
        "SVM": (SVC(), param_SVC, 50),
        "RF": (RandomForestClassifier(), param_RF, 50),
        "EXTree": (ExtraTreesClassifier(verbose=0), param_ET, 50),
    }

    result_columns = ["Model", "Feature", "Precision Test", "Recall Test",
                      "F1-score Test", "Accuracy Test", "Best Param"]
    best_rows = []

    for name, (estimator, param_grid, n_candidates) in tqdm(models.items(), desc="Modeling"):
        candidates = _sample_param_combinations(param_grid, n_candidates)

        temp_results = []

        for params in tqdm(candidates, desc=f"Random Search - {name}"):
            if name == "CatBoost":
                # CatBoost gets a fresh instance per candidate. The sampled
                # parameters must be passed here, otherwise every candidate
                # is just the default model.
                model_instance = CatBoostClassifier(**params)
            else:
                model_instance = estimator.set_params(**params)
            model_instance.fit(x_train, y_train)

            y_pre_test = model_instance.predict(x_test)

            temp_results.append({
                "Model": name,
                "Feature": fea_name,
                "Precision Test": precision_score(y_test, y_pre_test, average="weighted"),
                "Recall Test": recall_score(y_test, y_pre_test, average="weighted"),
                "F1-score Test": f1_score(y_test, y_pre_test, average="weighted"),
                "Accuracy Test": accuracy_score(y_test, y_pre_test),
                "Best Param": model_instance.get_params()
            })

        # max() returns the first maximal element, i.e. the earliest candidate
        # among ties, which is what a stable descending sort would pick.
        best_rows.append(max(temp_results, key=lambda row: row["Accuracy Test"]))

    return pd.DataFrame({column: [row[column] for row in best_rows]
                         for column in result_columns})

In [None]:
# Input features and the accumulated results file. The paths are assembled with
# os.path.join so they resolve on any OS instead of relying on backslashes.
swin_dir = os.path.join("Fish_Eyes_Freshness_Classification", "machine_learning", "Swin")
features_csv = os.path.join(swin_dir, "feature_extraction_Swin_Stage4.csv")
path = os.path.join(swin_dir, "result_feature_selection_Swin_Stage4_by_RF.csv")

df = pd.read_csv(features_csv)

df_train = df.loc[df["Type"] == "Train"]
df_test = df.loc[df["Type"] == "Test"]

# --- Rank features once -------------------------------------------------------
# The ranking depends only on the training rows and a fixed-seed forest, not on
# the selection fraction, so it is computed a single time before the loop.
new_df = df_train.copy()

label_encoder = LabelEncoder()
new_df["Label"] = label_encoder.fit_transform(new_df["Label"])

x_train, y_train = new_df.drop(columns=["Label", "Path", "Type"]), new_df["Label"]

cols = x_train.columns

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)

rf = RandomForestClassifier(random_state=42)
rf.fit(x_train, y_train)

feature_importance = rf.feature_importances_

# Most important feature first; the fresh 0..n-1 index is the rank.
feature_importance_df = (
    pd.DataFrame({"Feature": cols, "Importance": feature_importance})
    .sort_values(by="Importance", ascending=False)
    .reset_index(drop=True)
)

# --- Evaluate each top-fraction of the ranked features ------------------------
for val in [0.05, 0.1, 0.2, 0.3, 0.4, 0.5]:

    # Keep every feature whose rank is below val * (number of features).
    top_mask = feature_importance_df.index < val * len(feature_importance_df)
    feature = feature_importance_df.loc[top_mask, "Feature"].tolist()

    final_result = modeling(train_set=df_train, test_set=df_test, features=feature, label="Label", fea_name=f"{val} Feature Importance")

    # Append to the results file, writing the header only when the file is
    # first created. Rows accumulate across runs of this cell.
    final_result.to_csv(path, index=False, mode='a', header=not os.path.exists(path))