# In [None]:
import numpy as np
import random
import copy
from tqdm.notebook import tqdm

from sklearn.model_selection import GridSearchCV
#estimators
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, ExtraTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

from dataset_loader import load_dataset, preprocessing
from Boosting_models import MulticlassClassificationOvR, LogitBoost, MEBoost, AdaBoost, RUSBoost, GradientBoostingClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import precision_score, recall_score, accuracy_score
from scipy.stats import hmean



def gmean_score(y_true, y_pred):
    """Return the harmonic mean of weighted precision and weighted recall.

    Precision and recall are each computed with ``average='weighted'`` and
    then combined with ``scipy.stats.hmean``.

    NOTE(review): despite the name, the combination is a harmonic mean
    (``hmean``), not a geometric mean -- confirm which one was intended.
    """
    weighted_precision = precision_score(y_true, y_pred, average='weighted')
    weighted_recall = recall_score(y_true, y_pred, average='weighted')
    return hmean([weighted_precision, weighted_recall])

def weighted_accuracy(y_true, y_pred):
    """Return the unnormalized accuracy score divided by ``len(y_true)``.

    ``accuracy_score`` is called with ``normalize=False`` and its result is
    divided by the number of entries in ``y_true``.

    NOTE(review): no class or sample weights are passed here, so this looks
    like plain accuracy -- confirm whether weighting was intended.
    """
    n_correct = accuracy_score(y_true, y_pred, normalize=False)
    n_samples = len(y_true)
    return n_correct / n_samples


def _param_grid_for(name):
    """Return the GridSearchCV parameter grid for ``name``, or None if unsupported."""
    if name in ('DecisionTreeClassifier', 'ExtraTreeClassifier', 'DecisionTreeRegressor'):
        return {
            'max_depth': [3, 5, 10],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            # 'auto' is no longer accepted by scikit-learn's tree estimators
            # (removed in 1.3) and made every such candidate fail to fit.
            # None (use all features) is a valid value for both the
            # classifiers and the regressor.
            'max_features': [None, 'sqrt', 'log2'],
        }
    if name == 'LogisticRegression':
        c_values = [0.001, 0.01, 0.1, 1, 10, 100]
        # The default solver (lbfgs) rejects penalty='l1', so the l1
        # candidates are paired with a solver that supports them. The l2
        # candidates keep the estimator's own solver, as before.
        return [
            {'C': c_values, 'penalty': ['l2']},
            {'C': c_values, 'penalty': ['l1'], 'solver': ['saga']},
        ]
    if name == 'SVC':
        return {
            'C': [0.1, 1, 10, 100],
            'gamma': ['scale', 'auto'],
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        }
    return None


def find_best_base_estimator(base_estimators, X_train, y_train):
    """Grid-search each supported base estimator and return its best parameters.

    For every ``name -> estimator`` pair whose name has a known parameter
    grid, a ``GridSearchCV`` with 10-fold shuffled ``StratifiedKFold``
    (``random_state=42``) and ``scoring='accuracy'`` is fitted on the
    training data. The mean test score of every candidate is appended to
    ``results_gs.txt``.

    Args:
        base_estimators: Mapping from estimator name to an estimator
            instance. Names without a known grid are skipped.
        X_train: Training features passed to ``GridSearchCV.fit``.
        y_train: Training targets passed to ``GridSearchCV.fit``.

    Returns:
        dict mapping each searched estimator name to its ``best_params_``.

    NOTE(review): ``scoring='accuracy'`` is also applied to
    'DecisionTreeRegressor'; confirm that a classification metric is the
    intended selection criterion for a regressor.
    """
    best_params = {}

    for name, estimator in base_estimators.items():
        param_grid = _param_grid_for(name)
        if param_grid is None:
            continue

        skf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
        grid_search = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=skf, scoring='accuracy')
        grid_search.fit(X_train, y_train)
        best_params[name] = grid_search.best_params_

        # Log the quality of every candidate; the file is opened once per
        # estimator instead of once per written line.
        cv_results = grid_search.cv_results_
        with open('results_gs.txt', 'a') as f:
            f.write(f"Metrics for {name}\n")
            f.writelines(
                f"Mean accuracy: {mean_score}, Params:, {param}\n"
                for mean_score, param in zip(cv_results['mean_test_score'], cv_results['params'])
            )

    return best_params

def main():
    """Run the boosting experiment over every configured dataset.

    For each dataset name the data is loaded and split via ``load_dataset``
    and ``preprocessing``, the best parameters per base estimator are found
    with ``find_best_base_estimator``, and every boosting model is trained
    (wrapped in ``MulticlassClassificationOvR``) on every tuned base
    estimator. The resulting metrics are printed and appended to
    ``metrics_on_ds.txt``.
    """
    dataset_names = ['Wine', 'Hayes_Roth', 'Contraceptive_Method_Choice',
                    'Pen-Based_Recognition_of_Handwritten_Digits',
                    'Vertebral_Column', 'Differentiated_Thyroid_Cancer_Recurrence',
                    'Dermatology', 'Balance_Scale', 'Glass_Identification',
                    'Heart_Disease', 'Car_Evaluation', 'Thyroid_Disease', 'Yeast',
                     'Page_Blocks_Classification', 'Statlog_Shuttle', 'Covertype'
    ]

    base_estimators = {
        'DecisionTreeClassifier': DecisionTreeClassifier(),
        'DecisionTreeRegressor': DecisionTreeRegressor(),
        'ExtraTreeClassifier': ExtraTreeClassifier(),
        'LogisticRegression': LogisticRegression(),
        'SVC': SVC()
    }

    boosting_models = {
        'AdaBoost': AdaBoost(),
        'RUSBoost': RUSBoost(),
        # 'XGBClassifier': XGBClassifier(),
        # 'CatBoostClassifier': CatBoostClassifier(),
        # 'SMOTE': SMOTE(),
        # 'LogitBoost': LogitBoost(),
        # 'GradientBoostingClassifier': GradientBoostingClassifier(),
        # 'MEBoost': MEBoost()
    }

    for dataset_name in dataset_names:
        with open('metrics_on_ds.txt', 'a') as f:
            f.write(f"Metrics for {dataset_name}\n")
        X, y = load_dataset(dataset_name)
        X_train, X_test, y_train, y_test = preprocessing(X, y, scaler=StandardScaler(), test_size=0.3, random_state=42)
        best_params = find_best_base_estimator(base_estimators, X_train, y_train)

        # Training boosting models on base models with the best parameters
        for base_model, params in best_params.items():
            # Take the classes from the prototype instances instead of
            # eval()-ing the dictionary keys.
            base_cls = type(base_estimators[base_model])
            for boosting_name, boosting_prototype in boosting_models.items():
                boosting_cls = type(boosting_prototype)
                # Build a fresh base estimator per boosting model so that no
                # estimator instance is shared between two ensembles.
                base_estimator = base_cls(**params)
                model = boosting_cls(base_estimator=base_estimator, n_estimators=50)
                ovr_model = MulticlassClassificationOvR(model)
                ovr_model.fit(X_train, y_train)

                print("Boosting model", boosting_name, "with base estimator", base_model, "trained.")
                y_pred = ovr_model.predict(X_test)

                # Compute each metric once and reuse it for console and file.
                gmean = gmean_score(y_test, y_pred)
                w_acc = weighted_accuracy(y_test, y_pred)
                print('gmean_score:', gmean)
                print('weighted_accuracy:', w_acc)
                with open('metrics_on_ds.txt', 'a') as f:
                    # The header line names the boosting model and ends with a
                    # newline so it is not fused with the gmean_score line.
                    f.write(f"Boosting model {boosting_name} with base estimator: {base_model}\n")
                    f.write(f"gmean_score: {gmean}\n")
                    f.write(f"weighted_accuracy: {w_acc}\n")


# Run the experiment only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()