In [None]:
# Importy
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import set_config
from sklearn.model_selection import RandomizedSearchCV
from smac import HyperparameterOptimizationFacade, Scenario
from ConfigSpace import Configuration, ConfigurationSpace
from ConfigSpace.hyperparameters import UniformIntegerHyperparameter, CategoricalHyperparameter
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from Utils.dataManagingUtils import fetch_openml_dataset, prepare_data, save_data_to_csv, create_randomForest_tuning_hisotry_object, create_randomForest_allDatasets_tuning_hisotry_object, create_comparison_object
from Utils.pipelineUtils import create_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
# Ask scikit-learn transformers to return pandas DataFrames from transform / fit_transform.
set_config(transform_output = "pandas")

In [None]:
# Constants shared by both sampling methods (random search and Bayesian optimisation)
n_jobs = -1
n_iter_randomSearch = 450  # about 1 h
n_iter_bayes = 150  # about 3 h
verbose = 1
random_state = 42
cv = 5

# Hyperparameter ranges (inclusive bounds)
n_estimators_lower = 10
n_estimators_upper = 230
max_depth_lower = 1
max_depth_upper = 15
# scikit-learn only accepts an integer min_samples_split >= 2; recent releases reject 1
# with InvalidParameterError, which turns every sampled configuration that uses it into
# a failed fit. Start the range at the smallest valid value instead.
min_samples_split_lower = 2
min_samples_split_upper = 15
min_samples_leaf_lower = 1
min_samples_leaf_upper = 10
max_features = ['sqrt', 'log2']

In [None]:
# Download the raw datasets through fetch_openml_dataset, one call per dataset ID,
# in the order m_t, o_s, b_m, c_d.
m_t_beforePrepare, o_s_beforePrepare, b_m_beforePrepare, c_d_beforePrepare = [
    fetch_openml_dataset(dataset_id)
    for dataset_id in (44116, 45560, 44126, 45024)
]

In [None]:
# Data preparation: prepare_data returns an (X, y) pair for every raw dataset.
# The second argument is presumably the name of the target column - confirm in
# Utils.dataManagingUtils.
(m_t_X, m_t_y), (o_s_X, o_s_y), (b_m_X, b_m_y), (c_d_X, c_d_y) = [
    prepare_data(raw_dataset, column_name)
    for raw_dataset, column_name in (
        (m_t_beforePrepare, 'class'),
        (o_s_beforePrepare, 'Revenue'),
        (b_m_beforePrepare, 'Class'),
        (c_d_beforePrepare, 'SeriousDlqin2yrs'),
    )
]

In [None]:
# Train/test split (80 % / 20 %) of every dataset.
# random_state is fixed so that re-running the notebook produces the same split; without
# it the tuning histories and the final test-set comparison change on every run.
m_t_X_train, m_t_X_test, m_t_y_train, m_t_y_test = train_test_split(m_t_X, m_t_y, test_size=0.2, random_state=random_state)
o_s_X_train, o_s_X_test, o_s_y_train, o_s_y_test = train_test_split(o_s_X, o_s_y, test_size=0.2, random_state=random_state)
b_m_X_train, b_m_X_test, b_m_y_train, b_m_y_test = train_test_split(b_m_X, b_m_y, test_size=0.2, random_state=random_state)
c_d_X_train, c_d_X_test, c_d_y_train, c_d_y_test = train_test_split(c_d_X, c_d_y, test_size=0.2, random_state=random_state)

# Parallel lists: position k of each list refers to the same dataset,
# and file_names[k] is the base name of that dataset's result files.
X_train_list = [m_t_X_train, o_s_X_train, b_m_X_train, c_d_X_train]
y_train_list = [m_t_y_train, o_s_y_train, b_m_y_train, c_d_y_train]
X_test_list = [m_t_X_test, o_s_X_test, b_m_X_test, c_d_X_test]
y_test_list = [m_t_y_test, o_s_y_test, b_m_y_test, c_d_y_test]
file_names = ["m_t_tuning_history", "o_s_tuning_history", "b_m_tuning_history", "c_d_tuning_history"]

In [None]:
# Containers for the random-search tuning history aggregated over all datasets.
# `helper` starts as an empty DataFrame; the random-search loop adds one column of
# mean CV scores per dataset to it.
helper = pd.DataFrame()
allDatasets_randomSearch_tunability_history = create_randomForest_allDatasets_tuning_hisotry_object()

In [None]:
# Hyperparameter optimisation of RandomForestClassifier with RandomizedSearchCV.
#
# Every integer hyperparameter is given the complete inclusive range [lower, upper].
# The earlier np.linspace(lower, upper, upper - lower).astype(int) truncated the floats
# and dropped the value `upper - 1` from each grid, so random search explored a
# different space than the ConfigSpace ranges built from the same bounds.
param_space = {
    'model__n_estimators': np.arange(n_estimators_lower, n_estimators_upper + 1),
    'model__max_depth': np.arange(max_depth_lower, max_depth_upper + 1),
    'model__min_samples_split': np.arange(min_samples_split_lower, min_samples_split_upper + 1),
    'model__min_samples_leaf': np.arange(min_samples_leaf_lower, min_samples_leaf_upper + 1),
    'model__max_features': max_features
}

# The forest gets a fixed random_state so that the CV scores depend only on the sampled
# hyperparameters, not on the seed drawn for a particular fit.
randomForestClassifier_pipeline = Pipeline([
    ('preprocessing', create_column_transformer()),
    ('model', RandomForestClassifier(random_state=random_state))
])

random_search = RandomizedSearchCV(
    estimator=randomForestClassifier_pipeline,
    param_distributions=param_space,
    n_jobs=n_jobs,
    n_iter=n_iter_randomSearch,
    verbose=verbose,
    random_state=random_state,
    cv=cv,
    scoring='roc_auc'
    )

# Run the same search on every training set. The search is re-fitted with the same
# random_state each time; the aggregation across datasets assumes row k of every
# result table describes the same candidate configuration.
for file_name, X_train, y_train in zip(file_names, X_train_list, y_train_list):
    random_search.fit(X_train, y_train)
    cs_results = pd.DataFrame(random_search.cv_results_)
    # One column of mean CV ROC AUC per dataset, used later to average across datasets.
    helper[file_name] = cs_results['mean_test_score']
    save_data_to_csv(cs_results, f"../Wyniki/RandomForestClassifier/RandomSearch/{file_name}.csv")

In [None]:
# Aggregate the random-search results across datasets: for every candidate (row of
# `helper`) compute the mean and standard deviation of its per-dataset mean CV scores.
mean_column = helper.mean(axis=1)
std_column = helper.std(axis=1)

# NOTE(review): cs_results is the table left behind by the last iteration of the
# random-search loop. Its parameter columns are used for all datasets, which presumably
# relies on every fit having sampled the same candidates in the same order - confirm.
_n_candidates = len(cs_results)

for _param_column in (
    'param_model__n_estimators',
    'param_model__min_samples_split',
    'param_model__min_samples_leaf',
    'param_model__max_features',
    'param_model__max_depth',
):
    _param_values = cs_results[_param_column]
    for _row in range(_n_candidates):
        allDatasets_randomSearch_tunability_history[_param_column].append(_param_values[_row])

for _row in range(_n_candidates):
    allDatasets_randomSearch_tunability_history['mean_all_datasets_test_score'].append(mean_column[_row])
    allDatasets_randomSearch_tunability_history['std_all_datasets_test_score'].append(std_column[_row])

In [None]:
# Show the best hyperparameters found by random search: every row whose mean score
# over all datasets equals the maximum (more than one row if there is a tie).
allDatasets_randomSearch_tunability_history = pd.DataFrame(allDatasets_randomSearch_tunability_history)
_mean_scores = allDatasets_randomSearch_tunability_history['mean_all_datasets_test_score']
max_value = _mean_scores.max()
randomSearch_bestHyperParameter = allDatasets_randomSearch_tunability_history.loc[_mean_scores == max_value]
print("Optymalne hiperparametry:", randomSearch_bestHyperParameter)

In [None]:
# Write the all-datasets random-search tuning history to its CSV path.
save_data_to_csv(
    allDatasets_randomSearch_tunability_history,
    "../Wyniki/RandomForestClassifier/RandomSearch/allDatasets_tunability_history.csv",
)

In [None]:
# Per-dataset containers for the Bayesian-optimisation tuning history, created in the
# order m_t, o_s, b_m, c_d (the same order as X_train_list / file_names).
tunability_hisotry_list = [create_randomForest_tuning_hisotry_object(cv) for _ in range(4)]
(
    m_t_tunability_history,
    o_s_tunability_history,
    b_m_tunability_history,
    c_d_tunability_history,
) = tunability_hisotry_list

# Container for the Bayesian-optimisation tuning history aggregated over all datasets.
allDatasets_BO_tunability_history = create_randomForest_allDatasets_tuning_hisotry_object()

In [None]:
# Objective function evaluated by SMAC during Bayesian optimisation
def randomForest_objective_function(config: Configuration, seed: int):
    """Score one random-forest configuration on every training set.

    For each (X_train, y_train) pair in X_train_list / y_train_list a fresh
    preprocessing + RandomForestClassifier pipeline is cross-validated with
    ROC AUC as the metric and `cv` folds. The sampled hyperparameters and the
    scores are appended to the per-dataset histories in tunability_hisotry_list
    and to allDatasets_BO_tunability_history.

    Args:
        config: Configuration providing 'n_estimators', 'max_depth',
            'min_samples_split', 'min_samples_leaf' and 'max_features'.
        seed: Used as random_state of the forest.

    Returns:
        The negated mean ROC AUC over all folds of all datasets (negated so that
        a higher AUC means a lower cost for the optimiser).
    """
    n_estimators = config['n_estimators']
    max_depth = config['max_depth']
    min_samples_split = config['min_samples_split']
    min_samples_leaf = config['min_samples_leaf']
    max_features = config['max_features']
    print(f"n_estimators:{n_estimators}, max_depth:{max_depth}, min_samples_split:{min_samples_split}, min_samples_leaf:{min_samples_leaf}, max_features:{max_features}\n")

    # Evaluate first, record afterwards. If cross-validation raises or the run is
    # interrupted part-way through a trial, nothing has been appended yet, so all
    # history columns keep the same length and can still be turned into DataFrames.
    per_dataset_scores = []
    for X_train, y_train in zip(X_train_list, y_train_list):
        randomForestClassifier_pipeline = Pipeline([
            ('preprocessing', create_column_transformer()),
            ('model', RandomForestClassifier(n_estimators=n_estimators,
                                             max_depth=max_depth,
                                             min_samples_split=min_samples_split,
                                             min_samples_leaf=min_samples_leaf,
                                             max_features=max_features,
                                             random_state=seed))])
        per_dataset_scores.append(
            cross_val_score(randomForestClassifier_pipeline, X_train, y_train, scoring='roc_auc', cv=cv)
        )

    # Fold scores of all datasets, in dataset order, for the aggregated statistics.
    all_scores = np.concatenate(per_dataset_scores)
    all_scores_mean = np.mean(all_scores)
    all_scores_std = np.std(all_scores)

    sampled_params = {
        'param_model__n_estimators': n_estimators,
        'param_model__min_samples_split': min_samples_split,
        'param_model__min_samples_leaf': min_samples_leaf,
        'param_model__max_features': max_features,
        'param_model__max_depth': max_depth,
    }

    # Per-dataset history: parameters, every fold score, and their mean / std.
    for tunability_hisotry, scores in zip(tunability_hisotry_list, per_dataset_scores):
        for column, value in sampled_params.items():
            tunability_hisotry[column].append(value)
        for i, fold_score in enumerate(scores):
            tunability_hisotry[f'split{i}_test_score'].append(fold_score)
        tunability_hisotry['mean_test_score'].append(np.mean(scores))
        tunability_hisotry['std_test_score'].append(np.std(scores))

    # All-datasets history: parameters and statistics over every fold of every dataset.
    for column, value in sampled_params.items():
        allDatasets_BO_tunability_history[column].append(value)
    allDatasets_BO_tunability_history['mean_all_datasets_test_score'].append(all_scores_mean)
    allDatasets_BO_tunability_history['std_all_datasets_test_score'].append(all_scores_std)

    print(f"all_scores_mean:{all_scores_mean}, all_scores_std:{all_scores_std}\n")
    return -all_scores_mean

# Search space for the Bayesian optimisation, built from the bounds and the
# max_features choices defined in the constants cell.
configspace = ConfigurationSpace()
for _hyperparameter in (
    UniformIntegerHyperparameter('n_estimators', lower=n_estimators_lower, upper=n_estimators_upper),
    UniformIntegerHyperparameter('max_depth', lower=max_depth_lower, upper=max_depth_upper),
    UniformIntegerHyperparameter('min_samples_split', lower=min_samples_split_lower, upper=min_samples_split_upper),
    UniformIntegerHyperparameter('min_samples_leaf', lower=min_samples_leaf_lower, upper=min_samples_leaf_upper),
    CategoricalHyperparameter('max_features', max_features),
):
    configspace.add_hyperparameter(_hyperparameter)

default_cfg = configspace.get_default_configuration()

# Run SMAC for n_iter_bayes trials; optimize() returns the best configuration found.
scenario = Scenario(configspace, n_trials=n_iter_bayes, deterministic=True)
smac = HyperparameterOptimizationFacade(scenario, randomForest_objective_function)
bayesOptimization_bestHyperParameter = smac.optimize()

In [None]:
# Show the best hyperparameters found by Bayesian optimisation
print("Optymalne hiperparametry:", bayesOptimization_bestHyperParameter)

In [None]:
# Save the Bayesian-optimisation tuning histories: one CSV per dataset, then the
# history aggregated over all datasets.
_bo_results_dir = "../Wyniki/RandomForestClassifier/BayesOptimization"

for name, tunability_hisotry in zip(file_names, tunability_hisotry_list):
    _per_dataset_frame = pd.DataFrame(tunability_hisotry)
    save_data_to_csv(_per_dataset_frame, f"{_bo_results_dir}/{name}.csv")

_all_datasets_frame = pd.DataFrame(allDatasets_BO_tunability_history)
save_data_to_csv(_all_datasets_frame, f"{_bo_results_dir}/allDatasets_tunability_history.csv")

In [None]:
# Compare models built with default hyperparameters, with the ones found by random
# search, and with the ones found by Bayesian optimisation.
def _make_forest_pipeline(**forest_params):
    """Return a preprocessing + RandomForestClassifier pipeline.

    forest_params are forwarded to RandomForestClassifier; random_state is always
    the notebook-level random_state.
    """
    return Pipeline([
        ('preprocessing', create_column_transformer()),
        ('model', RandomForestClassifier(random_state=random_state, **forest_params)),
    ])


# .values[0] takes the first row of the best-row frame (relevant if several rows tie).
randomSearch_Model = _make_forest_pipeline(
    n_estimators=randomSearch_bestHyperParameter['param_model__n_estimators'].values[0],
    max_depth=randomSearch_bestHyperParameter['param_model__max_depth'].values[0],
    min_samples_split=randomSearch_bestHyperParameter['param_model__min_samples_split'].values[0],
    min_samples_leaf=randomSearch_bestHyperParameter['param_model__min_samples_leaf'].values[0],
    max_features=randomSearch_bestHyperParameter['param_model__max_features'].values[0],
)

bayesOptimization_Model = _make_forest_pipeline(
    n_estimators=bayesOptimization_bestHyperParameter['n_estimators'],
    max_depth=bayesOptimization_bestHyperParameter['max_depth'],
    min_samples_split=bayesOptimization_bestHyperParameter['min_samples_split'],
    min_samples_leaf=bayesOptimization_bestHyperParameter['min_samples_leaf'],
    max_features=bayesOptimization_bestHyperParameter['max_features'],
)

default_Model = _make_forest_pipeline()

comparison = create_comparison_object()

for X_train, y_train, X_test, y_test, filename in zip(X_train_list, y_train_list, X_test_list, y_test_list, file_names):
    # Refit all three pipelines on this dataset's training part.
    for _pipeline in (randomSearch_Model, bayesOptimization_Model, default_Model):
        _pipeline.fit(X_train, y_train)

    # Class-probability predictions on the held-out test part.
    randomSearch_Model_test_pred, bayesOptimization_Model_test_pred, default_Model_test_pred = [
        fitted.predict_proba(X_test)
        for fitted in (randomSearch_Model, bayesOptimization_Model, default_Model)
    ]

    # ROC AUC computed from the second probability column ([:, 1]).
    randomSearch_Model_auc_test, bayesOptimization_Model_auc_test, default_Model_auc_test = [
        roc_auc_score(y_test, probabilities[:, 1])
        for probabilities in (
            randomSearch_Model_test_pred,
            bayesOptimization_Model_test_pred,
            default_Model_test_pred,
        )
    ]

    for _column, _value in (
        ('dataset', filename),
        ('randomSearch_Model_auc_test', randomSearch_Model_auc_test),
        ('bayesOptimization_Model_auc_test', bayesOptimization_Model_auc_test),
        ('default_Model_auc_test', default_Model_auc_test),
    ):
        comparison[_column].append(_value)

    print(f'filename: {filename}\n')
    print(f'randomSearch_Model_auc_test: {randomSearch_Model_auc_test}, bayesOptimization_Model_auc_test: {bayesOptimization_Model_auc_test}, default_Model_auc_test: {default_Model_auc_test}\n')

save_data_to_csv(pd.DataFrame(comparison), "../Wyniki/RandomForestClassifier/sampling_comparison.csv")
