In [1]:
import autorootcwd

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris, load_digits, load_wine, load_breast_cancer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

In [3]:
# Seed value for the experiment's random number generation.
seed = 42
# Seed NumPy's global (legacy) random generator so code drawing from it is
# repeatable after a fresh kernel restart followed by "Run All".
np.random.seed(seed)

In [4]:
# Benchmark datasets bundled with scikit-learn.
# Maps a dataset name to the (X, y) pair returned by its loader when called
# with return_X_y=True. Insertion order is the order the experiments run in.
datasets = {
    name: loader(return_X_y=True)
    for name, loader in (
        ('iris', load_iris),
        ('digits', load_digits),
        ('wine', load_wine),
        ('breast_cancer', load_breast_cancer),
    )
}

In [5]:
# Search spaces for the randomized hyperparameter search.
# Each entry maps a display name to:
#   'model'  - an unfitted estimator instance used as the base estimator
#   'params' - discrete candidate values for every tuned hyperparameter
#
# The stochastic estimators (random forest, XGBoost) are seeded explicitly with
# `seed`, so their fits do not depend on the state of the global NumPy RNG at
# the moment the search cell happens to be executed (e.g. when cells are
# re-run out of order).
models = {
    'KNN': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': list(range(2, 30)),
            'weights': ['uniform', 'distance'],
            'p': [1, 2]  # Minkowski power: 1 = Manhattan, 2 = Euclidean
        }
    },
    'RandomForest': {
        'model': RandomForestClassifier(random_state=seed),
        'params': {
            'n_estimators': list(range(100, 2000, 50)),
            'max_depth': list(range(10, 110, 10)),
            'min_samples_split': list(range(2, 10)),
            'min_samples_leaf': list(range(1, 10)),
            'bootstrap': [True, False]
        }
    },
    'XGBoost': {
        'model': XGBClassifier(random_state=seed),
        'params': {
            'n_estimators': list(range(50, 1000, 50)),
            'max_depth': list(range(1, 10)),
            # Ten evenly spaced values per continuous hyperparameter; converted
            # to plain Python lists of floats.
            'learning_rate': np.linspace(0.01, 0.3, num=10).tolist(),
            'subsample': np.linspace(0.5, 1, num=10).tolist(),
            'colsample_bytree': np.linspace(0.5, 1, num=10).tolist()
        }
    }
}

In [9]:
# Randomized hyperparameter search for every (dataset, model) pair.
#
# For each pair a RandomizedSearchCV (up to 50 sampled candidates, 5-fold CV,
# accuracy scoring) is fitted on the full dataset. Two tables are produced:
#   df_history - one row per evaluated candidate (mean CV score + parameters)
#   df_best    - one row per pair holding the best candidate of that search
# Both are written as CSV files into the 'Wyniki' directory.

# Create the output directory up front: DataFrame.to_csv does not create
# missing directories, and failing only after all searches have finished
# would throw the whole computation away.
output_dir = Path('Wyniki')
output_dir.mkdir(parents=True, exist_ok=True)

results = []        # every evaluated candidate, across all searches
best_results = []   # best candidate of each search

for dataset_name, data in datasets.items():
    X, y = data

    for algorithm_name, algo in models.items():
        clf = RandomizedSearchCV(
            algo['model'],
            algo['params'],
            n_iter=50,
            cv=5,
            random_state=seed,  # same seed as the rest of the notebook
            scoring='accuracy',
        )
        clf.fit(X, y)

        # Get the results of the randomized search
        cv_results = clf.cv_results_

        # Iterate over the candidates that were actually evaluated rather than
        # over n_iter: when a grid holds fewer combinations than n_iter the
        # search runs fewer iterations and indexing up to n_iter would fail.
        for i, (mean_score, params) in enumerate(
            zip(cv_results['mean_test_score'], cv_results['params'])
        ):
            results.append({
                'dataset': dataset_name,
                'model': algorithm_name,
                # abs() is a no-op for accuracy (scores are non-negative).
                'score': abs(round(mean_score, 4)),
                'hyperparameters': params,
                'iteration': i
            })

        # Save the best result from each fit. best_index_ is the position in
        # cv_results_ that best_params_ / best_score_ refer to, so the three
        # fields are guaranteed to describe the same candidate (np.argmax on
        # the raw scores would pick a NaN entry if any candidate failed).
        best_results.append({
            'dataset': dataset_name,
            'model': algorithm_name,
            'score': abs(round(clf.best_score_, 4)),
            'hyperparameters': clf.best_params_,
            'iteration': int(clf.best_index_)
        })

        print(f'Finished {algorithm_name} on {dataset_name}')

# Convert the results to a DataFrame for easier manipulation
df_history = pd.DataFrame(results)
df_best = pd.DataFrame(best_results)

# Save to file
df_history.to_csv(output_dir / 'random-search-history.csv', index=False)
df_best.to_csv(output_dir / 'random-search-best.csv', index=False)

Finished KNN on iris
Finished RandomForest on iris
Finished XGBoost on iris
Finished KNN on digits
Finished RandomForest on digits
Finished XGBoost on digits
Finished KNN on wine
Finished RandomForest on wine
Finished XGBoost on wine
Finished KNN on breast_cancer
Finished RandomForest on breast_cancer
Finished XGBoost on breast_cancer
