In [4]:
import os
import json
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, recall_score, accuracy_score
from skopt.space import Real, Integer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.model_selection import KFold, cross_val_predict
from skopt import BayesSearchCV

# Define the directory to save the files
save_dir = 'model_selection'

# Create the directory if it doesn't exist
os.makedirs(save_dir, exist_ok=True)

# Get current date and hour
current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

# Define the filename
filename = f"{save_dir}/{current_time}.json"

# Define the ranges for hyperparameters
param_ranges = {
    'RandomForest': {
        'n_estimators': Integer(3, 200),
        'max_depth': Integer(3, 20),
        'min_samples_split': Integer(2, 50)
    },
    'XGBoost': {
        'n_estimators': Integer(50, 200),
        'max_depth': Integer(3, 10),
        'learning_rate': Real(0.01, 0.1, prior='log-uniform'),
        'subsample': Real(0.5, 1.0)
    },
    'XGBRF': {
        'n_estimators': Integer(3, 200),
        'max_depth': Integer(3, 10),
        'learning_rate': Real(0.01, 0.1, prior='log-uniform'),
        'subsample': Real(0.5, 1.0)
    }
}

# Define a function to perform model search and save selected features
def perform_model_search(models, X, y):
    best_results = {}
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    recall_scorer = make_scorer(recall_score)
    accuracy_scorer = make_scorer(accuracy_score)
    
    for name, model_info in models.items():
        model = model_info['model']
        params = model_info['params']
        
        # Bayesian optimization with cross-validation
        opt = BayesSearchCV(
            estimator=model,
            search_spaces=params,
            scoring=recall_scorer,
            cv=kf,
            n_iter=30,
            n_jobs=-1,
            random_state=42
        )
        
        opt.fit(X_train, y_train)
        
        # Calculate recall using cross-validation
        recall_scores = cross_val_predict(opt.best_estimator_, X_train, y_train, cv=kf, method='predict', n_jobs=-1)
        recall = recall_score(y_train, recall_scores)
        
        # Calculate accuracy using cross-validation
        accuracy_scores = cross_val_predict(opt.best_estimator_, X_train, y_train, cv=kf, method='predict', n_jobs=-1)
        accuracy = accuracy_score(y_train, accuracy_scores)
        
        # Evaluate model on test set
        y_test_pred = cross_val_predict(opt.best_estimator_, X_test, y_test, cv=kf, method='predict', n_jobs=-1)
        
        
        test_recall = recall_score(y_test, y_test_pred)
        test_accuracy = accuracy_score(y_test, y_test_pred)
        
        best_results[name] = {
            'best_score': opt.best_score_,
            'best_params': opt.best_params_,
            'recall': recall,
            'accuracy': accuracy,
            'test_recall': test_recall,
            'test_accuracy': test_accuracy
        }
    
    return best_results

# Define the models
models = {
    'RandomForest': {
        'model': RandomForestClassifier(),
        'params': param_ranges['RandomForest']
    },
    'XGBoost': {
        'model': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
        'params': param_ranges['XGBoost']
    },
    'XGBRF': {
        'model': XGBRFClassifier(use_label_encoder=False, eval_metric='logloss'),
        'params': param_ranges['XGBRF']
    }
}

# Dummy dataset (replace with your actual dataset)
X = np.loadtxt('../data/x_train.txt', delimiter=' ')
X = X[:, [100, 101, 102, 105]]
y = np.loadtxt('../data/y_train.txt', delimiter=' ')

# Perform model search
best_results = perform_model_search(models, X, y)

# Print results
for model_name, result in best_results.items():
    print(f"Model: {model_name}")
    print(f"Best Score: {result['best_score']}")
    print(f"Best Params: {result['best_params']}")
    print(f"Train Recall: {result['recall']}")
    print(f"Train Accuracy: {result['accuracy']}")
    print(f"Test Recall: {result['test_recall']}")
    print(f"Test Accuracy: {result['test_accuracy']}")
    print('-' * 30)

Model: RandomForest
Best Score: 0.6462381266733763
Best Params: OrderedDict([('max_depth', 20), ('min_samples_split', 38), ('n_estimators', 200)])
Train Recall: 0.636317907444668
Train Accuracy: 0.655
Test Recall: 0.6633858267716536
Test Accuracy: 0.656
------------------------------
Model: XGBoost
Best Score: 0.6235251089983328
Best Params: OrderedDict([('learning_rate', 0.05269226912814583), ('max_depth', 3), ('n_estimators', 67), ('subsample', 0.5)])
Train Recall: 0.6237424547283702
Train Accuracy: 0.6515
Test Recall: 0.6299212598425197
Test Accuracy: 0.656
------------------------------
Model: XGBRF
Best Score: 0.5571483001249838
Best Params: OrderedDict([('learning_rate', 0.1), ('max_depth', 9), ('n_estimators', 193), ('subsample', 0.5)])
Train Recall: 0.5563380281690141
Train Accuracy: 0.64075
Test Recall: 0.7007874015748031
Test Accuracy: 0.622
------------------------------


In [5]:
class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(NpEncoder, self).default(obj)


# Save the results to a file
with open(filename, 'w') as f:
    json.dump(best_results, f, indent=4, cls=NpEncoder)