In [1]:
import pandas as pd

# Read x_train.txt
X = pd.read_csv('../data/x_train.txt', sep=' ', header=None)

# Read y_train.txt
y = pd.read_csv('../data/y_train.txt', sep=' ', header=None)


# Display the data
print("x:")
print(X.head())

print("\ny:")
print(y.head())


x:
        0         1         2         3         4         5         6    \
0 -2.619773 -2.619533 -1.199350 -1.083335 -1.000910 -0.366967 -2.164037   
1 -1.415579 -1.782544 -2.880270 -1.958863  1.159968  0.273030 -1.628728   
2 -2.745092 -1.382945 -1.626015 -1.282560 -0.663146  0.052349 -2.403322   
3  0.618998  0.455364 -0.115081  0.649040 -0.862207  2.308504  0.526114   
4 -0.070694 -0.550509 -0.565556 -0.693065 -0.573089 -0.395862  0.003170   

        7         8         9    ...        490        491        492  \
0 -1.210001 -0.658311 -1.489539  ...  10.849925  10.343346  10.717519   
1 -0.175813 -0.916857 -0.570166  ...  11.489417   5.195818   3.494627   
2 -0.765073 -0.394354 -0.806624  ...  13.934934   9.267515   4.705604   
3 -1.094852  1.088656 -0.481210  ...  12.021328   3.852231  11.059702   
4 -0.981609 -0.505775 -0.758430  ...   7.537788  11.229665  11.318915   

        493        494        495        496        497        498        499  
0  7.709295   5.894554  12.

In [3]:
import ray
from ray import tune
from ray.tune.schedulers.hb_bohb import HyperBandForBOHB
from ray.tune.search.bohb import TuneBOHB
from ray.train import report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
import ConfigSpace as CS
import ConfigSpace.hyperparameters as CSH
import ast
# Assuming your data is in X and y dataframes

def train_ensemble(config):
    # Split data into training and validation sets
    vars = ast.literal_eval(config['colset'])
    Xloc = X[vars]

    X_train, X_val, y_train, y_val = train_test_split(Xloc, y, test_size=0.2)
    
    # Initialize the models with the given hyperparameters
    xgb_model = XGBClassifier(
        eta=config["xgb_eta"],
        max_depth=config["xgb_max_depth"],
        min_child_weight=config["xgb_min_child_weight"],
        subsample=config["xgb_subsample"],
        colsample_bytree=config["xgb_colsample_bytree"],
        reg_lambda=config["xgb_lambda"],
        reg_alpha=config["xgb_alpha"],
        objective="binary:logistic",
        eval_metric="logloss",
        use_label_encoder=False,
        verbosity=0
    )

    et_model = ExtraTreesClassifier(
        n_estimators=config["et_n_estimators"],
        max_depth=config["et_max_depth"],
        min_samples_split=config["et_min_samples_split"],
        min_samples_leaf=config["et_min_samples_leaf"],
        max_features=config["et_max_features"],
        bootstrap=config["et_bootstrap"]
    )
    
    svc_model = SVC(
        C=config["svc_C"],
        kernel=config["svc_kernel"],
        probability=True
    )

    lr_model = LogisticRegression(
        C=config["lr_C"],
        solver=config["lr_solver"],
        max_iter=1000
    )
    
    knn_model = KNeighborsClassifier(
        n_neighbors=config["knn_n_neighbors"],
        weights=config["knn_weights"],
        algorithm=config["knn_algorithm"]
    )

    # # Train the models
    # xgb_model.fit(X_train, y_train)
    # et_model.fit(X_train, y_train)
    # svc_model.fit(X_train, y_train)
    # lr_model.fit(X_train, y_train)
    # knn_model.fit(X_train, y_train)
    
    # # Predict probabilities on the validation set
    # xgb_preds_proba = xgb_model.predict_proba(X_val)[:, 1]
    # et_preds_proba = et_model.predict_proba(X_val)[:, 1]
    # svc_preds_proba = svc_model.predict_proba(X_val)[:, 1]
    # lr_preds_proba = lr_model.predict_proba(X_val)[:, 1]
    # knn_preds_proba = knn_model.predict_proba(X_val)[:, 1]
    # Create the ensemble model
    ensemble_model = VotingClassifier(
        estimators=[
            ('xgb', xgb_model),
            ('et', et_model),
            ('svc', svc_model),
            ('lr', lr_model),
            ('knn', knn_model)
        ],
        voting='soft',
        weights=[
            config["weight_xgb"],
            config["weight_et"],
            config["weight_svc"],
            config["weight_lr"],
            config["weight_knn"]
        ]
    )
    
    # Train the ensemble model
    ensemble_model.fit(X_train, y_train)
    
    # Predict probabilities on the validation set
    ensemble_preds_proba = ensemble_model.predict_proba(X_val)[:, 1]
    # Ensemble the predictions by averaging
    #ensemble_preds_proba = (xgb_preds_proba + et_preds_proba + svc_preds_proba + lr_preds_proba + knn_preds_proba) / 5
    
    # Get the indices of the top 20% predictions
    top_20_percent_indices = np.argsort(ensemble_preds_proba)[-int(0.2 * len(ensemble_preds_proba)):]

    # Select the corresponding true labels and predicted labels for top 20%
    top_20_true = y_val.iloc[top_20_percent_indices].values
    top_20_preds = (ensemble_preds_proba[top_20_percent_indices] >= 0.5).astype(int)
    
    # Calculate the number of correct class 1 predictions
    correct_class_1_predictions = sum(1 for true, pred in zip(top_20_true, top_20_preds) if true == 1 and pred == 1)
    
    # Calculate the number of features used
    num_features_used = Xloc.shape[1]
    
    #scaled score calculation
    customer_scaled = (correct_class_1_predictions / len(top_20_preds)) * 1000
    customer_gain = 10 * customer_scaled
    variable_cost = 200 * num_features_used
    custom_score = customer_gain - variable_cost
    # # Compute the custom score
    # custom_score = (correct_class_1_predictions * 10) - (num_features_used * 200)
    
    report({"custom_score": custom_score})

# Define the search space using ConfigSpace
config_space = CS.ConfigurationSpace()

# Hyperparameters for XGBoost
config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('xgb_eta', lower=0.01, upper=0.1, log=True))
config_space.add_hyperparameter(CSH.UniformIntegerHyperparameter('xgb_max_depth', lower=3, upper=10))
config_space.add_hyperparameter(CSH.UniformIntegerHyperparameter('xgb_min_child_weight', lower=1, upper=5))
config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('xgb_subsample', lower=0.5, upper=1.0))
config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('xgb_colsample_bytree', lower=0.5, upper=1.0))
config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('xgb_lambda', lower=1e-3, upper=10.0, log=True))
config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('xgb_alpha', lower=1e-3, upper=10.0, log=True))

# Hyperparameters for ExstraTrees
config_space.add_hyperparameter(CSH.UniformIntegerHyperparameter('et_n_estimators', lower=50, upper=200))
config_space.add_hyperparameter(CSH.UniformIntegerHyperparameter('et_max_depth', lower=3, upper=20))
config_space.add_hyperparameter(CSH.UniformIntegerHyperparameter('et_min_samples_split', lower=2, upper=10))
config_space.add_hyperparameter(CSH.UniformIntegerHyperparameter('et_min_samples_leaf', lower=1, upper=10))
config_space.add_hyperparameter(CSH.CategoricalHyperparameter('et_max_features', ['sqrt', 'log2']))
config_space.add_hyperparameter(CSH.CategoricalHyperparameter('et_bootstrap', [True, False]))

# Hyperparameters for SVC
config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('svc_C', lower=0.1, upper=10.0, log=True))
config_space.add_hyperparameter(CSH.CategoricalHyperparameter('svc_kernel', ['linear', 'rbf', 'poly', 'sigmoid']))

# Hyperparameters for Logistic Regression
config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('lr_C', lower=0.1, upper=10.0, log=True))
config_space.add_hyperparameter(CSH.CategoricalHyperparameter('lr_solver', ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']))

# Hyperparameters for KNN
config_space.add_hyperparameter(CSH.UniformIntegerHyperparameter('knn_n_neighbors', lower=1, upper=30))
config_space.add_hyperparameter(CSH.CategoricalHyperparameter('knn_weights', ['uniform', 'distance']))
config_space.add_hyperparameter(CSH.CategoricalHyperparameter('knn_algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute']))

# Hyperparameters for model weights
config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('weight_xgb', lower=0.0, upper=1.0))
config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('weight_et', lower=0.0, upper=1.0))
config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('weight_svc', lower=0.0, upper=1.0))
config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('weight_lr', lower=0.0, upper=1.0))
config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('weight_knn', lower=0.0, upper=1.0))


config_space.add_hyperparameter(CSH.CategoricalHyperparameter('colset', ['[101, 102, 103, 105]', '[101, 102, 103]', '[101, 102, 105]', '[101, 103, 105]', '[102, 103, 105]', '[104, 102, 103, 105]', '[100, 101, 102, 103, 105]', '[100, 101, 102, 103]', '[100, 101, 102, 105]', '[100, 101, 103, 105]', '[100, 102, 103, 105]', '[100, 104, 102, 103, 105]', '[100, 101, 102, 103, 104, 105]', '[100, 101]', '[100, 102]', '[100, 103]', '[100, 104]', '[100, 105]','[101, 102]', '[101, 103]', '[101, 104]', '[101, 105]', '[102, 103]', '[102, 104]', '[102, 105]', '[103, 104]', '[103, 105]', '[104, 105]',]))


# Set up the BOHB search algorithm
bohb_search = TuneBOHB(config_space, metric="custom_score", mode="max")

# Set up the HyperBandForBOHB scheduler
bohb_scheduler = HyperBandForBOHB(
    time_attr="training_iteration",
    metric="custom_score",
    mode="max"
)

# Function to create shorter directory names
def trial_dirname_creator(trial):
    return f"trial_{trial.trial_id}"

# Run the hyperparameter search
analysis = tune.run(
    train_ensemble,
    resources_per_trial={"cpu": 1, "gpu": 0},
    search_alg=bohb_search,
    scheduler=bohb_scheduler,
    num_samples=600,
    trial_dirname_creator=trial_dirname_creator,
    raise_on_failed_trial=False
)

# Get the best result
best_config = analysis.get_best_config(metric="custom_score", mode="max")
print("Best config: ", best_config)


2024-05-29 13:54:29,689	INFO worker.py:1749 -- Started a local Ray instance.
2024-05-29 13:54:31,262	INFO tune.py:253 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `tune.run(...)`.
2024-05-29 13:54:31,264	INFO tune.py:614 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2024-05-29 14:13:43
Running for:,00:19:12.30
Memory:,26.3/63.8 GiB

Trial name,status,loc,colset,et_bootstrap,et_max_depth,et_max_features,et_min_samples_leaf,et_min_samples_split,et_n_estimators,knn_algorithm,knn_n_neighbors,knn_weights,lr_C,lr_solver,svc_C,svc_kernel,weight_et,weight_knn,weight_lr,weight_svc,weight_xgb,xgb_alpha,xgb_colsample_bytree,xgb_eta,xgb_lambda,xgb_max_depth,xgb_min_child_weight,xgb_subsample,iter,total time (s),custom_score
train_ensemble_580f61ef,TERMINATED,127.0.0.1:44456,"[103, 105]",False,16,log2,8,9,173,auto,12,uniform,1.40943,liblinear,4.91158,rbf,0.124633,0.513567,0.678996,0.782806,0.436227,0.0168649,0.731459,0.012894,9.98652,8,1,0.745696,1,1.60162,7050
train_ensemble_036cf302,TERMINATED,127.0.0.1:52832,"[100, 104]",True,3,sqrt,3,5,51,brute,15,uniform,0.247422,sag,7.90703,poly,0.877442,0.781652,0.860577,0.241999,0.448318,1.59079,0.966791,0.0997166,0.164308,9,4,0.974796,1,3.27452,6100
train_ensemble_d13fac4d,TERMINATED,127.0.0.1:3732,"[102, 104]",False,13,log2,6,8,186,kd_tree,12,uniform,0.941172,sag,0.902339,rbf,0.302062,0.489341,0.0791242,0.234006,0.349887,0.0957057,0.604548,0.0123208,0.00349171,6,3,0.541237,1,1.62876,6250
train_ensemble_3a609c70,TERMINATED,127.0.0.1:58708,"[101, 102, 103]",False,16,log2,7,4,147,brute,4,uniform,0.114854,newton-cg,0.403742,rbf,0.0614045,0.192684,0.998469,0.369417,0.0899971,0.0114147,0.962211,0.054856,0.156678,8,4,0.981118,1,1.41518,6650
train_ensemble_febb5eb1,TERMINATED,127.0.0.1:33956,"[101, 102]",False,9,log2,9,2,182,ball_tree,20,uniform,0.398301,lbfgs,0.343536,sigmoid,0.281317,0.0810822,0.0418105,0.659577,0.0398462,0.0164286,0.597885,0.0659242,0.00627504,4,2,0.619653,1,1.29131,6250
train_ensemble_ad6c286f,TERMINATED,127.0.0.1:34780,"[101, 103]",False,17,sqrt,8,2,189,kd_tree,27,uniform,6.12055,liblinear,0.129609,poly,0.599395,0.820314,0.823695,0.716818,0.0289465,0.175604,0.63948,0.0469068,0.00668484,4,5,0.652027,1,1.09877,6100
train_ensemble_26cbf427,TERMINATED,127.0.0.1:11284,"[100, 105]",False,8,log2,3,2,171,brute,10,distance,9.4872,newton-cg,0.813695,linear,0.774289,0.00515306,0.226055,0.884913,0.885401,0.0867614,0.626426,0.0388877,0.0382342,3,4,0.749976,1,0.964031,6450
train_ensemble_29cfd8c6,TERMINATED,127.0.0.1:10884,"[101, 102, 105]",True,14,sqrt,8,10,99,brute,25,distance,3.49059,liblinear,0.235144,linear,0.0781865,0.106953,0.199473,0.2554,0.666534,0.410475,0.58669,0.0302092,2.11744,9,4,0.595309,1,0.930518,6800
train_ensemble_2102e39e,TERMINATED,127.0.0.1:15532,"[100, 101, 103, 105]",False,9,sqrt,4,2,108,auto,12,distance,2.92342,sag,8.2871,rbf,0.140804,0.434253,0.389149,0.92056,0.985782,0.00914254,0.762909,0.015004,0.0100917,8,4,0.795443,1,1.72201,6400
train_ensemble_f63b42b4,TERMINATED,127.0.0.1:34696,"[100, 102]",True,11,sqrt,6,10,173,kd_tree,2,uniform,1.42622,sag,0.885493,rbf,0.0259448,0.203138,0.528702,0.0450424,0.95173,0.00975783,0.974354,0.0281888,0.00159932,6,4,0.626671,1,1.6402,6500




[36m(train_ensemble pid=9036)[0m   y = column_or_1d(y, warn=True)
[36m(train_ensemble pid=9036)[0m   y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Trial name,custom_score
train_ensemble_0046cb65,7050
train_ensemble_004f7eeb,6800
train_ensemble_00a6f292,6400
train_ensemble_00f42c4a,5450
train_ensemble_01671cbc,6250
train_ensemble_01a50972,6900
train_ensemble_02451f4a,5850
train_ensemble_029cca57,6250
train_ensemble_02fb07a0,6200
train_ensemble_036cf302,6100


[36m(train_ensemble pid=58708)[0m   y = column_or_1d(y, warn=True)[32m [repeated 3x across cluster][0m
[36m(train_ensemble pid=58708)[0m   y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)[32m [repeated 3x across cluster][0m
[36m(train_ensemble pid=10144)[0m   y = column_or_1d(y, warn=True)[32m [repeated 3x across cluster][0m
[36m(train_ensemble pid=10144)[0m   y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)[32m [repeated 3x across cluster][0m
[36m(train_ensemble pid=34696)[0m   y = column_or_1d(y, warn=True)[32m [repeated 3x across cluster][0m
[36m(train_ensemble pid=34696)[0m   y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)[32m [repeated 3x across cluster][0m
[36m(train_ensemble pid=22560)[0m   y = column_or_1d(y, warn=True)[32m [repeated 3x across cluster][0m
[36m(train_ensemble pid=22560)[0m   y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)[32m [repeated 3x across cluster][0m
[36m(train_ensemble pid=9264)[0m  

Best config:  {'colset': '[101, 102, 105]', 'et_bootstrap': False, 'et_max_depth': 12, 'et_max_features': 'sqrt', 'et_min_samples_leaf': 10, 'et_min_samples_split': 3, 'et_n_estimators': 108, 'knn_algorithm': 'brute', 'knn_n_neighbors': 27, 'knn_weights': 'uniform', 'lr_C': 4.514984819282504, 'lr_solver': 'sag', 'svc_C': 0.1884652734305775, 'svc_kernel': 'rbf', 'weight_et': 0.7496984957246486, 'weight_knn': 0.6635888019981687, 'weight_lr': 0.011412155873019136, 'weight_svc': 0.4846194549815259, 'weight_xgb': 0.41604852070874554, 'xgb_alpha': 0.1331655591104277, 'xgb_colsample_bytree': 0.7382259102923003, 'xgb_eta': 0.027781866578049316, 'xgb_lambda': 0.052031232944512805, 'xgb_max_depth': 8, 'xgb_min_child_weight': 2, 'xgb_subsample': 0.8489255131045501}


In [4]:
df=analysis.dataframe()
df.to_csv('ensemble2-raytune-bohb.csv')

In [5]:
df.sort_values(by='custom_score', ascending=False, inplace=True)
print(df.head())

     custom_score   timestamp checkpoint_dir_name   done  training_iteration  \
20         7650.0  1716983836                None  False                   1   
66         7400.0  1716983861                None  False                   1   
424        7350.0  1716984493                None  False                   1   
395        7250.0  1716984438                None  False                   1   
61         7250.0  1716983858                None  False                   1   

     trial_id                 date  time_this_iter_s  time_total_s    pid  \
20   17fe6e77  2024-05-29_13-57-16          1.404000      1.404000  28064   
66   798f3dcc  2024-05-29_13-57-41          1.047000      1.047000  58104   
424  c4364fe3  2024-05-29_14-08-13          1.684004      1.684004  50052   
395  da97b380  2024-05-29_14-07-18          1.364893      1.364893  56120   
61   4c35c736  2024-05-29_13-57-38          1.458998      1.458998  32964   

     ... config/weight_svc config/weight_xgb  config/xgb