In [1]:
import pandas as pd

# Read x_train.txt
X = pd.read_csv('../data/x_train.txt', sep=' ', header=None)

# Read y_train.txt
y = pd.read_csv('../data/y_train.txt', sep=' ', header=None)


# Display the data
print("x:")
print(X.head())

print("\ny:")
print(y.head())


x:
        0         1         2         3         4         5         6    \
0 -2.619773 -2.619533 -1.199350 -1.083335 -1.000910 -0.366967 -2.164037   
1 -1.415579 -1.782544 -2.880270 -1.958863  1.159968  0.273030 -1.628728   
2 -2.745092 -1.382945 -1.626015 -1.282560 -0.663146  0.052349 -2.403322   
3  0.618998  0.455364 -0.115081  0.649040 -0.862207  2.308504  0.526114   
4 -0.070694 -0.550509 -0.565556 -0.693065 -0.573089 -0.395862  0.003170   

        7         8         9    ...        490        491        492  \
0 -1.210001 -0.658311 -1.489539  ...  10.849925  10.343346  10.717519   
1 -0.175813 -0.916857 -0.570166  ...  11.489417   5.195818   3.494627   
2 -0.765073 -0.394354 -0.806624  ...  13.934934   9.267515   4.705604   
3 -1.094852  1.088656 -0.481210  ...  12.021328   3.852231  11.059702   
4 -0.981609 -0.505775 -0.758430  ...   7.537788  11.229665  11.318915   

        493        494        495        496        497        498        499  
0  7.709295   5.894554  12.

In [2]:
import ray
from ray import tune
from ray.tune.schedulers.hb_bohb import HyperBandForBOHB
from ray.tune.search.bohb import TuneBOHB
from ray.train import report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
import ConfigSpace as CS
import ConfigSpace.hyperparameters as CSH
import ast
# Assuming your data is in X and y dataframes

def train_ensemble(config):
    vars = ast.literal_eval(config['colset'])
    Xloc = X[vars]
    # Split data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(Xloc, y, test_size=0.2)
   
    
    # Initialize the models with the given hyperparameters
    xgb_model = XGBClassifier(
        eta=config["xgb_eta"],
        max_depth=config["xgb_max_depth"],
        min_child_weight=config["xgb_min_child_weight"],
        subsample=config["xgb_subsample"],
        colsample_bytree=config["xgb_colsample_bytree"],
        reg_lambda=config["xgb_lambda"],
        reg_alpha=config["xgb_alpha"],
        objective="binary:logistic",
        eval_metric="logloss",
        use_label_encoder=False,
        verbosity=0
    )

    et_model = ExtraTreesClassifier(
        n_estimators=config["et_n_estimators"],
        max_depth=config["et_max_depth"],
        min_samples_split=config["et_min_samples_split"],
        min_samples_leaf=config["et_min_samples_leaf"],
        max_features=config["et_max_features"],
        bootstrap=config["et_bootstrap"]
    )
    
    svc_model = SVC(
        C=config["svc_C"],
        kernel=config["svc_kernel"],
        probability=True
    )

    lr_model = LogisticRegression(
        C=config["lr_C"],
        solver=config["lr_solver"],
        max_iter=1000
    )
    ensemble_model = VotingClassifier(
        estimators=[
            ('xgb', xgb_model),
            ('et', et_model),
            ('svc', svc_model),
            ('lr', lr_model)
        ],
        voting='soft',
        weights=[
            config["weight_xgb"],
            config["weight_et"],
            config["weight_svc"],
            config["weight_lr"]
        ]
    )
    # Train the models
    # xgb_model.fit(X_train, y_train)
    # et_model.fit(X_train, y_train)
    # svc_model.fit(X_train, y_train)
    # lr_model.fit(X_train, y_train)
    
    # Predict probabilities on the validation set
    # xgb_preds_proba = xgb_model.predict_proba(X_val)[:, 1]
    # et_preds_proba = et_model.predict_proba(X_val)[:, 1]
    # svc_preds_proba = svc_model.predict_proba(X_val)[:, 1]
    # lr_preds_proba = lr_model.predict_proba(X_val)[:, 1]

    ensemble_model.fit(X_train, y_train)
    
    # Ensemble the predictions by averaging
    #ensemble_preds_proba = (xgb_preds_proba + et_preds_proba + svc_preds_proba + lr_preds_proba) / 4
    ensemble_preds_proba = ensemble_model.predict_proba(X_val)[:, 1]
    
    # Get the indices of the top 20% predictions
    top_20_percent_indices = np.argsort(ensemble_preds_proba)[-int(0.2 * len(ensemble_preds_proba)):]

    # Select the corresponding true labels and predicted labels for top 20%
    top_20_true = y_val.iloc[top_20_percent_indices].values
    top_20_preds = (ensemble_preds_proba[top_20_percent_indices] >= 0.5).astype(int)
    
    # Calculate the number of correct class 1 predictions
    correct_class_1_predictions = sum(1 for true, pred in zip(top_20_true, top_20_preds) if true == 1 and pred == 1)
    
    # Calculate the number of features used
    num_features_used = Xloc.shape[1]

    #scaled score calculation
    customer_scaled = (correct_class_1_predictions / len(top_20_preds)) * 1000
    customer_gain = 10 * customer_scaled
    variable_cost = 200 * num_features_used
    custom_score = customer_gain - variable_cost
    
    # Compute the custom score
    #custom_score = (correct_class_1_predictions * 10) - (num_features_used * 200)
    
    report({"custom_score": custom_score})

# Define the search space using ConfigSpace
config_space = CS.ConfigurationSpace()

# Hyperparameters for XGBoost
config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('xgb_eta', lower=0.01, upper=0.1, log=True))
config_space.add_hyperparameter(CSH.UniformIntegerHyperparameter('xgb_max_depth', lower=3, upper=10))
config_space.add_hyperparameter(CSH.UniformIntegerHyperparameter('xgb_min_child_weight', lower=1, upper=5))
config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('xgb_subsample', lower=0.5, upper=1.0))
config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('xgb_colsample_bytree', lower=0.5, upper=1.0))
config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('xgb_lambda', lower=1e-3, upper=10.0, log=True))
config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('xgb_alpha', lower=1e-3, upper=10.0, log=True))

# Hyperparameters for ExstraTrees
config_space.add_hyperparameter(CSH.UniformIntegerHyperparameter('et_n_estimators', lower=50, upper=200))
config_space.add_hyperparameter(CSH.UniformIntegerHyperparameter('et_max_depth', lower=3, upper=20))
config_space.add_hyperparameter(CSH.UniformIntegerHyperparameter('et_min_samples_split', lower=2, upper=10))
config_space.add_hyperparameter(CSH.UniformIntegerHyperparameter('et_min_samples_leaf', lower=1, upper=10))
config_space.add_hyperparameter(CSH.CategoricalHyperparameter('et_max_features', ['sqrt', 'log2']))
config_space.add_hyperparameter(CSH.CategoricalHyperparameter('et_bootstrap', [True, False]))

# Hyperparameters for SVC
config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('svc_C', lower=0.1, upper=10.0, log=True))
config_space.add_hyperparameter(CSH.CategoricalHyperparameter('svc_kernel', ['linear', 'rbf', 'poly', 'sigmoid']))

# Hyperparameters for Logistic Regression
config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('lr_C', lower=0.1, upper=10.0, log=True))
config_space.add_hyperparameter(CSH.CategoricalHyperparameter('lr_solver', ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']))

config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('weight_xgb', lower=0.0, upper=1.0))
config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('weight_et', lower=0.0, upper=1.0))
config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('weight_svc', lower=0.0, upper=1.0))
config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('weight_lr', lower=0.0, upper=1.0))


config_space.add_hyperparameter(CSH.CategoricalHyperparameter('colset', ['[101, 102, 103, 105]', '[101, 102, 103]', '[101, 102, 105]', '[101, 103, 105]', '[102, 103, 105]', '[104, 102, 103, 105]', '[100, 101, 102, 103, 105]', '[100, 101, 102, 103]', '[100, 101, 102, 105]', '[100, 101, 103, 105]', '[100, 102, 103, 105]', '[100, 104, 102, 103, 105]', '[100, 101, 102, 103, 104, 105]']))

# Set up the BOHB search algorithm
bohb_search = TuneBOHB(config_space, metric="custom_score", mode="max")

# Set up the HyperBandForBOHB scheduler
bohb_scheduler = HyperBandForBOHB(
    time_attr="training_iteration",
    metric="custom_score",
    mode="max"
)

# Function to create shorter directory names
def trial_dirname_creator(trial):
    return f"trial_{trial.trial_id}"

# Run the hyperparameter search
analysis = tune.run(
    train_ensemble,
    resources_per_trial={"cpu": 1, "gpu": 0},
    search_alg=bohb_search,
    scheduler=bohb_scheduler,
    num_samples=500,
    trial_dirname_creator=trial_dirname_creator,
    raise_on_failed_trial=False
)

# Get the best result
best_config = analysis.get_best_config(metric="custom_score", mode="max")
print("Best config: ", best_config)


2024-05-29 13:32:16,193	INFO worker.py:1749 -- Started a local Ray instance.
2024-05-29 13:32:17,258	INFO tune.py:253 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `tune.run(...)`.
2024-05-29 13:32:17,259	INFO tune.py:614 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2024-05-29 13:48:30
Running for:,00:16:12.85
Memory:,28.8/63.8 GiB

Trial name,status,loc,colset,et_bootstrap,et_max_depth,et_max_features,et_min_samples_leaf,et_min_samples_split,et_n_estimators,lr_C,lr_solver,svc_C,svc_kernel,weight_et,weight_lr,weight_svc,weight_xgb,xgb_alpha,xgb_colsample_bytree,xgb_eta,xgb_lambda,xgb_max_depth,xgb_min_child_weight,xgb_subsample,iter,total time (s),custom_score
train_ensemble_22c73578,TERMINATED,127.0.0.1:45528,"[100, 101, 102,_ab50",True,20,sqrt,9,8,53,8.52415,saga,0.136305,poly,0.30358,0.101725,0.933142,0.455196,0.0307395,0.791668,0.036376,0.00757243,4,4,0.781699,1,1.18523,6350
train_ensemble_78469c1b,TERMINATED,127.0.0.1:25896,"[100, 104, 102,_b640",False,3,sqrt,2,5,125,0.813241,newton-cg,0.795495,sigmoid,0.295847,0.816179,0.464023,0.0970291,2.79364,0.663552,0.0841544,0.0149942,8,1,0.641113,1,1.74932,6200
train_ensemble_68f7ed2e,TERMINATED,127.0.0.1:58224,"[101, 102, 103, 105]",True,14,sqrt,4,10,147,0.660762,saga,2.53515,sigmoid,0.326389,0.522018,0.220444,0.177316,0.321472,0.725514,0.0184404,0.0191618,7,5,0.546784,1,1.50565,6750
train_ensemble_c959d932,TERMINATED,127.0.0.1:55096,"[101, 103, 105]",False,17,sqrt,3,4,186,4.06335,saga,7.45293,poly,0.328199,0.810831,0.85698,0.0530776,0.501389,0.567672,0.0139128,0.111816,3,4,0.561262,1,2.57561,6250
train_ensemble_b45ae11e,TERMINATED,127.0.0.1:60380,"[100, 101, 102, 105]",True,14,sqrt,4,10,177,0.2716,lbfgs,0.149416,linear,0.574648,0.282162,0.631862,0.290142,0.023934,0.982069,0.0419023,0.00379067,4,3,0.994315,1,1.01527,6650
train_ensemble_5e54e7b4,TERMINATED,127.0.0.1:44788,"[101, 103, 105]",True,19,sqrt,4,5,143,0.14218,newton-cg,1.08709,poly,0.971819,0.910794,0.822786,0.0978232,2.26002,0.50812,0.0289461,0.00228276,7,4,0.556399,1,1.42206,6700
train_ensemble_16d339ce,TERMINATED,127.0.0.1:24112,"[100, 104, 102,_b640",True,3,sqrt,6,4,172,0.342368,newton-cg,0.309406,sigmoid,0.294357,0.907119,0.656663,0.000303533,0.0014971,0.989233,0.0127369,1.38376,7,4,0.501991,1,1.72032,5350
train_ensemble_545dced1,TERMINATED,127.0.0.1:59724,"[101, 102, 103]",True,11,log2,5,10,63,2.36611,liblinear,0.444431,sigmoid,0.349038,0.894511,0.468142,0.821014,0.0458808,0.700523,0.0375564,0.00385348,9,4,0.671514,1,1.441,6300
train_ensemble_ca724842,TERMINATED,127.0.0.1:7288,"[100, 101, 102,_b730",False,7,log2,5,2,101,6.8087,lbfgs,0.135105,poly,0.0621549,0.519986,0.294533,0.141582,0.219406,0.879579,0.0367655,0.928221,8,5,0.766542,1,1.08105,6550
train_ensemble_43532358,TERMINATED,127.0.0.1:62736,"[100, 101, 102, 105]",False,13,log2,6,6,200,0.462689,liblinear,0.223557,sigmoid,0.507006,0.965298,0.247665,0.195053,0.201981,0.984735,0.0125406,0.00489896,8,1,0.54192,1,1.7472,6700




[36m(train_ensemble pid=45528)[0m   y = column_or_1d(y, warn=True)
[36m(train_ensemble pid=45528)[0m   y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Trial name,custom_score
train_ensemble_00fa124e,6850
train_ensemble_0127f3f9,6850
train_ensemble_012d3024,6600
train_ensemble_0194f48d,6800
train_ensemble_01b520b7,6300
train_ensemble_01fde203,6900
train_ensemble_0221e32e,6550
train_ensemble_0256ff27,6950
train_ensemble_02a437df,6750
train_ensemble_0317ea2c,6550


[36m(train_ensemble pid=55096)[0m   y = column_or_1d(y, warn=True)[32m [repeated 3x across cluster][0m
[36m(train_ensemble pid=55096)[0m   y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)[32m [repeated 3x across cluster][0m
[36m(train_ensemble pid=24112)[0m   y = column_or_1d(y, warn=True)[32m [repeated 3x across cluster][0m
[36m(train_ensemble pid=24112)[0m   y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)[32m [repeated 3x across cluster][0m
[36m(train_ensemble pid=56172)[0m   y = column_or_1d(y, warn=True)[32m [repeated 3x across cluster][0m
[36m(train_ensemble pid=56172)[0m   y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)[32m [repeated 3x across cluster][0m
[36m(train_ensemble pid=44184)[0m   y = column_or_1d(y, warn=True)[32m [repeated 3x across cluster][0m
[36m(train_ensemble pid=44184)[0m   y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)[32m [repeated 3x across cluster][0m
[36m(train_ensemble pid=54792)[0m 

Best config:  {'colset': '[100, 101, 102, 105]', 'et_bootstrap': True, 'et_max_depth': 7, 'et_max_features': 'sqrt', 'et_min_samples_leaf': 3, 'et_min_samples_split': 2, 'et_n_estimators': 191, 'lr_C': 0.2965858992012039, 'lr_solver': 'liblinear', 'svc_C': 0.14666802003882573, 'svc_kernel': 'linear', 'weight_et': 0.779258883724107, 'weight_lr': 0.5947871845865511, 'weight_svc': 0.2843918757107351, 'weight_xgb': 0.5108061358970724, 'xgb_alpha': 0.13534525357173383, 'xgb_colsample_bytree': 0.5327687947981937, 'xgb_eta': 0.025506248584380223, 'xgb_lambda': 0.06358084547274466, 'xgb_max_depth': 6, 'xgb_min_child_weight': 4, 'xgb_subsample': 0.6996535735845992}


In [3]:
df=analysis.dataframe()
df.to_csv('ensemble1-raytune-bohb.csv')

In [4]:
df.sort_values(by='custom_score', ascending=False, inplace=True)
print(df.head())

     custom_score   timestamp checkpoint_dir_name   done  training_iteration  \
435        7800.0  1716983186                None  False                   1   
472        7600.0  1716983258                None  False                   1   
225        7500.0  1716982786                None  False                   1   
139        7400.0  1716982626                None  False                   1   
311        7350.0  1716982949                None  False                   1   

     trial_id                 date  time_this_iter_s  time_total_s    pid  \
435  33c3d2ef  2024-05-29_13-46-26          1.002002      1.002002   9324   
472  530c9293  2024-05-29_13-47-38          1.301511      1.301511  37732   
225  81d84b57  2024-05-29_13-39-46          1.583009      1.583009  12976   
139  60ceaa69  2024-05-29_13-37-06          1.776253      1.776253  39040   
311  b242716a  2024-05-29_13-42-29          1.023832      1.023832  14988   

     ... config/weight_svc config/weight_xgb  config/xgb