In [1]:
import pandas as pd

# Read x_train.txt
X = pd.read_csv('../data/x_train.txt', sep=' ', header=None)

# Read y_train.txt
y = pd.read_csv('../data/y_train.txt', sep=' ', header=None)


# Display the data
print("x:")
print(X.head())

print("\ny:")
print(y.head())


x:
        0         1         2         3         4         5         6    \
0 -2.619773 -2.619533 -1.199350 -1.083335 -1.000910 -0.366967 -2.164037   
1 -1.415579 -1.782544 -2.880270 -1.958863  1.159968  0.273030 -1.628728   
2 -2.745092 -1.382945 -1.626015 -1.282560 -0.663146  0.052349 -2.403322   
3  0.618998  0.455364 -0.115081  0.649040 -0.862207  2.308504  0.526114   
4 -0.070694 -0.550509 -0.565556 -0.693065 -0.573089 -0.395862  0.003170   

        7         8         9    ...        490        491        492  \
0 -1.210001 -0.658311 -1.489539  ...  10.849925  10.343346  10.717519   
1 -0.175813 -0.916857 -0.570166  ...  11.489417   5.195818   3.494627   
2 -0.765073 -0.394354 -0.806624  ...  13.934934   9.267515   4.705604   
3 -1.094852  1.088656 -0.481210  ...  12.021328   3.852231  11.059702   
4 -0.981609 -0.505775 -0.758430  ...   7.537788  11.229665  11.318915   

        493        494        495        496        497        498        499  
0  7.709295   5.894554  12.

In [2]:
import ray
from ray import tune
from ray.tune.schedulers.hb_bohb import HyperBandForBOHB
from ray.tune.search.bohb import TuneBOHB
from ray.train import report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
import pandas as pd
import numpy as np
import ConfigSpace as CS
import ConfigSpace.hyperparameters as CSH
import ast

# Assuming your data is in X and y dataframes

def train_ensemble(config):
    vars = ast.literal_eval(config['colset'])
    Xloc = X[vars]
    # Split data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(Xloc, y, test_size=0.2, random_state=42)
    
    # Initialize the models with the given hyperparameters
    xgb_model = XGBClassifier(
        eta=config["xgb_eta"],
        max_depth=config["xgb_max_depth"],
        min_child_weight=config["xgb_min_child_weight"],
        subsample=config["xgb_subsample"],
        colsample_bytree=config["xgb_colsample_bytree"],
        reg_lambda=config["xgb_lambda"],
        reg_alpha=config["xgb_alpha"],
        objective="binary:logistic",
        eval_metric="logloss",
        use_label_encoder=False,
        verbosity=0
    )

    # et_model = ExtraTreesClassifier(
    #     n_estimators=config["et_n_estimators"],
    #     max_depth=config["et_max_depth"],
    #     min_samples_split=config["et_min_samples_split"],
    #     min_samples_leaf=config["et_min_samples_leaf"],
    #     max_features=config["et_max_features"],
    #     bootstrap=config["et_bootstrap"]
    # )
    
    svc_model = SVC(
        C=config["svc_C"],
        kernel=config["svc_kernel"],
        probability=True
    )

    lr_model = LogisticRegression(
        C=config["lr_C"],
        solver=config["lr_solver"],
        max_iter=1000
    )

    nb_model = GaussianNB()
    
    ensemble_model = VotingClassifier(
        estimators=[
            ('xgb', xgb_model),
            # ('et', et_model),
            ('svc', svc_model),
            ('lr', lr_model),
            ('nb', nb_model)
        ],
        voting='soft',
        weights=[
            config["weight_xgb"],
            # config["weight_et"],
            config["weight_svc"],
            config["weight_lr"],
            config["weight_nb"]
        ]
    )
    
    # Train the ensemble model
    ensemble_model.fit(X_train, y_train)
    
    # Predict probabilities on the validation set
    ensemble_preds_proba = ensemble_model.predict_proba(X_val)[:, 1]
    
    # Get the indices of the top 20% predictions
    top_20_percent_indices = np.argsort(ensemble_preds_proba)[-int(0.2 * len(ensemble_preds_proba)):]

    # Select the corresponding true labels and predicted labels for top 20%
    top_20_true = y_val.iloc[top_20_percent_indices].values
    top_20_preds = (ensemble_preds_proba[top_20_percent_indices] >= 0.5).astype(int)
    
    # Calculate the number of correct class 1 predictions
    correct_class_1_predictions = sum(1 for true, pred in zip(top_20_true, top_20_preds) if true == 1 and pred == 1)
    
    # Calculate the number of features used
    num_features_used = Xloc.shape[1]

    # Scaled score calculation
    customer_scaled = (correct_class_1_predictions / len(top_20_preds)) * 1000
    customer_gain = 10 * customer_scaled
    variable_cost = 200 * num_features_used
    custom_score = customer_gain - variable_cost
    
    # Report the custom score
    report({"custom_score": custom_score})

# Define the search space using ConfigSpace
config_space = CS.ConfigurationSpace()

# Hyperparameters for XGBoost
config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('xgb_eta', lower=0.01, upper=0.1, log=True))
config_space.add_hyperparameter(CSH.UniformIntegerHyperparameter('xgb_max_depth', lower=3, upper=10))
config_space.add_hyperparameter(CSH.UniformIntegerHyperparameter('xgb_min_child_weight', lower=1, upper=5))
config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('xgb_subsample', lower=0.5, upper=1.0))
config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('xgb_colsample_bytree', lower=0.5, upper=1.0))
config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('xgb_lambda', lower=1e-3, upper=10.0, log=True))
config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('xgb_alpha', lower=1e-3, upper=10.0, log=True))

# Hyperparameters for ExtraTrees
# config_space.add_hyperparameter(CSH.UniformIntegerHyperparameter('et_n_estimators', lower=50, upper=200))
# config_space.add_hyperparameter(CSH.UniformIntegerHyperparameter('et_max_depth', lower=3, upper=20))
# config_space.add_hyperparameter(CSH.UniformIntegerHyperparameter('et_min_samples_split', lower=2, upper=10))
# config_space.add_hyperparameter(CSH.UniformIntegerHyperparameter('et_min_samples_leaf', lower=1, upper=10))
# config_space.add_hyperparameter(CSH.CategoricalHyperparameter('et_max_features', ['sqrt', 'log2']))
# config_space.add_hyperparameter(CSH.CategoricalHyperparameter('et_bootstrap', [True, False]))

# Hyperparameters for SVC
config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('svc_C', lower=0.1, upper=10.0, log=True))
config_space.add_hyperparameter(CSH.CategoricalHyperparameter('svc_kernel', ['linear', 'rbf', 'poly', 'sigmoid']))

# Hyperparameters for Logistic Regression
config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('lr_C', lower=0.1, upper=10.0, log=True))
config_space.add_hyperparameter(CSH.CategoricalHyperparameter('lr_solver', ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']))

# Weights for the ensemble
config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('weight_xgb', lower=0.0, upper=1.0))
#config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('weight_et', lower=0.0, upper=1.0))
config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('weight_svc', lower=0.0, upper=1.0))
config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('weight_lr', lower=0.0, upper=1.0))
config_space.add_hyperparameter(CSH.UniformFloatHyperparameter('weight_nb', lower=0.0, upper=1.0))

# Hyperparameters for feature sets
config_space.add_hyperparameter(CSH.CategoricalHyperparameter('colset', [
    '[101, 102, 103, 105]', '[101, 102, 103]', '[101, 102, 105]', '[101, 103, 105]', 
    '[102, 103, 105]', '[104, 102, 103, 105]', '[100, 101, 102, 103, 105]', '[100, 101, 102, 103]', 
    '[100, 101, 102, 105]', '[100, 101, 103, 105]', '[100, 102, 103, 105]', '[100, 104, 102, 103, 105]', 
    '[100, 101, 102, 103, 104, 105]'
]))

# Set up the BOHB search algorithm
bohb_search = TuneBOHB(config_space, metric="custom_score", mode="max")

# Set up the HyperBandForBOHB scheduler
bohb_scheduler = HyperBandForBOHB(
    time_attr="training_iteration",
    metric="custom_score",
    mode="max"
)

# Function to create shorter directory names
def trial_dirname_creator(trial):
    return f"trial_{trial.trial_id}"

# Run the hyperparameter search
analysis = tune.run(
    train_ensemble,
    resources_per_trial={"cpu": 1, "gpu": 0},
    search_alg=bohb_search,
    scheduler=bohb_scheduler,
    num_samples=500,
    trial_dirname_creator=trial_dirname_creator,
    raise_on_failed_trial=False
)

# Get the best result
best_config = analysis.get_best_config(metric="custom_score", mode="max")
print("Best config: ", best_config)


2024-06-02 22:00:44,606	INFO worker.py:1749 -- Started a local Ray instance.
2024-06-02 22:00:45,597	INFO tune.py:253 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `tune.run(...)`.
2024-06-02 22:00:45,598	INFO tune.py:614 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2024-06-02 22:16:41
Running for:,00:15:55.24
Memory:,24.9/63.8 GiB

Trial name,status,loc,colset,lr_C,lr_solver,svc_C,svc_kernel,weight_lr,weight_nb,weight_svc,weight_xgb,xgb_alpha,xgb_colsample_bytree,xgb_eta,xgb_lambda,xgb_max_depth,xgb_min_child_weight,xgb_subsample,iter,total time (s),custom_score
train_ensemble_e0f49734,TERMINATED,127.0.0.1:14680,"[100, 104, 102,_69c0",0.484938,lbfgs,0.126215,linear,0.112939,0.732059,0.973503,0.09909,0.830174,0.525952,0.0455705,0.0547733,8,1,0.595487,1,0.921642,6850
train_ensemble_57e1a450,TERMINATED,127.0.0.1:21328,"[100, 101, 102,_2380",0.982341,saga,0.392889,rbf,0.945115,0.366296,0.740207,0.64266,0.341481,0.539108,0.0797305,0.021864,3,2,0.721785,1,1.19688,6850
train_ensemble_7ea2a20b,TERMINATED,127.0.0.1:56116,"[104, 102, 103, 105]",0.613769,sag,4.148,sigmoid,0.738068,0.766039,0.939035,0.285325,0.00317232,0.973114,0.0540553,0.85506,10,3,0.64,1,1.29687,7000
train_ensemble_a3e83ad6,TERMINATED,127.0.0.1:25484,"[100, 101, 102,_2380",5.28949,liblinear,3.79535,poly,0.589578,0.124927,0.504864,0.488574,0.00247231,0.742047,0.0852046,0.00234956,9,3,0.53975,1,1.83358,6950
train_ensemble_ed134764,TERMINATED,127.0.0.1:25328,"[100, 101, 103, 105]",5.66406,newton-cg,2.33487,poly,0.545024,0.366589,0.361278,0.570056,2.63743,0.532425,0.0100091,0.0098339,5,4,0.796597,1,1.49245,6900
train_ensemble_6e30c5cd,TERMINATED,127.0.0.1:11848,"[100, 101, 102, 105]",0.208937,newton-cg,1.50309,linear,0.108499,0.234078,0.387825,0.714539,0.382134,0.746916,0.0234621,0.489502,6,5,0.82952,1,1.06294,7000
train_ensemble_0d6f8b39,TERMINATED,127.0.0.1:45156,"[100, 101, 102,_2380",0.153801,saga,3.65727,rbf,0.55875,0.588381,0.93011,0.280332,1.82316,0.926599,0.0196808,0.370939,10,3,0.538594,1,1.43899,6900
train_ensemble_d7ebc2d5,TERMINATED,127.0.0.1:51924,"[101, 103, 105]",3.08311,newton-cg,3.79163,poly,0.498102,0.0267205,0.693956,0.73079,0.0808021,0.557772,0.0119964,0.0269301,4,3,0.828887,1,1.8548,6650
train_ensemble_b726dbe0,TERMINATED,127.0.0.1:15660,"[100, 101, 102, 103]",0.207601,sag,4.59242,linear,0.339862,0.577387,0.0237844,0.952145,0.486682,0.899837,0.076932,0.484227,5,2,0.884707,1,1.36868,6950
train_ensemble_4f5d1986,TERMINATED,127.0.0.1:34692,"[100, 101, 103, 105]",0.347062,lbfgs,0.135547,rbf,0.100085,0.948731,0.800531,0.580838,0.0353018,0.675184,0.0961852,0.00326502,7,1,0.637394,1,1.25488,6750




[36m(train_ensemble pid=14680)[0m   y = column_or_1d(y, warn=True)
[36m(train_ensemble pid=14680)[0m   y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Trial name,custom_score
train_ensemble_01a5aca9,6750
train_ensemble_0235a1b9,7200
train_ensemble_025806d4,7400
train_ensemble_02a82daa,6800
train_ensemble_03848274,6950
train_ensemble_0445b470,6750
train_ensemble_04e56d35,7300
train_ensemble_0530be7b,7250
train_ensemble_05b43db1,6650
train_ensemble_0616a3cd,6700


[36m(train_ensemble pid=25484)[0m   y = column_or_1d(y, warn=True)[32m [repeated 3x across cluster][0m
[36m(train_ensemble pid=25484)[0m   y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)[32m [repeated 3x across cluster][0m
[36m(train_ensemble pid=45156)[0m   y = column_or_1d(y, warn=True)[32m [repeated 3x across cluster][0m
[36m(train_ensemble pid=45156)[0m   y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)[32m [repeated 3x across cluster][0m
[36m(train_ensemble pid=34692)[0m   y = column_or_1d(y, warn=True)[32m [repeated 3x across cluster][0m
[36m(train_ensemble pid=34692)[0m   y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)[32m [repeated 3x across cluster][0m
[36m(train_ensemble pid=40440)[0m   y = column_or_1d(y, warn=True)[32m [repeated 3x across cluster][0m
[36m(train_ensemble pid=40440)[0m   y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)[32m [repeated 3x across cluster][0m
[36m(train_ensemble pid=48880)[0m 

Best config:  {'colset': '[102, 103, 105]', 'lr_C': 0.11571707559506507, 'lr_solver': 'newton-cg', 'svc_C': 0.1696684360739591, 'svc_kernel': 'rbf', 'weight_lr': 0.06140413456556626, 'weight_nb': 0.07067428735903525, 'weight_svc': 0.9183199868143154, 'weight_xgb': 0.2553230191779601, 'xgb_alpha': 0.2791474323818901, 'xgb_colsample_bytree': 0.8182440566054697, 'xgb_eta': 0.0201501034795639, 'xgb_lambda': 0.027568612453583018, 'xgb_max_depth': 9, 'xgb_min_child_weight': 1, 'xgb_subsample': 0.566849382645442}


In [3]:
df=analysis.dataframe()
df.to_csv('ensemble4-raytune-bohb.csv')

In [4]:
df.sort_values(by='custom_score', ascending=False, inplace=True)
print(df.head())

     custom_score   timestamp checkpoint_dir_name   done  training_iteration  \
16         7500.0  1717358604                None  False                   1   
191        7500.0  1717358817                None  False                   1   
153        7500.0  1717358749                None  False                   1   
235        7500.0  1717358895                None  False                   1   
221        7500.0  1717358870                None  False                   1   

     trial_id                 date  time_this_iter_s  time_total_s    pid  \
16   21071ccf  2024-06-02_22-03-24          1.278706      1.278706  23728   
191  b6084f18  2024-06-02_22-06-57          1.283568      1.283568  44072   
153  f2b25af6  2024-06-02_22-05-49          1.242032      1.242032  64548   
235  89f5eaa1  2024-06-02_22-08-15          0.923303      0.923303  24872   
221  0bf09362  2024-06-02_22-07-50          0.932133      0.932133  52520   

     ... config/weight_svc config/weight_xgb  config/xgb