In [1]:
import pandas as pd

# Read x_train.txt
X = pd.read_csv('../data/x_train.txt', sep=' ', header=None)

# Read y_train.txt
y = pd.read_csv('../data/y_train.txt', sep=' ', header=None)


# Display the data
print("x:")
print(X.head())

print("\ny:")
print(y.head())


x:
        0         1         2         3         4         5         6    \
0 -2.619773 -2.619533 -1.199350 -1.083335 -1.000910 -0.366967 -2.164037   
1 -1.415579 -1.782544 -2.880270 -1.958863  1.159968  0.273030 -1.628728   
2 -2.745092 -1.382945 -1.626015 -1.282560 -0.663146  0.052349 -2.403322   
3  0.618998  0.455364 -0.115081  0.649040 -0.862207  2.308504  0.526114   
4 -0.070694 -0.550509 -0.565556 -0.693065 -0.573089 -0.395862  0.003170   

        7         8         9    ...        490        491        492  \
0 -1.210001 -0.658311 -1.489539  ...  10.849925  10.343346  10.717519   
1 -0.175813 -0.916857 -0.570166  ...  11.489417   5.195818   3.494627   
2 -0.765073 -0.394354 -0.806624  ...  13.934934   9.267515   4.705604   
3 -1.094852  1.088656 -0.481210  ...  12.021328   3.852231  11.059702   
4 -0.981609 -0.505775 -0.758430  ...   7.537788  11.229665  11.318915   

        493        494        495        496        497        498        499  
0  7.709295   5.894554  12.

In [2]:
import ray
from ray import tune
from ray.tune.schedulers.hb_bohb import HyperBandForBOHB
from ray.tune.search.bohb import TuneBOHB
from ray.train import report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
import pandas as pd
import numpy as np
import ConfigSpace as CS
import ConfigSpace.hyperparameters as CSH
import ast
from scoring import scoring_function

# Assuming your data is in X and y dataframes

def train_naive_bayes(config):
    vars = ast.literal_eval(config['colset'])
    Xloc = X[vars]
    # Split data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(Xloc, y, test_size=0.2, random_state=42)
    
    # Initialize the model
    model = GaussianNB()
    
    # Train the model
    model.fit(X_train, y_train)

    # Predict probabilities on the validation set
    preds_proba = model.predict_proba(X_val)[:, 1]
    
    # Get the indices of the top 20% predictions
    top_20_percent_indices = np.argsort(preds_proba)[-int(0.2 * len(preds_proba)):]

    # Select the corresponding true labels and predicted labels for top 20%
    top_20_true = y_val.iloc[top_20_percent_indices].values
    top_20_preds = (preds_proba[top_20_percent_indices] >= 0.5).astype(int)
    
    # Calculate the number of correct class 1 predictions
    correct_class_1_predictions = sum(1 for true, pred in zip(top_20_true, top_20_preds) if true == 1 and pred == 1)
    
    # Calculate the number of features used
    num_features_used = Xloc.shape[1]

    # Scaled score calculation
    customer_scaled = (correct_class_1_predictions / len(top_20_preds)) * 1000
    customer_gain = 10 * customer_scaled
    variable_cost = 200 * num_features_used
    custom_score = customer_gain - variable_cost
    
    # Report the custom score
    report({"custom_score": custom_score})

# Define the search space using ConfigSpace
config_space = CS.ConfigurationSpace()

# Since Naive Bayes doesn't have many hyperparameters, we only consider feature subsets
config_space.add_hyperparameter(CSH.CategoricalHyperparameter('colset', [
    '[101, 102, 103, 105]', '[101, 102, 103]', '[101, 102, 105]', '[101, 103, 105]', 
    '[102, 103, 105]', '[104, 102, 103, 105]', '[100, 101, 102, 103, 105]', '[100, 101, 102, 103]', 
    '[100, 101, 102, 105]', '[100, 101, 103, 105]', '[100, 102, 103, 105]', '[100, 104, 102, 103, 105]', 
    '[100, 101, 102, 103, 104, 105]'
]))

# Set up the BOHB search algorithm
bohb_search = TuneBOHB(config_space, metric="custom_score", mode="max")

# Set up the HyperBandForBOHB scheduler
bohb_scheduler = HyperBandForBOHB(
    time_attr="training_iteration",
    metric="custom_score",
    mode="max"
)

# Function to create shorter directory names
def trial_dirname_creator(trial):
    return f"trial_{trial.trial_id}"

# Run the hyperparameter search
analysis = tune.run(
    train_naive_bayes,
    resources_per_trial={"cpu": 1, "gpu": 1},
    search_alg=bohb_search,
    scheduler=bohb_scheduler,
    num_samples=13,
    trial_dirname_creator=trial_dirname_creator,
    raise_on_failed_trial=False
)

# Get the best result
best_config = analysis.get_best_config(metric="custom_score", mode="max")
print("Best config: ", best_config)


2024-06-02 21:41:52,370	INFO worker.py:1749 -- Started a local Ray instance.
2024-06-02 21:41:53,804	INFO tune.py:253 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `tune.run(...)`.
2024-06-02 21:41:53,805	INFO tune.py:614 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2024-06-02 21:42:48
Running for:,00:00:54.58
Memory:,26.8/63.8 GiB

Trial name,status,loc,colset,iter,total time (s),custom_score
train_naive_bayes_6ab1aaac,TERMINATED,127.0.0.1:57368,"[100, 102, 103, 105]",1,0.00800228,7200
train_naive_bayes_805a589e,TERMINATED,127.0.0.1:55668,"[102, 103, 105]",1,0.0049994,7500
train_naive_bayes_3b281a94,TERMINATED,127.0.0.1:26016,"[101, 102, 105]",1,0.00499916,7200
train_naive_bayes_3fdd1dc6,TERMINATED,127.0.0.1:57888,"[101, 103, 105]",1,0.00499964,7250
train_naive_bayes_e1a4f94a,TERMINATED,127.0.0.1:35552,"[101, 103, 105]",1,0.00499988,7250
train_naive_bayes_eb3a39bf,TERMINATED,127.0.0.1:25032,"[101, 102, 105]",1,0.0049994,7200
train_naive_bayes_cad1a0e1,TERMINATED,127.0.0.1:27104,"[101, 103, 105]",1,0.00499964,7250
train_naive_bayes_8690c853,TERMINATED,127.0.0.1:39072,"[101, 102, 105]",1,0.00499892,7200
train_naive_bayes_bfa20203,TERMINATED,127.0.0.1:25224,"[101, 103, 105]",1,0.00900054,7250
train_naive_bayes_8b8815a7,TERMINATED,127.0.0.1:32052,"[101, 102, 105]",1,0.00700116,7200




[36m(train_naive_bayes pid=56380)[0m   y = column_or_1d(y, warn=True)


Trial name,custom_score
train_naive_bayes_3b281a94,7200
train_naive_bayes_3fdd1dc6,7250
train_naive_bayes_6302103d,7500
train_naive_bayes_6ab1aaac,7200
train_naive_bayes_7c19e5e6,7150
train_naive_bayes_805a589e,7500
train_naive_bayes_8690c853,7200
train_naive_bayes_8b8815a7,7200
train_naive_bayes_bfa20203,7250
train_naive_bayes_cad1a0e1,7250


[36m(train_naive_bayes pid=40352)[0m   y = column_or_1d(y, warn=True)[32m [repeated 2x across cluster][0m
[36m(train_naive_bayes pid=40660)[0m   y = column_or_1d(y, warn=True)[32m [repeated 3x across cluster][0m
[36m(train_naive_bayes pid=25804)[0m   y = column_or_1d(y, warn=True)[32m [repeated 3x across cluster][0m
[36m(train_naive_bayes pid=43988)[0m   y = column_or_1d(y, warn=True)[32m [repeated 3x across cluster][0m
2024-06-02 21:42:24,222	INFO hyperband.py:543 -- Restoring from a previous point in time. Previous=1; Now=1
[36m(train_naive_bayes pid=55668)[0m   y = column_or_1d(y, warn=True)[32m [repeated 3x across cluster][0m
2024-06-02 21:42:26,188	INFO hyperband.py:543 -- Restoring from a previous point in time. Previous=1; Now=1
2024-06-02 21:42:28,156	INFO hyperband.py:543 -- Restoring from a previous point in time. Previous=1; Now=1
2024-06-02 21:42:30,168	INFO hyperband.py:543 -- Restoring from a previous point in time. Previous=1; Now=1
[36m(train_naive_

Best config:  {'colset': '[102, 103, 105]'}


In [3]:
df=analysis.dataframe()
df.to_csv('nb-raytune_bohb.csv')

In [4]:
df.sort_values(by='custom_score', ascending=False, inplace=True)
print(df.head())

    custom_score   timestamp checkpoint_dir_name   done  training_iteration  \
1         7500.0  1717357346                None  False                   1   
12        7500.0  1717357368                None  False                   1   
3         7250.0  1717357350                None  False                   1   
4         7250.0  1717357352                None  False                   1   
6         7250.0  1717357356                None  False                   1   

    trial_id                 date  time_this_iter_s  time_total_s    pid  \
1   805a589e  2024-06-02_21-42-26          0.004999      0.004999  55668   
12  6302103d  2024-06-02_21-42-48          0.005999      0.005999  12840   
3   3fdd1dc6  2024-06-02_21-42-30          0.005000      0.005000  57888   
4   e1a4f94a  2024-06-02_21-42-32          0.005000      0.005000  35552   
6   cad1a0e1  2024-06-02_21-42-36          0.005000      0.005000  27104   

           hostname    node_ip  time_since_restore  iterations_since