In [1]:
import pandas as pd

# Load the training features (X) and labels (y).
# Both files are parsed as space-separated text with no header row, so the
# resulting columns are labelled with integers starting at 0.
X = pd.read_csv('../data/x_train.txt', header=None, sep=' ')
y = pd.read_csv('../data/y_train.txt', header=None, sep=' ')

# Preview the first rows of each frame (label on one line, frame below it).
print("x:", X.head(), sep="\n")
print("\ny:", y.head(), sep="\n")


x:
        0         1         2         3         4         5         6    \
0 -2.619773 -2.619533 -1.199350 -1.083335 -1.000910 -0.366967 -2.164037   
1 -1.415579 -1.782544 -2.880270 -1.958863  1.159968  0.273030 -1.628728   
2 -2.745092 -1.382945 -1.626015 -1.282560 -0.663146  0.052349 -2.403322   
3  0.618998  0.455364 -0.115081  0.649040 -0.862207  2.308504  0.526114   
4 -0.070694 -0.550509 -0.565556 -0.693065 -0.573089 -0.395862  0.003170   

        7         8         9    ...        490        491        492  \
0 -1.210001 -0.658311 -1.489539  ...  10.849925  10.343346  10.717519   
1 -0.175813 -0.916857 -0.570166  ...  11.489417   5.195818   3.494627   
2 -0.765073 -0.394354 -0.806624  ...  13.934934   9.267515   4.705604   
3 -1.094852  1.088656 -0.481210  ...  12.021328   3.852231  11.059702   
4 -0.981609 -0.505775 -0.758430  ...   7.537788  11.229665  11.318915   

        493        494        495        496        497        498        499  
0  7.709295   5.894554  12.

In [3]:
import ray
from ray import tune
from ray.tune.schedulers.hb_bohb import HyperBandForBOHB
from ray.tune.search.bohb import TuneBOHB
from ray.train import report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import ConfigSpace as CS
import ConfigSpace.hyperparameters as CSH
import ast
from scoring import scoring_function

# X (features) and y (labels) are the DataFrames loaded in the first cell; the trainable below reads them as globals.

def train_random_forest(config):
    """Ray Tune trainable: fit a random forest on a column subset and report a custom score.

    Reads the notebook-level ``X`` (features) and ``y`` (labels) frames, keeps
    only the columns named in ``config['colset']``, holds out 20% of the rows
    for validation, fits a ``RandomForestClassifier`` and scores it.

    The score rewards hits among the validation rows with the highest predicted
    class-1 probability (top 20%) and charges a fixed cost per feature used:

        custom_score = 10 * (1000 * hits / top_k) - 200 * n_features

    where ``hits`` counts top-k rows whose true label is 1 and whose predicted
    class-1 probability is at least 0.5.

    Args:
        config: Mapping with the keys ``colset`` (string form of a list of
            column labels, e.g. ``'[101, 102, 105]'``), ``n_estimators``,
            ``max_depth``, ``min_samples_split``, ``min_samples_leaf``,
            ``max_features`` and ``bootstrap``.

    Returns:
        None. The result is sent to Ray Tune via
        ``report({"custom_score": ...})``.
    """
    top_fraction = 0.2        # share of validation rows treated as "targeted"
    decision_threshold = 0.5  # minimum class-1 probability to count as a class-1 prediction
    seed = 42                 # shared by the split and the forest so trials are comparable

    # 'colset' is a categorical hyperparameter encoded as a string; decode it
    # back into a list of column labels. literal_eval only accepts literals.
    selected_columns = ast.literal_eval(config['colset'])
    Xloc = X[selected_columns]

    # y is loaded as a single-column DataFrame; flatten it to 1-D so that
    # scikit-learn does not warn about a column vector and so that the label
    # comparisons below are plain element-wise operations.
    labels = np.ravel(y)

    # Split data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(
        Xloc, labels, test_size=0.2, random_state=seed
    )

    # Initialize the model with the given hyperparameters. Seeding the forest
    # keeps the score a function of the hyperparameters rather than of the
    # random draw of an individual trial.
    model = RandomForestClassifier(
        n_estimators=config["n_estimators"],
        max_depth=config["max_depth"],
        min_samples_split=config["min_samples_split"],
        min_samples_leaf=config["min_samples_leaf"],
        max_features=config["max_features"],
        bootstrap=config["bootstrap"],
        random_state=seed,
    )

    # Train the model
    model.fit(X_train, y_train)

    # Probability of class 1 on the validation set. Look the column up through
    # classes_ instead of assuming it sits at index 1; if class 1 never occurs
    # in the training split, no row can be predicted as class 1.
    class_labels = list(model.classes_)
    if 1 in class_labels:
        preds_proba = model.predict_proba(X_val)[:, class_labels.index(1)]
    else:
        preds_proba = np.zeros(len(X_val))

    # Indices of the highest-probability rows. Keep at least one row: with a
    # count of 0 the slice [-0:] would select the whole validation set.
    top_k = max(1, int(top_fraction * len(preds_proba)))
    top_indices = np.argsort(preds_proba)[-top_k:]

    # True labels and thresholded predictions for the selected rows
    top_true = y_val[top_indices]
    top_is_class_1 = preds_proba[top_indices] >= decision_threshold

    # Number of selected rows that are truly class 1 and predicted as class 1
    correct_class_1_predictions = int(np.sum((top_true == 1) & top_is_class_1))

    # Number of features used
    num_features_used = Xloc.shape[1]

    # Scaled score: hit rate projected onto 1000 customers, 10 per hit,
    # minus 200 per feature used.
    customer_scaled = (correct_class_1_predictions / top_k) * 1000
    customer_gain = 10 * customer_scaled
    variable_cost = 200 * num_features_used
    custom_score = customer_gain - variable_cost

    report({"custom_score": custom_score})

# Search space (ConfigSpace): forest size/shape parameters plus the candidate
# feature subsets. Each subset is encoded as the string form of a list because
# it is sampled as a categorical value; the trainable decodes it again.
config_space = CS.ConfigurationSpace()
config_space.add_hyperparameters([
    CSH.UniformIntegerHyperparameter('n_estimators', lower=50, upper=200),
    CSH.UniformIntegerHyperparameter('max_depth', lower=3, upper=20),
    CSH.UniformIntegerHyperparameter('min_samples_split', lower=2, upper=10),
    CSH.UniformIntegerHyperparameter('min_samples_leaf', lower=1, upper=10),
    CSH.CategoricalHyperparameter(
        'colset',
        [
            '[101, 102, 103, 105]',
            '[101, 102, 103]',
            '[101, 102, 105]',
            '[101, 103, 105]',
            '[102, 103, 105]',
            '[104, 102, 103, 105]',
            '[100, 101, 102, 103, 105]',
            '[100, 101, 102, 103]',
            '[100, 101, 102, 105]',
            '[100, 101, 103, 105]',
            '[100, 102, 103, 105]',
            '[100, 104, 102, 103, 105]',
            '[100, 101, 102, 103, 104, 105]',
        ],
    ),
    CSH.CategoricalHyperparameter('max_features', ['sqrt', 'log2']),
    CSH.CategoricalHyperparameter('bootstrap', [True, False]),
])

# BOHB searcher: proposes configurations that maximise custom_score.
bohb_search = TuneBOHB(config_space, mode="max", metric="custom_score")

# Matching HyperBand scheduler, keyed on Ray's training_iteration counter.
# NOTE(review): the trainable reports a single result per trial, so every
# trial finishes at iteration 1 - confirm the scheduler is intended to do
# anything beyond pairing with the BOHB searcher.
bohb_scheduler = HyperBandForBOHB(
    mode="max",
    metric="custom_score",
    time_attr="training_iteration",
)

# Build a compact per-trial directory name from the trial's id.
def trial_dirname_creator(trial):
    """Return the directory name ``trial_<trial_id>`` for the given trial."""
    return "trial_{}".format(trial.trial_id)

# Run the hyperparameter search: 50 sampled configurations, proposed by the
# BOHB searcher and managed by the matching HyperBand scheduler.
#
# Each trial reserves one CPU and no GPU. The trainable fits a scikit-learn
# RandomForestClassifier, which runs on the CPU; reserving a GPU per trial
# would only limit how many trials Ray can run at once (and leave every trial
# pending on a machine without a GPU).
#
# raise_on_failed_trial=False lets the search continue when individual trials
# error out, so check the trial table for ERROR rows before trusting the result.
analysis = tune.run(
    train_random_forest,
    resources_per_trial={"cpu": 1},
    search_alg=bohb_search,
    scheduler=bohb_scheduler,
    num_samples=50,
    trial_dirname_creator=trial_dirname_creator,
    raise_on_failed_trial=False
)

# Get the best result: the configuration with the highest reported custom_score
best_config = analysis.get_best_config(metric="custom_score", mode="max")
print("Best config: ", best_config)


2024-05-29 12:59:31,364	INFO tune.py:614 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2024-05-29 13:02:28
Running for:,00:02:57.47
Memory:,23.4/63.8 GiB

Trial name,status,loc,bootstrap,colset,max_depth,max_features,min_samples_leaf,min_samples_split,n_estimators,iter,total time (s),custom_score
train_random_forest_35b804a7,TERMINATED,127.0.0.1:8556,True,"[101, 102, 105]",7,log2,10,7,122,1,0.250999,6950
train_random_forest_c40c3d30,TERMINATED,127.0.0.1:61360,True,"[100, 101, 102, 105]",15,sqrt,10,6,122,1,0.610913,6750
train_random_forest_7678a244,TERMINATED,127.0.0.1:42336,True,"[104, 102, 103, 105]",12,log2,7,9,137,1,0.64815,6700
train_random_forest_36330438,TERMINATED,127.0.0.1:54176,False,"[100, 102, 103, 105]",15,log2,7,9,166,1,1.22141,6550
train_random_forest_a1e1e1e3,TERMINATED,127.0.0.1:63416,True,"[101, 102, 103, 105]",4,sqrt,1,2,114,1,0.267999,6200
train_random_forest_9726e0f0,TERMINATED,127.0.0.1:28932,True,"[100, 101, 102, 103]",17,sqrt,10,8,92,1,0.493,6450
train_random_forest_11811e52,TERMINATED,127.0.0.1:50876,False,"[104, 102, 103, 105]",16,sqrt,4,10,70,1,0.559049,6450
train_random_forest_dd3f15b0,TERMINATED,127.0.0.1:32032,False,"[100, 101, 103, 105]",7,log2,7,8,128,1,0.579866,6250
train_random_forest_8f692ba3,TERMINATED,127.0.0.1:15572,True,"[104, 102, 103, 105]",8,log2,9,6,157,1,0.578975,6550
train_random_forest_359adbbc,TERMINATED,127.0.0.1:30428,False,"[101, 102, 105]",12,sqrt,2,10,97,1,0.373076,6600


Trial name,custom_score
train_random_forest_04ebb20d,6750
train_random_forest_0620afa0,6600
train_random_forest_0c3ba4d7,6700
train_random_forest_11811e52,6450
train_random_forest_128a751d,6650
train_random_forest_14455f44,6450
train_random_forest_1a84bcef,6650
train_random_forest_1c0c21fb,6900
train_random_forest_24a24f2a,6700
train_random_forest_31bfd7ff,6450


2024-05-29 13:01:31,157	INFO hyperband.py:543 -- Restoring from a previous point in time. Previous=1; Now=1
2024-05-29 13:01:33,570	INFO hyperband.py:543 -- Restoring from a previous point in time. Previous=1; Now=1
2024-05-29 13:01:36,598	INFO hyperband.py:543 -- Restoring from a previous point in time. Previous=1; Now=1
2024-05-29 13:01:39,224	INFO hyperband.py:543 -- Restoring from a previous point in time. Previous=1; Now=1
2024-05-29 13:01:41,571	INFO hyperband.py:543 -- Restoring from a previous point in time. Previous=1; Now=1
2024-05-29 13:01:44,522	INFO hyperband.py:543 -- Restoring from a previous point in time. Previous=1; Now=1
2024-05-29 13:01:46,528	INFO hyperband.py:543 -- Restoring from a previous point in time. Previous=1; Now=1
2024-05-29 13:01:48,458	INFO hyperband.py:543 -- Restoring from a previous point in time. Previous=1; Now=1
2024-05-29 13:01:50,510	INFO hyperband.py:543 -- Restoring from a previous point in time. Previous=1; Now=1
2024-05-29 13:01:52,493	INFO

Best config:  {'bootstrap': True, 'colset': '[104, 102, 103, 105]', 'max_depth': 11, 'max_features': 'log2', 'min_samples_leaf': 5, 'min_samples_split': 8, 'n_estimators': 125}


In [None]:
# Collect the per-trial results into a DataFrame and save them as CSV in the
# current working directory.
df = analysis.dataframe()
df.to_csv(path_or_buf='rf-raytune_bohb.csv')

In [None]:
# Rank trials from highest to lowest custom_score. The sort is applied to df
# itself, so later cells see the sorted frame. Then show the five best trials.
df.sort_values('custom_score', ascending=False, inplace=True)
print(df.head(5))

    custom_score  correct_class_1_predictions  num_features_used   timestamp  \
49           920                          152                  3  1716931033   
0            910                          151                  3  1716930968   
2            850                          145                  3  1716930973   
44           850                          145                  3  1716931031   
13           840                          144                  3  1716930983   

   checkpoint_dir_name   done  training_iteration  trial_id  \
49                None  False                   1  c94a92fc   
0                 None  False                   1  06c1d9e5   
2                 None  False                   1  a9fb2965   
44                None  False                   1  0ec8b142   
13                None  False                   1  261ee0a7   

                   date  time_this_iter_s  ...  time_since_restore  \
49  2024-05-28_23-17-13          0.469959  ...            0.469959   
