In [2]:
import pandas as pd

# Read x_train.txt
X = pd.read_csv('../data/x_train.txt', sep=' ', header=None)

# Read y_train.txt
y = pd.read_csv('../data/y_train.txt', sep=' ', header=None)


# Display the data
print("x:")
print(X.head())

print("\ny:")
print(y.head())


x:
        0         1         2         3         4         5         6    \
0 -2.619773 -2.619533 -1.199350 -1.083335 -1.000910 -0.366967 -2.164037   
1 -1.415579 -1.782544 -2.880270 -1.958863  1.159968  0.273030 -1.628728   
2 -2.745092 -1.382945 -1.626015 -1.282560 -0.663146  0.052349 -2.403322   
3  0.618998  0.455364 -0.115081  0.649040 -0.862207  2.308504  0.526114   
4 -0.070694 -0.550509 -0.565556 -0.693065 -0.573089 -0.395862  0.003170   

        7         8         9    ...        490        491        492  \
0 -1.210001 -0.658311 -1.489539  ...  10.849925  10.343346  10.717519   
1 -0.175813 -0.916857 -0.570166  ...  11.489417   5.195818   3.494627   
2 -0.765073 -0.394354 -0.806624  ...  13.934934   9.267515   4.705604   
3 -1.094852  1.088656 -0.481210  ...  12.021328   3.852231  11.059702   
4 -0.981609 -0.505775 -0.758430  ...   7.537788  11.229665  11.318915   

        493        494        495        496        497        498        499  
0  7.709295   5.894554  12.

In [3]:
vars=[100, 101, 102, 103, 104, 105, 6]

In [4]:
X = X[vars]

In [9]:
import ray
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.train import report
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import xgboost as xgb
import pandas as pd
import numpy as np

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=41)

def train_xgboost(config):
    # Convert dataframes to DMatrix
    train_data = xgb.DMatrix(X_train, label=y_train)
    val_data = xgb.DMatrix(X_val, label=y_val)
    
    # Train the model
    results = {}
    bst = xgb.train(config, train_data, evals=[(val_data, "eval")], evals_result=results, verbose_eval=False)
    
    # Predict on the validation set
    preds = bst.predict(val_data)
    preds = np.round(preds)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_val, preds)
    report({"accuracy":accuracy})

# Define the search space
search_space = {
    "objective": "binary:logistic",
    "eval_metric": "error",
    "eta": tune.loguniform(0.01, 0.1),
    "max_depth": tune.randint(3, 10),
    "min_child_weight": tune.randint(1, 6),
    "subsample": tune.uniform(0.5, 1.0),
    "colsample_bytree": tune.uniform(0.5, 1.0),
    "lambda": tune.loguniform(1e-3, 10.0),
    "alpha": tune.loguniform(1e-3, 10.0)
}

# Set up the scheduler
scheduler = ASHAScheduler(
    metric="accuracy",
    mode="max",
    max_t=10,
    grace_period=1,
    reduction_factor=2
)

# Function to create shorter directory names
def trial_dirname_creator(trial):
    return f"trial_{trial.trial_id}"

# Run the hyperparameter search
analysis = tune.run(
    train_xgboost,
    resources_per_trial={"cpu": 1, "gpu": 1},
    config=search_space,
    num_samples=200,
    scheduler=scheduler,
    trial_dirname_creator=trial_dirname_creator
)

# Get the best result
best_config = analysis.get_best_config(metric="accuracy", mode="max")
print("Best config: ", best_config)


2024-05-25 13:15:09,907	INFO tune.py:614 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2024-05-25 13:16:50
Running for:,00:01:40.61
Memory:,22.6/63.8 GiB

Trial name,status,loc,alpha,colsample_bytree,eta,lambda,max_depth,min_child_weight,subsample,iter,total time (s),accuracy
train_xgboost_0c7c3_00000,TERMINATED,127.0.0.1:36240,5.34462,0.758503,0.0470866,7.2975,8,1,0.724192,1,0.0389998,0.685
train_xgboost_0c7c3_00001,TERMINATED,127.0.0.1:46984,0.00288563,0.702708,0.0389519,1.59247,9,2,0.870108,1,0.0429995,0.68
train_xgboost_0c7c3_00002,TERMINATED,127.0.0.1:46440,9.03554,0.761735,0.0231802,0.0016519,9,2,0.508966,1,0.0389991,0.69
train_xgboost_0c7c3_00003,TERMINATED,127.0.0.1:43852,0.653539,0.892972,0.0178262,0.0312002,8,3,0.690486,1,0.0420032,0.667
train_xgboost_0c7c3_00004,TERMINATED,127.0.0.1:36636,8.68414,0.623835,0.0241073,0.00180085,8,5,0.988268,1,0.0340044,0.679
train_xgboost_0c7c3_00005,TERMINATED,127.0.0.1:45312,0.00943294,0.825944,0.0242613,1.72462,3,5,0.655561,1,0.0309999,0.631
train_xgboost_0c7c3_00006,TERMINATED,127.0.0.1:39260,0.049192,0.515166,0.0256684,0.0852363,8,3,0.979767,1,0.0419991,0.666
train_xgboost_0c7c3_00007,TERMINATED,127.0.0.1:34928,0.0391877,0.785383,0.05845,4.58487,4,1,0.813896,1,0.033999,0.658
train_xgboost_0c7c3_00008,TERMINATED,127.0.0.1:39472,0.189301,0.900466,0.043472,0.0145872,7,4,0.930149,1,0.0429988,0.669
train_xgboost_0c7c3_00009,TERMINATED,127.0.0.1:30904,0.194007,0.673372,0.0395801,0.0501882,7,5,0.972356,1,0.0380006,0.687


Trial name,accuracy
train_xgboost_0c7c3_00000,0.685
train_xgboost_0c7c3_00001,0.68
train_xgboost_0c7c3_00002,0.69
train_xgboost_0c7c3_00003,0.667
train_xgboost_0c7c3_00004,0.679
train_xgboost_0c7c3_00005,0.631
train_xgboost_0c7c3_00006,0.666
train_xgboost_0c7c3_00007,0.658
train_xgboost_0c7c3_00008,0.669
train_xgboost_0c7c3_00009,0.687


2024-05-25 13:16:50,526	INFO tune.py:1007 -- Wrote the latest version of all result files and experiment state to 'C:/Users/Mieszko/ray_results/train_xgboost_2024-05-25_13-15-09' in 0.0320s.
2024-05-25 13:16:50,544	INFO tune.py:1039 -- Total run time: 100.64 seconds (100.58 seconds for the tuning loop).


Best config:  {'objective': 'binary:logistic', 'eval_metric': 'error', 'eta': 0.047124191913903775, 'max_depth': 9, 'min_child_weight': 5, 'subsample': 0.7427863998699067, 'colsample_bytree': 0.8228145162828944, 'lambda': 2.5911888773797593, 'alpha': 0.006584824935651421}


In [13]:
df=analysis.dataframe()

In [14]:
df.to_csv('tune_results.csv')

In [15]:
df.head()

Unnamed: 0,accuracy,timestamp,checkpoint_dir_name,done,training_iteration,trial_id,date,time_this_iter_s,time_total_s,pid,...,config/objective,config/eval_metric,config/eta,config/max_depth,config/min_child_weight,config/subsample,config/colsample_bytree,config/lambda,config/alpha,logdir
0,0.685,1716635711,,False,1,0c7c3_00000,2024-05-25_13-15-11,0.039,0.039,36240,...,binary:logistic,error,0.047087,8,1,0.724192,0.758503,7.297498,5.344616,0c7c3_00000
1,0.68,1716635713,,True,1,0c7c3_00001,2024-05-25_13-15-13,0.043,0.043,46984,...,binary:logistic,error,0.038952,9,2,0.870108,0.702708,1.592473,0.002886,0c7c3_00001
2,0.69,1716635715,,False,1,0c7c3_00002,2024-05-25_13-15-15,0.038999,0.038999,46440,...,binary:logistic,error,0.02318,9,2,0.508966,0.761735,0.001652,9.035537,0c7c3_00002
3,0.667,1716635717,,True,1,0c7c3_00003,2024-05-25_13-15-17,0.042003,0.042003,43852,...,binary:logistic,error,0.017826,8,3,0.690486,0.892972,0.0312,0.653539,0c7c3_00003
4,0.679,1716635719,,True,1,0c7c3_00004,2024-05-25_13-15-19,0.034004,0.034004,36636,...,binary:logistic,error,0.024107,8,5,0.988268,0.623835,0.001801,8.684144,0c7c3_00004


In [16]:
df.sort_values('accuracy', ascending=False).head()

Unnamed: 0,accuracy,timestamp,checkpoint_dir_name,done,training_iteration,trial_id,date,time_this_iter_s,time_total_s,pid,...,config/objective,config/eval_metric,config/eta,config/max_depth,config/min_child_weight,config/subsample,config/colsample_bytree,config/lambda,config/alpha,logdir
43,0.706,1716635798,,False,1,0c7c3_00043,2024-05-25_13-16-38,0.041999,0.041999,38768,...,binary:logistic,error,0.047124,9,5,0.742786,0.822815,2.591189,0.006585,0c7c3_00043
27,0.7,1716635765,,False,1,0c7c3_00027,2024-05-25_13-16-05,0.042,0.042,43652,...,binary:logistic,error,0.021559,7,1,0.890943,0.631236,0.027379,0.004428,0c7c3_00027
46,0.692,1716635804,,False,1,0c7c3_00046,2024-05-25_13-16-44,0.044561,0.044561,35312,...,binary:logistic,error,0.053248,9,3,0.763625,0.72551,0.002346,1.494335,0c7c3_00046
31,0.691,1716635774,,False,1,0c7c3_00031,2024-05-25_13-16-14,0.041,0.041,41992,...,binary:logistic,error,0.031388,8,5,0.985262,0.572583,0.01294,0.011627,0c7c3_00031
2,0.69,1716635715,,False,1,0c7c3_00002,2024-05-25_13-15-15,0.038999,0.038999,46440,...,binary:logistic,error,0.02318,9,2,0.508966,0.761735,0.001652,9.035537,0c7c3_00002


In [8]:
import ray
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.train import report
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import xgboost as xgb
import pandas as pd
import numpy as np

# Assuming your data is in X and y dataframes
# Split data into training and validation sets
#X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

def train_xgboost(config):
    # Split data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
    # Convert dataframes to DMatrix
    train_data = xgb.DMatrix(X_train, label=y_train)
    val_data = xgb.DMatrix(X_val, label=y_val)
    
    # Train the model
    results = {}
    bst = xgb.train(config, train_data, evals=[(val_data, "eval")], evals_result=results, verbose_eval=False)
    
    # Predict on the validation set
    preds_proba = bst.predict(val_data)
    
    # Get the indices of the top 20% predictions
    top_20_percent_indices = np.argsort(preds_proba)[-int(0.2 * len(preds_proba)):]
    
    # Select the corresponding true labels and predicted labels for top 20%
    top_20_true = y_val.iloc[top_20_percent_indices]
    top_20_preds = np.round(preds_proba[top_20_percent_indices])
    
    # Calculate accuracy for the top 20%
    top_20_accuracy = accuracy_score(top_20_true, top_20_preds)
    report({"top_20_accuracy":top_20_accuracy})

# Define the search space
search_space = {
    "objective": "binary:logistic",
    "eval_metric": "error",
    "eta": tune.loguniform(0.01, 0.1),
    "max_depth": tune.randint(3, 10),
    "min_child_weight": tune.randint(1, 6),
    "subsample": tune.uniform(0.5, 1.0),
    "colsample_bytree": tune.uniform(0.5, 1.0),
    "lambda": tune.loguniform(1e-3, 10.0),
    "alpha": tune.loguniform(1e-3, 10.0)
}

# Set up the scheduler
scheduler = ASHAScheduler(
    metric="top_20_accuracy",
    mode="max",
    max_t=10,
    grace_period=1,
    reduction_factor=2
)

# Function to create shorter directory names
def trial_dirname_creator(trial):
    return f"trial_{trial.trial_id}"

# Run the hyperparameter search
analysis = tune.run(
    train_xgboost,
    resources_per_trial={"cpu": 1, "gpu": 0},
    config=search_space,
    num_samples=250,
    scheduler=scheduler,
    trial_dirname_creator=trial_dirname_creator
)

# Get the best result
best_config = analysis.get_best_config(metric="top_20_accuracy", mode="max")
print("Best config: ", best_config)

2024-05-25 21:27:36,534	INFO tune.py:614 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2024-05-25 21:28:13
Running for:,00:00:37.18
Memory:,20.5/63.8 GiB

Trial name,status,loc,alpha,colsample_bytree,eta,lambda,max_depth,min_child_weight,subsample,iter,total time (s),top_20_accuracy
train_xgboost_d7a59_00000,TERMINATED,127.0.0.1:40996,0.003241,0.862589,0.0352807,0.0601774,6,4,0.787483,1,0.0550005,0.725
train_xgboost_d7a59_00001,TERMINATED,127.0.0.1:34852,0.0155165,0.986915,0.0583275,0.0642778,8,1,0.689091,1,0.0710022,0.72
train_xgboost_d7a59_00002,TERMINATED,127.0.0.1:25028,3.80711,0.913916,0.0519656,0.00105658,5,4,0.834375,1,0.0530014,0.715
train_xgboost_d7a59_00003,TERMINATED,127.0.0.1:29632,0.00573143,0.634214,0.0313209,3.3533,4,1,0.82106,1,0.0539987,0.715
train_xgboost_d7a59_00004,TERMINATED,127.0.0.1:11096,2.56897,0.892108,0.019976,5.71581,5,1,0.709859,1,0.0470004,0.705
train_xgboost_d7a59_00005,TERMINATED,127.0.0.1:46188,0.873408,0.937831,0.0186323,0.586813,6,2,0.658966,1,0.0480001,0.675
train_xgboost_d7a59_00006,TERMINATED,127.0.0.1:38276,3.66584,0.535365,0.0139796,0.0155391,7,3,0.711526,1,0.045001,0.685
train_xgboost_d7a59_00007,TERMINATED,127.0.0.1:16416,0.00509176,0.577653,0.0213127,0.00459568,5,1,0.649358,1,0.0639997,0.75
train_xgboost_d7a59_00008,TERMINATED,127.0.0.1:26240,4.48904,0.540475,0.0105324,0.325272,5,1,0.94124,1,0.052999,0.72
train_xgboost_d7a59_00009,TERMINATED,127.0.0.1:25124,0.133084,0.942687,0.0377129,0.168972,7,1,0.715267,1,0.0689993,0.77


Trial name,top_20_accuracy
train_xgboost_d7a59_00000,0.725
train_xgboost_d7a59_00001,0.72
train_xgboost_d7a59_00002,0.715
train_xgboost_d7a59_00003,0.715
train_xgboost_d7a59_00004,0.705
train_xgboost_d7a59_00005,0.675
train_xgboost_d7a59_00006,0.685
train_xgboost_d7a59_00007,0.75
train_xgboost_d7a59_00008,0.72
train_xgboost_d7a59_00009,0.77


2024-05-25 21:28:13,722	INFO tune.py:1007 -- Wrote the latest version of all result files and experiment state to 'C:/Users/Mieszko/ray_results/train_xgboost_2024-05-25_21-27-36' in 0.1590s.
2024-05-25 21:28:13,768	INFO tune.py:1039 -- Total run time: 37.23 seconds (37.02 seconds for the tuning loop).


Best config:  {'objective': 'binary:logistic', 'eval_metric': 'error', 'eta': 0.048824586372018405, 'max_depth': 8, 'min_child_weight': 2, 'subsample': 0.8566605240280805, 'colsample_bytree': 0.7611904928890438, 'lambda': 0.003038230305666901, 'alpha': 0.36847847534391676}


In [10]:
df2=analysis.dataframe()
df2.to_csv('tune_results2.csv')

In [14]:
df2.sort_values('top_20_accuracy', ascending=False).head()

Unnamed: 0,top_20_accuracy,timestamp,checkpoint_dir_name,done,training_iteration,trial_id,date,time_this_iter_s,time_total_s,pid,...,config/objective,config/eval_metric,config/eta,config/max_depth,config/min_child_weight,config/subsample,config/colsample_bytree,config/lambda,config/alpha,logdir
167,0.795,1716665281,,False,1,d7a59_00167,2024-05-25_21-28-01,0.07,0.07,24828,...,binary:logistic,error,0.088151,8,5,0.887521,0.941793,0.002818,1.643182,d7a59_00167
21,0.795,1716665260,,False,1,d7a59_00021,2024-05-25_21-27-40,0.068998,0.068998,30384,...,binary:logistic,error,0.048825,8,2,0.856661,0.76119,0.003038,0.368478,d7a59_00021
196,0.79,1716665287,,False,1,d7a59_00196,2024-05-25_21-28-07,0.072,0.072,16040,...,binary:logistic,error,0.064038,9,4,0.880959,0.584397,0.025367,0.057522,d7a59_00196
217,0.79,1716665289,,False,1,d7a59_00217,2024-05-25_21-28-09,0.047,0.047,21472,...,binary:logistic,error,0.094499,6,4,0.922602,0.646053,0.02661,0.606647,d7a59_00217
188,0.79,1716665285,,False,1,d7a59_00188,2024-05-25_21-28-05,0.061998,0.061998,43716,...,binary:logistic,error,0.065409,5,4,0.86263,0.73616,0.01777,0.001872,d7a59_00188
