In [1]:
import pandas as pd

# Load the feature matrix and the labels; both files are space-separated
# and have no header row, so pandas assigns integer column labels.
X = pd.read_csv('../data/x_train.txt', sep=' ', header=None)
y = pd.read_csv('../data/y_train.txt', sep=' ', header=None)

# Preview the first rows of each frame (title line followed by the frame).
print("x:", X.head(), sep="\n")
print("\ny:", y.head(), sep="\n")


x:
        0         1         2         3         4         5         6    \
0 -2.619773 -2.619533 -1.199350 -1.083335 -1.000910 -0.366967 -2.164037   
1 -1.415579 -1.782544 -2.880270 -1.958863  1.159968  0.273030 -1.628728   
2 -2.745092 -1.382945 -1.626015 -1.282560 -0.663146  0.052349 -2.403322   
3  0.618998  0.455364 -0.115081  0.649040 -0.862207  2.308504  0.526114   
4 -0.070694 -0.550509 -0.565556 -0.693065 -0.573089 -0.395862  0.003170   

        7         8         9    ...        490        491        492  \
0 -1.210001 -0.658311 -1.489539  ...  10.849925  10.343346  10.717519   
1 -0.175813 -0.916857 -0.570166  ...  11.489417   5.195818   3.494627   
2 -0.765073 -0.394354 -0.806624  ...  13.934934   9.267515   4.705604   
3 -1.094852  1.088656 -0.481210  ...  12.021328   3.852231  11.059702   
4 -0.981609 -0.505775 -0.758430  ...   7.537788  11.229665  11.318915   

        493        494        495        496        497        498        499  
0  7.709295   5.894554  12.

In [2]:
# Column labels of the features kept for modelling: 100..105 plus column 6.
# NOTE(review): this name shadows the builtin `vars()`; it is kept because
# the following cell refers to it.
vars = [*range(100, 106), 6]

In [3]:
# Restrict the feature matrix to the selected columns (all rows kept).
X = X.loc[:, vars]

In [6]:
import ray
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.train import report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

# X and y are the DataFrames prepared in the earlier cells.
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# identical across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

def train_random_forest(config):
    """Ray Tune trainable: fit a random forest and report top-20% accuracy.

    Reads the module-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``
    created by the train/validation split in this cell.

    Args:
        config: Mapping of hyperparameters for the trial. Must contain
            ``n_estimators``, ``max_depth``, ``min_samples_split``,
            ``min_samples_leaf``, ``max_features`` and ``bootstrap``.
            An optional ``random_state`` entry seeds the forest; when it is
            absent the forest is left unseeded.

    Reports:
        ``top_20_accuracy``: accuracy (predictions thresholded at 0.5) over
        the 20% of validation rows with the highest value in column 1 of
        ``predict_proba``.
    """
    model = RandomForestClassifier(
        n_estimators=config["n_estimators"],
        max_depth=config["max_depth"],
        min_samples_split=config["min_samples_split"],
        min_samples_leaf=config["min_samples_leaf"],
        max_features=config["max_features"],
        bootstrap=config["bootstrap"],
        random_state=config.get("random_state"),
    )

    # y_train comes from pd.read_csv and is therefore a DataFrame; flatten it
    # so scikit-learn receives a 1-D target instead of a column vector
    # (which triggers a DataConversionWarning on every trial).
    model.fit(X_train, np.ravel(y_train))

    # Column 1 of predict_proba is used as the ranking score.
    # NOTE(review): this assumes a binary target whose second class is the
    # positive one - confirm against the label file.
    preds_proba = model.predict_proba(X_val)[:, 1]

    # Keep at least one row: for fewer than five validation rows the count
    # truncates to 0 and a slice of [-0:] would select every row.
    top_count = max(1, int(0.2 * len(preds_proba)))
    top_indices = np.argsort(preds_proba)[-top_count:]

    # True labels (flattened the same way as for training) and thresholded
    # predictions for the selected rows.
    top_true = np.ravel(y_val)[top_indices]
    top_preds = (preds_proba[top_indices] >= 0.5).astype(int)

    top_20_accuracy = accuracy_score(top_true, top_preds)
    report({"top_20_accuracy": top_20_accuracy})

# Hyperparameter search space sampled by Ray Tune for each trial.
# tune.randint(lower, upper) draws integers with `upper` excluded, so e.g.
# n_estimators ranges over 50..199.
search_space = dict(
    n_estimators=tune.randint(50, 200),
    max_depth=tune.randint(3, 20),
    min_samples_split=tune.randint(2, 10),
    min_samples_leaf=tune.randint(1, 10),
    max_features=tune.choice(["sqrt", "log2"]),
    bootstrap=tune.choice([True, False]),
)

# ASHA scheduler that ranks trials by the reported top_20_accuracy
# (higher is better).
# NOTE(review): train_random_forest in this notebook calls report() once per
# trial, so every trial ends at iteration 1 and there is little for ASHA to
# stop early - confirm whether a scheduler is needed here.
scheduler = ASHAScheduler(
    metric="top_20_accuracy", mode="max",
    grace_period=1, max_t=10,
    reduction_factor=2,
)

def trial_dirname_creator(trial):
    """Return a short directory name for a trial: ``trial_<trial_id>``.

    Args:
        trial: Object exposing a ``trial_id`` attribute.

    Returns:
        The string ``"trial_"`` followed by the trial's id.
    """
    return "trial_{}".format(trial.trial_id)

# Launch the search: 1000 sampled configurations, each trial given one CPU
# and no GPU, with trial directories named by trial_dirname_creator.
analysis = tune.run(
    train_random_forest,
    config=search_space,
    num_samples=1000,
    scheduler=scheduler,
    resources_per_trial={"cpu": 1, "gpu": 0},
    trial_dirname_creator=trial_dirname_creator,
)

# Configuration of the trial that scored best on top_20_accuracy.
best_config = analysis.get_best_config(mode="max", metric="top_20_accuracy")
print("Best config: ", best_config)


2024-05-25 21:45:14,141	INFO tune.py:614 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2024-05-25 21:48:06
Running for:,00:02:52.65
Memory:,19.0/63.8 GiB

Trial name,status,loc,bootstrap,max_depth,max_features,min_samples_leaf,min_samples_split,n_estimators,iter,total time (s),top_20_accuracy
train_random_forest_4e078_00000,TERMINATED,127.0.0.1:38888,False,4,sqrt,4,4,94,1,0.405997,0.745
train_random_forest_4e078_00001,TERMINATED,127.0.0.1:49416,False,5,sqrt,6,6,141,1,0.682998,0.755
train_random_forest_4e078_00002,TERMINATED,127.0.0.1:22456,False,5,log2,1,8,56,1,0.276999,0.74
train_random_forest_4e078_00003,TERMINATED,127.0.0.1:16784,False,16,sqrt,9,4,75,1,0.782998,0.75
train_random_forest_4e078_00004,TERMINATED,127.0.0.1:39788,True,16,sqrt,5,6,130,1,0.981997,0.755
train_random_forest_4e078_00005,TERMINATED,127.0.0.1:11796,False,11,log2,9,9,138,1,1.12,0.75
train_random_forest_4e078_00006,TERMINATED,127.0.0.1:9452,False,7,sqrt,7,6,51,1,0.341996,0.745
train_random_forest_4e078_00007,TERMINATED,127.0.0.1:4136,True,7,sqrt,3,5,61,1,0.320002,0.79
train_random_forest_4e078_00008,TERMINATED,127.0.0.1:50572,False,5,sqrt,4,3,153,1,0.772998,0.75
train_random_forest_4e078_00009,TERMINATED,127.0.0.1:38940,False,18,sqrt,8,2,79,1,0.877003,0.765


Trial name,top_20_accuracy
train_random_forest_4e078_00000,0.745
train_random_forest_4e078_00001,0.755
train_random_forest_4e078_00002,0.74
train_random_forest_4e078_00003,0.75
train_random_forest_4e078_00004,0.755
train_random_forest_4e078_00005,0.75
train_random_forest_4e078_00006,0.745
train_random_forest_4e078_00007,0.79
train_random_forest_4e078_00008,0.75
train_random_forest_4e078_00009,0.765


2024-05-25 21:48:06,792	INFO tune.py:1007 -- Wrote the latest version of all result files and experiment state to 'C:/Users/Mieszko/ray_results/train_random_forest_2024-05-25_21-45-14' in 0.5535s.
2024-05-25 21:48:06,906	INFO tune.py:1039 -- Total run time: 172.77 seconds (172.09 seconds for the tuning loop).


Best config:  {'n_estimators': 100, 'max_depth': 13, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'bootstrap': True}


In [8]:
# Collect the per-trial results into a DataFrame and save them next to the
# notebook (the frame's index is written as the first CSV column).
df = analysis.dataframe()
df.to_csv('rf_raytune.csv')

In [10]:
# Order trials from best to worst top_20_accuracy and show the leaders.
# Rebinding `df` keeps it sorted for any later cell without mutating the
# frame in place.
df = df.sort_values(by='top_20_accuracy', ascending=False)
print(df.head())

     top_20_accuracy   timestamp checkpoint_dir_name   done  \
221            0.805  1716666352                None  False   
392            0.800  1716666380                None  False   
481            0.795  1716666396                None  False   
275            0.790  1716666362                None  False   
354            0.790  1716666375                None  False   

     training_iteration     trial_id                 date  time_this_iter_s  \
221                   1  4e078_00221  2024-05-25_21-45-52          0.838000   
392                   1  4e078_00392  2024-05-25_21-46-20          0.325000   
481                   1  4e078_00481  2024-05-25_21-46-36          0.852999   
275                   1  4e078_00275  2024-05-25_21-46-02          0.565002   
354                   1  4e078_00354  2024-05-25_21-46-15          1.270998   

     time_total_s    pid  ...    node_ip time_since_restore  \
221      0.838000   2960  ...  127.0.0.1           0.838000   
392      0.325000  1