In [6]:
import itertools

import numpy as np
import pandas as pd
from joblib import dump
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [7]:
# Training split. Feature files use their first CSV column as the row index;
# label files are read with index_col=False so no column is taken as the index.
train_x = pd.read_csv('ProcessedData/X_train.csv', index_col=0)
train_y = pd.read_csv('ProcessedData/Y_train.csv', index_col=False)

# Test split, read with the same options as the training split.
test_x = pd.read_csv('ProcessedData/X_test.csv', index_col=0)
test_y = pd.read_csv('ProcessedData/Y_test.csv', index_col=False)

In [8]:
# Convert the loaded frames to plain NumPy arrays for scikit-learn.
# np.asarray accepts a DataFrame as well as an ndarray, so this cell can be
# re-run safely: the names are rebound to ndarrays here, and calling
# .to_numpy() on them a second time would raise AttributeError.
train_x = np.asarray(train_x)
train_y = np.asarray(train_y).flatten()  # labels collapsed to a 1-D vector
test_x = np.asarray(test_x)
test_y = np.asarray(test_y).flatten()

train_x, train_y

(array([[ 0.        , -0.05774286,  0.        , ...,  0.        ,
          0.        ,  1.        ],
        [ 0.        , -0.8249227 ,  0.        , ...,  0.        ,
          0.        ,  1.        ],
        [ 1.        , -0.05774286,  0.        , ...,  1.        ,
          0.        ,  0.        ],
        ...,
        [ 0.        , -0.47620457,  0.        , ...,  0.        ,
          1.        ,  0.        ],
        [ 0.        ,  0.36071885,  0.        , ...,  0.        ,
          0.        ,  1.        ],
        [ 0.        , -0.05774286,  0.        , ...,  1.        ,
          0.        ,  0.        ]]),
 array([0., 0., 1., ..., 0., 0., 0.]))

In [5]:
# Exhaustive grid search over random-forest hyperparameters.
#
# NOTE(review): every configuration is scored on the test split, so the best
# score found here is an optimistic estimate of generalisation. Prefer
# cross-validation on the training data for model selection.
# NOTE(review): a node must hold at least 2 * min_samples_leaf samples before
# it can be split, so with min_samples_leaf >= 10 the min_samples_split values
# below never bind (the logged scores are identical across them).
params = {"n_estimators": [100, 300, 1000],
          "max_depth": [10, 30, 100],
          "min_samples_split": [2, 5, 10],
          "min_samples_leaf": [10, 30, 100]}

# One (score, hyperparameters) entry per fitted configuration, so the winner
# can be picked programmatically instead of by reading the log.
rf_results = []

for combo in itertools.product(*params.values()):
    # Pair each value with its grid key so names and values cannot drift apart.
    config = dict(zip(params, combo))

    # n_jobs=-1 fits the trees on all available cores; random_state stays
    # fixed, so the scores should be unchanged.
    model = RandomForestClassifier(**config, n_jobs=-1, random_state=42)
    model.fit(train_x, train_y)
    score = model.score(test_x, test_y)
    rf_results.append((score, config))

    described = ", ".join(f"{name}: {value}" for name, value in config.items())
    print(f"Score: {score}, {described}")

# Best-scoring configuration (first one wins on ties).
max(rf_results, key=lambda entry: entry[0])

Score: 0.7906843013225991, n_estimators: 100, max_depth: 10, min_samples_split: 2, min_samples_leaf: 10
Score: 0.7906843013225991, n_estimators: 100, max_depth: 10, min_samples_split: 2, min_samples_leaf: 30
Score: 0.7734330074755607, n_estimators: 100, max_depth: 10, min_samples_split: 2, min_samples_leaf: 100
Score: 0.7906843013225991, n_estimators: 100, max_depth: 10, min_samples_split: 5, min_samples_leaf: 10
Score: 0.7906843013225991, n_estimators: 100, max_depth: 10, min_samples_split: 5, min_samples_leaf: 30
Score: 0.7734330074755607, n_estimators: 100, max_depth: 10, min_samples_split: 5, min_samples_leaf: 100
Score: 0.7906843013225991, n_estimators: 100, max_depth: 10, min_samples_split: 10, min_samples_leaf: 10
Score: 0.7906843013225991, n_estimators: 100, max_depth: 10, min_samples_split: 10, min_samples_leaf: 30
Score: 0.7734330074755607, n_estimators: 100, max_depth: 10, min_samples_split: 10, min_samples_leaf: 100
Score: 0.7906843013225991, n_estimators: 100, max_depth: 3

In [None]:
# Exhaustive grid search over gradient-boosting hyperparameters.
#
# NOTE(review): every configuration is scored on the test split, so the best
# score found here is an optimistic estimate of generalisation. Prefer
# cross-validation on the training data for model selection.
params = {"n_estimators": [100, 300, 1000],
          "learning_rate": [0.1, 0.03, 0.01],
          "max_depth": [10, 30, 100],
          "min_samples_split": [2, 5, 10],
          "min_samples_leaf": [10, 30, 100]}

# One (score, hyperparameters) entry per fitted configuration, so the winner
# can be picked programmatically instead of by reading the log.
gb_results = []

for combo in itertools.product(*params.values()):
    # Pair each value with its grid key so names and values cannot drift apart.
    config = dict(zip(params, combo))

    model = GradientBoostingClassifier(**config, random_state=42)
    model.fit(train_x, train_y)
    score = model.score(test_x, test_y)
    gb_results.append((score, config))

    # Log every searched hyperparameter, learning_rate included; without it,
    # runs that differ only in learning rate print identical parameter text.
    described = ", ".join(f"{name}: {value}" for name, value in config.items())
    print(f"Score: {score}, {described}")

# Best-scoring configuration (first one wins on ties).
max(gb_results, key=lambda entry: entry[0])

Score: 0.7981598619896493, n_estimators: 100, max_depth: 10, min_samples_split: 2, min_samples_leaf: 10
Score: 0.8021851638872916, n_estimators: 100, max_depth: 10, min_samples_split: 2, min_samples_leaf: 30
Score: 0.8039102932719954, n_estimators: 100, max_depth: 10, min_samples_split: 2, min_samples_leaf: 100
Score: 0.7981598619896493, n_estimators: 100, max_depth: 10, min_samples_split: 5, min_samples_leaf: 10
Score: 0.8021851638872916, n_estimators: 100, max_depth: 10, min_samples_split: 5, min_samples_leaf: 30
Score: 0.8039102932719954, n_estimators: 100, max_depth: 10, min_samples_split: 5, min_samples_leaf: 100
Score: 0.7981598619896493, n_estimators: 100, max_depth: 10, min_samples_split: 10, min_samples_leaf: 10
Score: 0.8021851638872916, n_estimators: 100, max_depth: 10, min_samples_split: 10, min_samples_leaf: 30
Score: 0.8039102932719954, n_estimators: 100, max_depth: 10, min_samples_split: 10, min_samples_leaf: 100
Score: 0.7849338700402531, n_estimators: 100, max_depth: 3

In [9]:
# train final model and save
# These values all lie on the gradient-boosting search grid; presumably they
# are the best-scoring configuration from that search — TODO confirm.
final_params = {"n_estimators": 300,
                "learning_rate": 0.03,
                "max_depth": 10,
                "min_samples_split": 2,
                "min_samples_leaf": 100}
model = GradientBoostingClassifier(**final_params, random_state=42)

# Refit on the train and test rows combined, so the saved model has seen all
# labelled data. After this, no held-out rows remain to score this exact model.
all_x = np.concatenate((train_x, test_x), axis=0)
all_y = np.concatenate((train_y, test_y))
model.fit(all_x, all_y)

dump(model, "GradientBoostingClassifier.joblib")

['GradientBoostingClassifier.joblib']