In [1]:
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import numpy as np
import pandas as pd

In [3]:
# One seed shared by the train/test split and the forest, so that a fresh
# "Restart & Run All" reproduces the same search results and scores.
RANDOM_STATE = 42

# Feature matrix: the first CSV column is consumed as the index, the remaining
# columns become the columns of X.
X = pd.read_csv("data/X.csv", index_col=0).to_numpy()
# Labels: only the "class" column of y.csv is used.
y = pd.read_csv("data/y.csv", index_col=0)["class"].to_numpy()
# NOTE(review): rows of X and y are paired by position, which assumes both
# files list the samples in the same order -- confirm.

# Hold out 20% of the rows for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# The step name 'rf' is the prefix used by the 'rf__*' keys of the search grid.
# No scaler step: tree splits do not depend on the scale of the features.
pipeline = Pipeline([
    ('rf', RandomForestClassifier(random_state=RANDOM_STATE)),
])

In [12]:
# Candidate values for each RandomForestClassifier hyper-parameter, keyed by
# the bare parameter name. The full cross product is 5,184 combinations.
_rf_options = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2'],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy'],
    # The explicit dict weights label 0 five times as heavily as label 1.
    # NOTE(review): presumably the labels are exactly 0 and 1 -- confirm.
    'class_weight': [None, "balanced", "balanced_subsample", {0: 5, 1: 1}],
}

# Prefix every name with the pipeline step ('rf') so the search can route each
# value to the forest inside the pipeline.
param_grid = {f"rf__{option}": choices for option, choices in _rf_options.items()}

In [13]:
# Exhaustive search over every combination in param_grid, scored with 5-fold
# cross-validation on the training split only; n_jobs=-1 requests all cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    n_jobs=-1,
    cv=5,
)
# Kept as the last expression so the fitted search object is the cell output.
grid_search.fit(X_train, y=y_train)

In [14]:
# Pull the winning configuration, its cross-validation score and the
# corresponding fitted estimator out of the finished search in one step.
best_params, best_score, best_model = (
    grid_search.best_params_,
    grid_search.best_score_,
    grid_search.best_estimator_,
)

# Score the selected model on the held-out test split, which the search
# itself was never fitted on.
test_score = best_model.score(X_test, y_test)

# Report the search result next to the held-out score for comparison.
print("Best Parameters:", best_params)
print("Best Cross-validation Score:", best_score)
print("Test Set Score with Best Parameters:", test_score)

Best Parameters: {'rf__bootstrap': True, 'rf__class_weight': 'balanced_subsample', 'rf__criterion': 'entropy', 'rf__max_depth': 20, 'rf__max_features': None, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 5, 'rf__n_estimators': 100}
Best Cross-validation Score: 0.7737499999999999
Test Set Score with Best Parameters: 0.78
