In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer


In [5]:
# Load the breast-cancer dataset that ships with scikit-learn
data = load_breast_cancer()

# Target labels are kept exactly as the loader returns them
y = data.target
# Feature matrix wrapped in a DataFrame, one named column per feature
x = pd.DataFrame(data=data.data, columns=data.feature_names)

In [6]:
# Sanity check on the feature matrix dimensions (rows, columns); recorded output is (569, 30)
x.shape

(569, 30)

In [7]:
# Sanity check on the target: its length should match the row count of x (recorded output is (569,))
y.shape

(569,)

In [10]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split identical across runs
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [11]:
# Shapes of the four split pieces, in the order: train features, test features, train labels, test labels
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((398, 30), (171, 30), (398,), (171,))

In [12]:
# First look at a 100-tree forest. Seeded with the same random_state used for the
# other forests in this notebook, so that fitting this instance (for example when
# cells are run out of order) still gives reproducible results.
rf = RandomForestClassifier(n_estimators=100, random_state=42)

In [13]:
# Display the (not yet fitted) estimator so its configuration can be inspected
rf

In [14]:
# Baseline random forest: 100 trees, with a fixed seed for reproducibility
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

In [15]:
# Train the baseline forest on the training split only; the test split is reserved for evaluation
rf.fit(x_train, y_train)

In [16]:
# Score the fitted baseline on the held-out test split
# (the recorded value equals the accuracy_score printed in the evaluation cell)
rf.score(x_test, y_test)

0.9707602339181286

In [18]:
# Baseline predictions for the test rows, kept for the metric calls that follow
y_pred = rf.predict(x_test)

In [19]:
# Evaluate the baseline model on the held-out test split
print("Accuracy (Basic Model):", accuracy_score(y_test, y_pred))
# sep="\n" starts the report on its own line without the space that print's default
# separator would otherwise insert in front of the header row; that extra space
# pushes the column titles one character out of line with the values beneath them.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

Accuracy (Basic Model): 0.9707602339181286
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.94      0.96        63
           1       0.96      0.99      0.98       108

    accuracy                           0.97       171
   macro avg       0.97      0.96      0.97       171
weighted avg       0.97      0.97      0.97       171



In [23]:
# Search space for the grid search: every combination of the values below is tried
param_grid = dict(
    # how many trees make up the forest
    n_estimators=[50, 100, 200],
    # maximum depth of each tree (None places no limit)
    max_depth=[None, 5, 10, 20],
    # minimum number of samples a node needs before it may be split
    min_samples_split=[2, 5, 10],
    # minimum number of samples that must remain in a leaf
    min_samples_leaf=[1, 2, 4],
    # rule for how many features are considered at each split
    max_features=['sqrt', 'log2'],
)

In [24]:
# Fresh, unfitted forest to hand to the grid search. Note that this rebinds `rf`, so the
# baseline fitted earlier is no longer reachable under that name; the n_estimators value
# given here is superseded by the n_estimators candidates listed in param_grid.
rf = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)

In [25]:
# Exhaustive hyper-parameter search over param_grid, using the random forest as the base model
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring='accuracy',  # candidates are ranked by accuracy
    cv=5,                # 5-fold cross-validation per candidate
    verbose=2,           # log progress while fitting
    n_jobs=-1,           # use all CPU cores
)

In [26]:
# Run the search on the training split only.
# The log reports 216 candidates x 5 folds = 1080 fits, so this is the slowest cell in the notebook.
grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


In [27]:
# best_score_ is the mean cross-validated score of the winning candidate, i.e. the
# average of the accuracy measured on the validation fold of each of the 5 splits.
# It is not a score computed on the full training set, so label it accordingly.
print("Best Mean Cross-Validated Accuracy:", grid_search.best_score_)

Best Score on Training Set: 0.9572468354430379


In [28]:
# Hyper-parameter combination that achieved the best cross-validated score
grid_search.best_params_

{'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 200}

In [29]:
# Best estimator from GridSearch: the model configured with the winning parameter combination
best_rf = grid_search.best_estimator_

# Predict labels for the held-out test split with the tuned model
y_pred_best = best_rf.predict(x_test)

In [30]:
# Evaluate the tuned model on the same held-out test split used for the baseline
print("Accuracy (Tuned Model):", accuracy_score(y_test, y_pred_best))
# sep="\n" starts the report on its own line without the space that print's default
# separator would otherwise insert in front of the header row; that extra space
# pushes the column titles one character out of line with the values beneath them.
print("Classification Report:", classification_report(y_test, y_pred_best), sep="\n")

Accuracy (Tuned Model): 0.9707602339181286
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.94      0.96        63
           1       0.96      0.99      0.98       108

    accuracy                           0.97       171
   macro avg       0.97      0.96      0.97       171
weighted avg       0.97      0.97      0.97       171

