In [1]:
# Import necessary libraries
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Load the Breast Cancer Wisconsin dataset
data = load_breast_cancer()
X, y = data.data, data.target

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [3]:
# Step 1: Train a basic Random Forest model
basic_rf = RandomForestClassifier(random_state=42)
basic_rf.fit(X_train, y_train)

In [4]:
# Make predictions and calculate performance metrics for the basic model
y_pred_basic = basic_rf.predict(X_test)
basic_accuracy = accuracy_score(y_test, y_pred_basic)
basic_precision = precision_score(y_test, y_pred_basic)
basic_recall = recall_score(y_test, y_pred_basic)
basic_f1 = f1_score(y_test, y_pred_basic)

print("Basic Random Forest Performance:")
print(f"Accuracy: {basic_accuracy:.3f}")
print(f"Precision: {basic_precision:.3f}")
print(f"Recall: {basic_recall:.3f}")
print(f"F1 Score: {basic_f1:.3f}")


Basic Random Forest Performance:
Accuracy: 0.971
Precision: 0.964
Recall: 0.991
F1 Score: 0.977


In [5]:
# Step 2: Hyperparameter tuning using RandomizedSearchCV
# Define the hyperparameter grid
param_dist = {
    'n_estimators': [50, 100, 150, 200, 250],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}

# Initialize RandomizedSearchCV with Random Forest and parameter grid
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=30,  # 30 random combinations
    cv=3,       # 3-fold cross-validation
    scoring='accuracy',
    n_jobs=-1,
    random_state=42
)

# Fit the random search model
random_search.fit(X_train, y_train)

# Get the best model from the random search
best_rf = random_search.best_estimator_

# Make predictions and calculate performance metrics for the tuned model
y_pred_best = best_rf.predict(X_test)
tuned_accuracy = accuracy_score(y_test, y_pred_best)
tuned_precision = precision_score(y_test, y_pred_best)
tuned_recall = recall_score(y_test, y_pred_best)
tuned_f1 = f1_score(y_test, y_pred_best)

In [8]:
print("\nTuned Random Forest Performance:")
print(f"Best Parameters: {random_search.best_params_}")
print(f"Accuracy: {basic_accuracy:.3f}")
print(f"Precision: {basic_precision:.3f}")
print(f"Recall: {basic_recall:.3f}")
print(f"F1 Score: {basic_f1:.3f}")


Tuned Random Forest Performance:
Best Parameters: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': None, 'criterion': 'entropy', 'bootstrap': True}
Accuracy: 0.971
Precision: 0.964
Recall: 0.991
F1 Score: 0.977


In [9]:
print("Basic Random Forest Performance:")
print(f"Accuracy: {tuned_accuracy:.3f}")
print(f"Precision: {tuned_precision:.3f}")
print(f"Recall: {tuned_recall:.3f}")
print(f"F1 Score: {tuned_f1:.3f}")

Basic Random Forest Performance:
Accuracy: 0.965
Precision: 0.955
Recall: 0.991
F1 Score: 0.973


In [7]:
# Summary of both models
print("\nPerformance Comparison:")
print(f"{'Metric':<15} {'Tuned Model':<15} {'Basic Model':<15}")
print(f"{'-'*45}")
print(f"{'Accuracy':<15} {basic_accuracy:.3f} {' ' * 6} {tuned_accuracy:.3f}")
print(f"{'Precision':<15} {basic_precision:.3f} {' ' * 6} {tuned_precision:.3f}")
print(f"{'Recall':<15} {basic_recall:.3f} {' ' * 6} {tuned_recall:.3f}")
print(f"{'F1 Score':<15} {basic_f1:.3f} {' ' * 6} {tuned_f1:.3f}")


Performance Comparison:
Metric          Tuned Model     Basic Model    
---------------------------------------------
Accuracy        0.971        0.965
Precision       0.964        0.955
Recall          0.991        0.991
F1 Score        0.977        0.973
