In [5]:
import joblib
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


In [6]:
# Read the raw CSV and discard the identifier column when the file has one;
# errors='ignore' makes the drop a no-op if 'id' is absent.
df = pd.read_csv('/content/breast-cancer.csv')
df = df.drop(columns=['id'], errors='ignore')

# Name of the label column; every other column is treated as a feature.
target_column = 'diagnosis'   # Change if different

y = df[target_column]
X = df.drop(columns=target_column)

# Quick sanity check: overall size and class balance.
print("Dataset Shape:", df.shape)
print("Target Distribution:\n", y.value_counts())


Dataset Shape: (569, 31)
Target Distribution:
 diagnosis
B    357
M    212
Name: count, dtype: int64


In [7]:
# Stratified 80/20 hold-out split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [8]:
# Baseline: standardise the features, then fit a random forest with
# library-default hyperparameters (seeded for reproducibility).
# Step names 'scaler' / 'model' are referenced by the tuning grid, so keep them.
default_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(random_state=42)),
])

# fit() returns the fitted pipeline itself, so training and prediction chain.
y_pred_default = default_pipeline.fit(X_train, y_train).predict(X_test)


In [9]:
# Hyperparameter search space for the forest. The 'model__' prefix routes each
# setting to the pipeline step named 'model'.
param_grid = dict(
    model__n_estimators=[100, 200],
    model__max_depth=[None, 5, 10],
    model__min_samples_split=[2, 5],
    model__min_samples_leaf=[1, 2],
)


In [10]:
# Exhaustive search over param_grid, scored by accuracy with 5-fold
# cross-validation on the training split only; n_jobs=-1 uses every core.
grid_search = GridSearchCV(default_pipeline, param_grid, scoring='accuracy', cv=5, n_jobs=-1)

# fit() returns the fitted search object, so the winning pipeline can be
# read straight off the result.
best_model = grid_search.fit(X_train, y_train).best_estimator_


In [11]:
# Show the winning hyperparameter combination on its own line under the heading.
print("Best Parameters Found:", grid_search.best_params_, sep="\n")


Best Parameters Found:
{'model__max_depth': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 100}


In [12]:
# Predict the held-out test set with the best estimator chosen by the grid search.
y_pred_tuned = best_model.predict(X_test)


In [13]:
def evaluate_model(y_true, y_pred, model_name, pos_label=None):
    """Print headline classification metrics followed by the per-class report.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned element-for-element with ``y_true``.
    model_name : str
        Label shown in the banner that precedes the metrics.
    pos_label : label, optional
        Class to treat as "positive". When given, precision, recall and F1
        are computed for that class alone (``average='binary'``). When
        omitted, the support-weighted average over all classes is reported,
        which is the original behaviour of this helper.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    # Support-weighted recall is arithmetically identical to accuracy, so the
    # default headline numbers cannot reveal how well one specific class is
    # recovered; pass pos_label to get the figures for a single class.
    if pos_label is None:
        averaging = {'average': 'weighted'}
    else:
        # pos_label is only meaningful together with average='binary'.
        averaging = {'average': 'binary', 'pos_label': pos_label}

    print(f"\n===== {model_name} =====")
    print("Accuracy :", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred, **averaging))
    print("Recall   :", recall_score(y_true, y_pred, **averaging))
    print("F1 Score :", f1_score(y_true, y_pred, **averaging))
    print("\nClassification Report:\n")
    print(classification_report(y_true, y_pred))


# Report both models against the same held-out labels so the two sets of
# numbers are directly comparable: baseline first, then the tuned pipeline.
evaluate_model(y_true=y_test, y_pred=y_pred_default, model_name="Default RandomForest")
evaluate_model(y_true=y_test, y_pred=y_pred_tuned, model_name="Tuned RandomForest")



===== Default RandomForest =====
Accuracy : 0.9736842105263158
Precision: 0.9747368421052632
Recall   : 0.9736842105263158
F1 Score : 0.9734654095556351

Classification Report:

              precision    recall  f1-score   support

           B       0.96      1.00      0.98        72
           M       1.00      0.93      0.96        42

    accuracy                           0.97       114
   macro avg       0.98      0.96      0.97       114
weighted avg       0.97      0.97      0.97       114


===== Tuned RandomForest =====
Accuracy : 0.9736842105263158
Precision: 0.9747368421052632
Recall   : 0.9736842105263158
F1 Score : 0.9734654095556351

Classification Report:

              precision    recall  f1-score   support

           B       0.96      1.00      0.98        72
           M       1.00      0.93      0.96        42

    accuracy                           0.97       114
   macro avg       0.98      0.96      0.97       114
weighted avg       0.97      0.97      0.97       114

In [14]:
# Persist the tuned pipeline (scaler + random forest) to disk so it can be
# reloaded later without repeating the grid search.
joblib.dump(best_model, "tuned_rf_model.pkl")


['tuned_rf_model.pkl']