In [3]:
# scikit-learn ships a handful of small reference datasets in this module
from sklearn import datasets
# Fetch the bundled breast cancer dataset; the returned object exposes the
# feature matrix as `.data` and the class labels as `.target`
cancer = datasets.load_breast_cancer(return_X_y=False)

In [4]:
# Separate the feature matrix (X) from the label vector (y)
X, y = cancer.data, cancer.target

# Report the dimensions of both arrays, one per line
print(X.shape, y.shape, sep="\n")

(569, 30)
(569,)


In [5]:
# Hold out 20% of the samples as a test set and train on the remaining 80%.
# The split is stratified on the labels and seeded, so the same partition is
# produced on every run.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [10]:
# Baseline: an untuned random forest used as the reference point for tuning
from sklearn.ensemble import RandomForestClassifier

# 100 trees, fixed seed; `fit` hands back the fitted estimator itself,
# so construction and training can be chained
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

print("Baseline model trained")

Baseline model trained


In [11]:
# Score the baseline forest on the held-out test set

from sklearn.metrics import accuracy_score

# Predicted labels for the test samples, then the fraction that match the truth
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Baseline model accuracy:", accuracy)

Baseline model accuracy: 0.956140350877193


In [12]:
# Hyperparameter search space for the random forest:
# 2 tree counts x 3 depth limits x 2 feature-subset rules = 12 candidates

param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],  # None leaves tree depth unrestricted
    max_features=["sqrt", "log2"],
)

In [15]:
# Exhaustive search over param_grid, scored by cross-validated accuracy
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Five shuffled, class-balanced folds; seeded so fold membership is repeatable
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Every candidate is evaluated on the training data only; n_jobs=-1 uses all
# available cores. `fit` returns the fitted search object.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring="accuracy",
    cv=cv,
    n_jobs=-1,
).fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best CV accuracy:", grid_search.best_score_)

Best parameters: {'max_depth': 10, 'max_features': 'log2', 'n_estimators': 100}
Best CV accuracy: 0.964835164835165


In [18]:
# Take the best estimator found by the grid search and score it on the
# held-out test set (y_pred is overwritten with the tuned model's predictions)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("Tuned Test accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

Tuned Test accuracy: 0.956140350877193


In [19]:
# Build the confusion matrix that the sensitivity/specificity step reads from.
# Rows are the true classes, columns the predicted classes.
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:\n", cm)

Confusion Matrix:
 [[39  3]
 [ 2 70]]


In [21]:
# Sensitivity and specificity from the 2x2 confusion matrix.
# Class 0 is treated as the positive class here, so the matrix reads
#   [[tp, fn],
#    [fp, tn]]
# and flattening it row by row yields the four counts in that order.

tp, fn, fp, tn = cm.ravel()

# True positive rate: share of actual positives that were predicted positive
sensitivity = tp / (tp + fn)
# True negative rate: share of actual negatives that were predicted negative
specificity = tn / (tn + fp)

print("Sensitivity:", sensitivity)
print("Specificity:", specificity)

Sensitivity: 0.9285714285714286
Specificity: 0.9722222222222222


In [22]:
# Precision, recall and F1 for the predictions currently held in y_pred,
# scored with class 0 as the positive label

from sklearn.metrics import precision_score, recall_score, f1_score

precision = precision_score(y_true=y_test, y_pred=y_pred, pos_label=0)
recall = recall_score(y_true=y_test, y_pred=y_pred, pos_label=0)
f1 = f1_score(y_true=y_test, y_pred=y_pred, pos_label=0)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Precision: 0.9512195121951219
Recall: 0.9285714285714286
F1 Score: 0.9397590361445783


In [23]:
#Summarize metrics for assignment
#
# Positive-class convention: class 0 is scored as the positive class for every
# metric in this cell (in scikit-learn's breast cancer data, label 0 is
# "malignant" -- see cancer.target_names). The label is named once here so the
# confusion-matrix layout and the pos_label arguments cannot drift apart.

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

POSITIVE_LABEL = 0
NEGATIVE_LABEL = 1

#Prediction from tuned model
y_pred = grid_search.best_estimator_.predict(X_test)

#Confusion matrix
# Pin the label order explicitly so the matrix is always 2x2 and always laid out
# as [[tp, fn], [fp, tn]] (rows = true class, columns = predicted class),
# instead of relying on the implicit sorted-label ordering.
cm = confusion_matrix(y_test, y_pred, labels=[POSITIVE_LABEL, NEGATIVE_LABEL])

(tp, fn), (fp, tn) = cm

#Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label=POSITIVE_LABEL)
recall = recall_score(y_test, y_pred, pos_label=POSITIVE_LABEL)
f1 = f1_score(y_test, y_pred, pos_label=POSITIVE_LABEL)
sensitivity = tp / (tp + fn)  # true positive rate
specificity = tn / (tn + fp)  # true negative rate

# Sensitivity is recall of the positive class by definition; if the two differ,
# the hand-indexed confusion matrix no longer matches the pos_label convention.
assert abs(sensitivity - recall) < 1e-12, "confusion-matrix indexing disagrees with pos_label"

#Final print
print("=== Final Results===")
print("Confusion Matrix:\n", cm)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Sensitivity:", sensitivity)
print("Specificity:", specificity)

=== Final Results===
Confusion Matrix:
 [[39  3]
 [ 2 70]]
Accuracy: 0.956140350877193
Precision: 0.9512195121951219
Recall: 0.9285714285714286
F1 Score: 0.9397590361445783
Sensitivity: 0.9285714285714286
Specificity: 0.9722222222222222
