In [2]:
# Load the breast cancer dataset directly from sklearn
from sklearn.datasets import load_breast_cancer


# Fetch the bundled dataset object, then pull the feature matrix and the
# label vector out of it under their own names.
data = load_breast_cancer()
X, y = data.data, data.target

print("Data loaded!")
# Report the dimensions of the features first, then of the labels.
for label, array in (("Shape of X:", X), ("Shape of y:", y)):
    print(label, array.shape)


Data loaded!
Shape of X: (569, 30)
Shape of y: (569,)


In [3]:

from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing. The split is stratified on y and
# uses a fixed seed so the same partition is produced on every run.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("\nData splitting done.")
# Number of rows that ended up in each partition.
print("Training set size:", len(X_train))
print("Testing set size:", len(X_test))



Data splitting done.
Training set size: 455
Testing set size: 114


In [5]:
# Standardize the features so that the models can train well.
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both the training and the test split.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\nScaling ho gya .")



Scaling ho gya .


In [6]:
# Now we want to train some models and compare them


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Baseline candidates, all with default hyperparameters except where shown.
# The forest and the single tree draw random numbers while fitting, so both
# get a fixed seed (the same 42 used for the train/test split); without it
# their scores change from one run of this cell to the next.
models = {
    'Logistic Regression': LogisticRegression(max_iter=500),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Naive Bayes': GaussianNB()
}

# One dict of scores per model, in the order the models are evaluated.
results = []


for name, model in models.items():
    # Fit on the scaled training split and predict the scaled test split.
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    acc = accuracy_score(y_test, y_pred)
    # Precision, recall and F1 are computed for label 1 (the scorers' default
    # pos_label).
    # NOTE(review): label 1 appears to be 'benign' per data.target_names --
    # confirm that this is the class the comparison should focus on.
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Per-model summary, rounded to four decimals for display only.
    print(f"{name}")
    print(f"  Accuracy: {acc:.4f}")
    print(f"  Precision: {prec:.4f}")
    print(f"  Recall: {rec:.4f}")
    print(f"  F1-Score: {f1:.4f}")
    print("-" * 40)

    # Keep the unrounded values for later tabulation.
    results.append({
        'Model': name,
        'Accuracy': acc,
        'Precision': prec,
        'Recall': rec,
        'F1-Score': f1
    })



Training different models...

Logistic Regression
  Accuracy: 0.9825
  Precision: 0.9861
  Recall: 0.9861
  F1-Score: 0.9861
----------------------------------------
Random Forest
  Accuracy: 0.9474
  Precision: 0.9583
  Recall: 0.9583
  F1-Score: 0.9583
----------------------------------------
SVM
  Accuracy: 0.9825
  Precision: 0.9861
  Recall: 0.9861
  F1-Score: 0.9861
----------------------------------------
KNN
  Accuracy: 0.9561
  Precision: 0.9589
  Recall: 0.9722
  F1-Score: 0.9655
----------------------------------------
Decision Tree
  Accuracy: 0.9123
  Precision: 0.9429
  Recall: 0.9167
  F1-Score: 0.9296
----------------------------------------
Naive Bayes
  Accuracy: 0.9298
  Precision: 0.9444
  Recall: 0.9444
  F1-Score: 0.9444
----------------------------------------


In [7]:
# Collect the per-model scores into a table.
import pandas as pd

# One row per model, one column per metric.
results_df = pd.DataFrame(results)

print("\n=== Model Comparison ===")
# Show the table ordered from the highest to the lowest F1-Score.
ranked_results = results_df.sort_values(by='F1-Score', ascending=False)
print(ranked_results)



=== Model Comparison ===
                 Model  Accuracy  Precision    Recall  F1-Score
0  Logistic Regression  0.982456   0.986111  0.986111  0.986111
2                  SVM  0.982456   0.986111  0.986111  0.986111
3                  KNN  0.956140   0.958904  0.972222  0.965517
1        Random Forest  0.947368   0.958333  0.958333  0.958333
5          Naive Bayes  0.929825   0.944444  0.944444  0.944444
4        Decision Tree  0.912281   0.942857  0.916667  0.929577


In [8]:
# Hyperparameter tuning for the Random Forest with an exhaustive grid search

from sklearn.model_selection import GridSearchCV


# Candidate values for the forest: 3 x 3 x 2 = 18 combinations in total.
rf_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 3, 5],
    min_samples_split=[2, 5],
)

# Evaluate every combination with 3-fold cross-validation on the scaled
# training split, ranking candidates by F1. n_jobs=-1 runs the fits in
# parallel; the seeded forest keeps the search repeatable.
rf_grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    rf_param_grid,
    scoring='f1',
    cv=3,
    n_jobs=-1,
).fit(X_train_scaled, y_train)

print("\nBest parameters for Random Forest:")
print(rf_grid.best_params_)



Best parameters for Random Forest:
{'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}


In [9]:

from sklearn.metrics import classification_report

# Take the best estimator found by the grid search and predict the scaled
# test split with it.
rf_best = rf_grid.best_estimator_
y_pred_rf = rf_best.predict(X_test_scaled)

# Per-class precision/recall/F1, labelled with the dataset's class names.
print("\nClassification Report for Tuned Random Forest:")
rf_report = classification_report(
    y_test,
    y_pred_rf,
    target_names=data.target_names,
)
print(rf_report)



Classification Report for Tuned Random Forest:
              precision    recall  f1-score   support

   malignant       0.93      0.93      0.93        42
      benign       0.96      0.96      0.96        72

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114



In [10]:
# Hyperparameter tuning for the SVM with a randomized search

from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Search space: ten log-spaced values of C between 1e-3 and 1e3, three
# kernels and two gamma settings.
svm_param_dist = dict(
    C=np.logspace(-3, 3, 10),
    kernel=['linear', 'rbf', 'poly'],
    gamma=['scale', 'auto'],
)

# Sample 10 combinations from the space above and score each one with 3-fold
# cross-validation on F1; the fixed seed makes the sampling repeatable.
svm_random = RandomizedSearchCV(
    SVC(),
    svm_param_dist,
    n_iter=10,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    random_state=42,
)
svm_random.fit(X_train_scaled, y_train)

print("\nBest parameters for SVM:")
print(svm_random.best_params_)



Best parameters for SVM:
{'kernel': 'linear', 'gamma': 'auto', 'C': np.float64(2.154434690031882)}


In [11]:
# Evaluate the best SVM found by the randomized search on the scaled test
# split.
svm_best = svm_random.best_estimator_
y_pred_svm = svm_best.predict(X_test_scaled)

# Per-class precision/recall/F1, labelled with the dataset's class names.
print("\nClassification Report for Tuned SVM:")
svm_report = classification_report(
    y_test,
    y_pred_svm,
    target_names=data.target_names,
)
print(svm_report)



Classification Report for Tuned SVM:
              precision    recall  f1-score   support

   malignant       0.91      0.98      0.94        42
      benign       0.99      0.94      0.96        72

    accuracy                           0.96       114
   macro avg       0.95      0.96      0.95       114
weighted avg       0.96      0.96      0.96       114



In [13]:
# Compare the two tuned models side by side on the scaled test split.
# Note that the "best" model below is picked from these test-split scores.

final_models = {
    'Random Forest (Tuned)': rf_best,
    'SVM (Tuned)': svm_best
}

# Scoring functions and output columns, kept in matching order.
scorers = (accuracy_score, precision_score, recall_score, f1_score)
columns = ('Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score')

final_results = []

for name, estimator in final_models.items():
    predictions = estimator.predict(X_test_scaled)
    # Evaluate all four metrics against the same predictions.
    acc, prec, rec, f1 = (score(y_test, predictions) for score in scorers)

    print(f"\n{name}")
    print(f"  Accuracy: {acc:.4f}")
    print(f"  Precision: {prec:.4f}")
    print(f"  Recall: {rec:.4f}")
    print(f"  F1-Score: {f1:.4f}")

    # Store the unrounded values as one table row.
    final_results.append(dict(zip(columns, (name, acc, prec, rec, f1))))

final_df = pd.DataFrame(final_results)

print("\n=== Final Tuned Models Comparison ===")
# Rank once by F1-Score (highest first) and reuse that ordering both for
# the printed table and for choosing the winner.
ranked_final = final_df.sort_values(by='F1-Score', ascending=False)
print(ranked_final)
# The top row of the ranking is reported as the best model.
best_model_name = ranked_final.iloc[0]['Model']
print(f"\n The best model after tuning is: {best_model_name}")



Random Forest (Tuned)
  Accuracy: 0.9474
  Precision: 0.9583
  Recall: 0.9583
  F1-Score: 0.9583

SVM (Tuned)
  Accuracy: 0.9561
  Precision: 0.9855
  Recall: 0.9444
  F1-Score: 0.9645

=== Final Tuned Models Comparison ===
                   Model  Accuracy  Precision    Recall  F1-Score
1            SVM (Tuned)  0.956140   0.985507  0.944444  0.964539
0  Random Forest (Tuned)  0.947368   0.958333  0.958333  0.958333

 The best model after tuning is: SVM (Tuned)
