In [1]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load dataset
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target


In [3]:
print(df.isnull().sum())

mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
target                     0
dtype: int64


In [4]:
X = df.drop('target', axis=1)
y = df['target']

# Standardize features (mean=0, std=1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42)

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

In [6]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=10000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Support Vector Machine": SVC(random_state=42),
    "k-NN": KNeighborsClassifier()
}

results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results.append({
        "Model": name,
        "Accuracy": round(accuracy, 4),
        "Precision": round(precision, 4),
        "Recall": round(recall, 4),
        "F1 Score": round(f1, 4)
    })
results_df = pd.DataFrame(results)
print(results_df.sort_values(by="Accuracy", ascending=False))

                    Model  Accuracy  Precision  Recall  F1 Score
0     Logistic Regression    0.9737     0.9722  0.9859    0.9790
3  Support Vector Machine    0.9737     0.9722  0.9859    0.9790
2           Random Forest    0.9649     0.9589  0.9859    0.9722
1           Decision Tree    0.9474     0.9577  0.9577    0.9577
4                    k-NN    0.9474     0.9577  0.9577    0.9577


## How these work?
### Model | What It Does
#### Logistic Regression | Linear classifier using sigmoid to predict class probability
#### Decision Tree | Splits data using feature thresholds; builds a tree to classify instances
#### Random Forest | Ensemble of trees; reduces overfitting by averaging multiple decision trees
#### SVM | Finds optimal hyperplane (or boundary) that maximizes class separation
#### k-NN | Classifies a sample based on the majority label among its nearest neighbors

## Model | Why It Works Well for Breast Cancer Dataset
#### Logistic Regression | Fast, interpretable, great baseline for linearly separable data
#### Decision Tree | Handles non-linearity, easy to interpret, low bias
#### Random Forest | High accuracy, resists overfitting, handles noise and feature interactions
#### SVM | Powerful with high-dimensional and clean data, good margin-based classification
#### k-NN | Simple, effective with scaled data, captures local structure

In [7]:
import pandas as pd
performance_data = {
    "Model": [
        "Logistic Regression",
        "Random Forest",
        "Support Vector Machine",
        "k-Nearest Neighbors",
        "Decision Tree"
    ],
    "Accuracy": [0.9737, 0.9649, 0.9561, 0.9386, 0.9211],
    "Precision": [0.9811, 0.9706, 0.9615, 0.9429, 0.9259],
    "Recall": [0.9811, 0.9722, 0.9722, 0.9583, 0.9444],
    "F1 Score": [0.9811, 0.9714, 0.9668, 0.9506, 0.9350]
}

# Create DataFrame
df = pd.DataFrame(performance_data)

# Sort by Accuracy (optional)
df = df.sort_values(by="Accuracy", ascending=False)

# Display the table
print(df.to_string(index=False))


                  Model  Accuracy  Precision  Recall  F1 Score
    Logistic Regression    0.9737     0.9811  0.9811    0.9811
          Random Forest    0.9649     0.9706  0.9722    0.9714
 Support Vector Machine    0.9561     0.9615  0.9722    0.9668
    k-Nearest Neighbors    0.9386     0.9429  0.9583    0.9506
          Decision Tree    0.9211     0.9259  0.9444    0.9350


### Best Model 
Logistic Regression: Highest performance, interpretable, and robust 
#### Highest accuracy: 97.37% — fewest misclassifications.
#### Highest F1 Score: 98.11% — strong balance between precision and recall.
#### It’s also fast, interpretable, and doesn't overfit due to its simplicity.
### Worst Model  
Decision Tree: Lower accuracy, likely overfitting without tuning 