In [None]:
# breast_cancer_classification.py
# 🧬 Breast Cancer Classification (UCI Wisconsin Diagnostic)
# Goal: Predict benign vs malignant from public UCI dataset.
# Falls back to sklearn's bundled dataset if the UCI URL isn't reachable.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, ConfusionMatrixDisplay

# Single seed passed as `random_state` to both the train/test split and the
# logistic-regression model in the cells below.
RANDOM_STATE = 100

In [None]:
def load_uci_or_fallback():
    """Load the Wisconsin Diagnostic Breast Cancer data, preferring the UCI CSV.

    The UCI file is downloaded and sanity-checked. If the download fails, or
    the payload does not look like the dataset (for example an error page
    served in place of the CSV), sklearn's bundled copy is used instead and a
    ``RuntimeWarning`` states why.

    Returns
    -------
    tuple[pandas.DataFrame, str]
        ``(df, source)``. ``df`` has a ``"Diagnosis"`` column holding
        ``"M"``/``"B"`` plus the feature columns; ``source`` is either
        ``"UCI CSV"`` or ``"sklearn fallback"``.

    Notes
    -----
    Feature column names depend on the source: ``feature_1`` .. ``feature_30``
    for the UCI CSV, sklearn's own feature names for the fallback.
    """
    import warnings

    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
    columns = ["ID", "Diagnosis"] + [f"feature_{i}" for i in range(1, 31)]
    try:
        df = pd.read_csv(url, header=None, names=columns)
        # Reaching the URL does not prove the payload is the dataset. Reject an
        # empty frame or any label other than M/B (missing values included) so
        # bad rows are never silently encoded as benign further down.
        if df.empty or not df["Diagnosis"].isin(["M", "B"]).all():
            raise ValueError("downloaded file does not look like wdbc.data")
        df = df.drop(columns=["ID"])
        source = "UCI CSV"
    except Exception as exc:
        # Keep the best-effort fallback, but surface the reason instead of
        # swallowing it.
        warnings.warn(
            f"UCI CSV unusable ({type(exc).__name__}: {exc}); "
            "falling back to sklearn's bundled dataset.",
            RuntimeWarning,
            stacklevel=2,
        )
        from sklearn.datasets import load_breast_cancer
        data = load_breast_cancer(as_frame=True)
        df = data.frame.copy()
        # Align schema: use a "Diagnosis" column as M/B like UCI
        df = df.rename(columns={"target": "Diagnosis"})
        # sklearn: target 0=malignant, 1=benign
        df["Diagnosis"] = df["Diagnosis"].map({0: "M", 1: "B"})
        source = "sklearn fallback"
    return df, source

In [None]:
# 1) Load the dataset and take a first look at it
df, source = load_uci_or_fallback()

# Report which source supplied the frame, then preview its first rows.
print("Data source:", source)
preview = df.head()
print(preview)

# Count how many rows carry each Diagnosis label.
diagnosis_counts = df["Diagnosis"].value_counts()
print("\nDiagnosis counts:\n", diagnosis_counts)

In [None]:
# 2) Preprocessing: build the binary target, split, then standardize features
X = df.drop(columns=["Diagnosis"])
y = df["Diagnosis"].eq("M").astype(int)   # "M" -> 1, anything else ("B") -> 0

# Hold out 20% of the rows for testing, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

# The scaler is fitted on the training rows only; the test rows are then
# transformed with that same fitted scaler.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)


In [None]:
# 3) Baseline model: logistic regression on the scaled features (no PCA)
lr = LogisticRegression(random_state=RANDOM_STATE, max_iter=5000).fit(X_train_s, y_train)

# Accuracy on the training split and on the held-out split.
train_accuracy = lr.score(X_train_s, y_train)
test_accuracy = lr.score(X_test_s, y_test)
print("\n=== Logistic Regression (No PCA) ===")
print("Train accuracy:", train_accuracy)
print("Test  accuracy:", test_accuracy)

# Text report of the test-split predictions, labelled with readable class names.
test_predictions = lr.predict(X_test_s)
report = classification_report(
    y_test,
    test_predictions,
    target_names=["Benign (0)", "Malignant (1)"],
)
print("\nClassification report (no PCA):\n", report)

# Confusion matrix for the same held-out split.
ConfusionMatrixDisplay.from_estimator(lr, X_test_s, y_test)
plt.title("Confusion Matrix (Logistic Regression, no PCA)")
plt.show()