In [None]:
from IPython.display import HTML

# Read the shared notebook stylesheet; the `with` block closes the file
# handle deterministically instead of leaving it to the garbage collector.
with open("../style.css", "r") as css_file:
    css = css_file.read()

# Last expression of the cell: rendered by the notebook as rich HTML output.
HTML(css)

# SVM on Medical Data: Breast Cancer Diagnosis

In this notebook, we apply our "First Principles" Support Vector Machine to the **Breast Cancer Wisconsin (Diagnostic) Dataset**. This is a classic binary classification dataset used to predict whether a breast mass is **Malignant** (harmful) or **Benign** (non-harmful) based on characteristics of cell nuclei present in a digitized image of a fine needle aspirate (FNA).

---

## The Challenge: Feature Scaling

Unlike the Iris dataset, this real-world data presents a common challenge for SVMs: **Vastly different scales**.

* **Mean Area** might range from 200 to 2500.
* **Mean Smoothness** might range from 0.05 to 0.16.

The Gaussian Kernel depends on the Euclidean distance $||\mathbf{x} - \mathbf{y}||^2$. If we don't normalize the data, the "Area" feature (with differences in the hundreds) will completely drown out the "Smoothness" feature (with differences in the hundredths). The SVM would essentially ignore the small features.

Therefore, we will implement **Standardization** (Z-score normalization) from scratch before training.

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import scipy.optimize as scp
import seaborn as sns
from sklearn.datasets import load_breast_cancer

# Set visual style: use seaborn's "whitegrid" theme for the figures in this notebook
sns.set_theme(style="whitegrid")

## 1. SVM Class Implementation

We use our established implementation with the Gaussian Kernel and Soft Margin support.

In [None]:
def gaussian_kernel(x1, x2, sigma=1.0):
    """Gaussian (RBF) kernel between two NumPy vectors.

    Computes exp(-||x1 - x2||^2 / (2 * sigma^2)).

    Args:
        x1: first vector (NumPy array).
        x2: second vector, same shape as ``x1``.
        sigma: kernel bandwidth; defaults to 1.0.

    Returns:
        The kernel value: 1.0 for identical inputs, decaying towards 0
        as the Euclidean distance between the inputs grows.
    """
    separation = np.linalg.norm(x1 - x2)
    squared_distance = separation ** 2
    bandwidth = 2 * (sigma ** 2)
    return np.exp(-squared_distance / bandwidth)

In [None]:
class SVM:
    """Soft-margin kernel Support Vector Machine trained on the dual problem.

    ``fit`` minimizes the dual objective

        0.5 * sum_ij alpha_i alpha_j y_i y_j K(x_i, x_j) - sum_i alpha_i

    subject to ``0 <= alpha_i <= C`` and ``sum_i alpha_i y_i = 0`` using
    SciPy's SLSQP solver.

    Attributes:
        kernel: callable ``kernel(x1, x2)`` returning a scalar similarity.
        C: upper bound on every multiplier (soft-margin penalty).
        alpha: multipliers of the retained support vectors (``None`` before ``fit``).
        support_vectors: training rows whose multiplier exceeds the tolerance.
        support_vector_labels: labels of those rows.
        b: bias term of the decision function.
    """

    # Multipliers at or below this value are treated as zero; multipliers
    # within this distance of C are treated as sitting on the upper bound.
    _TOL = 1e-4

    def __init__(self, kernel=gaussian_kernel, C=1.0):
        self.kernel = kernel
        self.C = C
        self.alpha = None
        self.support_vectors = None
        self.support_vector_labels = None
        self.b = 0

    def fit(self, X, y):
        """Solve the dual problem for ``X`` (n_samples x n_features) and labels ``y``.

        Args:
            X: 2D array of training samples, one per row.
            y: 1D array of labels; the dual formulation expects -1 / +1.

        Warns:
            RuntimeWarning: if the optimizer reports that it did not converge.
                The (possibly sub-optimal) multipliers are still used.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = X.shape[0]

        # Gram matrix of the training set.
        K = np.zeros((n_samples, n_samples))
        for i in range(n_samples):
            for j in range(n_samples):
                K[i, j] = self.kernel(X[i], X[j])

        # Loop-invariant quadratic form of the objective, built once instead
        # of on every objective evaluation. Symmetrizing leaves the value of
        # alpha^T Q alpha unchanged and makes `Q @ alpha` its exact gradient
        # even if the supplied kernel is not perfectly symmetric.
        y_float = y.astype(float)
        Q = np.outer(y_float, y_float) * K
        Q = 0.5 * (Q + Q.T)

        def objective(alpha):
            return 0.5 * (alpha @ Q @ alpha) - np.sum(alpha)

        def gradient(alpha):
            # Analytic gradient; without it SLSQP falls back to finite
            # differences, costing one objective evaluation per variable.
            return Q @ alpha - 1.0

        constraints = {
            'type': 'eq',
            'fun': lambda alpha: np.dot(alpha, y_float),
            'jac': lambda alpha: y_float,
        }
        bounds = [(0, self.C) for _ in range(n_samples)]
        initial_alpha = np.zeros(n_samples)
        result = scp.minimize(objective, initial_alpha, jac=gradient,
                              method='SLSQP', bounds=bounds,
                              constraints=constraints)
        if not result.success:
            # Surface solver failures instead of silently using the result.
            warnings.warn(
                f"SVM dual optimization did not converge: {result.message}",
                RuntimeWarning,
            )

        # Keep only the samples with a non-negligible multiplier.
        sv_indices = result.x > self._TOL
        self.support_vectors = X[sv_indices]
        self.support_vector_labels = y[sv_indices]
        self.alpha = result.x[sv_indices]
        self.compute_bias()

    def _kernel_expansion(self, x):
        """Return sum_i alpha_i * y_i * K(sv_i, x), i.e. the score without the bias."""
        return sum(
            a * label * self.kernel(sv, x)
            for a, label, sv in zip(self.alpha,
                                    self.support_vector_labels,
                                    self.support_vectors)
        )

    def compute_bias(self):
        """Set ``self.b`` from the support vectors strictly inside the box.

        For every support vector with ``alpha < C - tolerance`` the value
        ``y_i - sum_j alpha_j y_j K(x_j, x_i)`` is computed; ``b`` is the
        mean of these values. If no such vector exists, ``b`` is set to 0.
        """
        free_sv_indices = self.alpha < self.C - self._TOL
        if not np.any(free_sv_indices):
            self.b = 0
            return

        free_vecs = self.support_vectors[free_sv_indices]
        free_labels = self.support_vector_labels[free_sv_indices]
        b_values = [
            label - self._kernel_expansion(vec)
            for vec, label in zip(free_vecs, free_labels)
        ]
        self.b = np.mean(b_values)

    def decision_function(self, X):
        """Return the signed score ``sum_i alpha_i y_i K(sv_i, x) + b`` for each row of ``X``."""
        return np.array([self._kernel_expansion(x) + self.b for x in X])

    def predict(self, X):
        """Return the sign of the decision score for each row of ``X``.

        The result is -1 or +1, or 0 for a score of exactly zero.
        """
        return np.sign(self.decision_function(X))

## 2. Data Preparation

We load the dataset and select two features to allow for 2D visualization.

**Selected Features:**
1.  **Mean Radius** (Feature 0)
2.  **Mean Texture** (Feature 1)

The two classes overlap significantly in these features, making a linear boundary insufficient.

In [None]:
# Fetch the Breast Cancer Wisconsin (Diagnostic) dataset bundled with scikit-learn
data = load_breast_cancer()
X_full, y_full = data.data, data.target

# Keep columns 0 and 1 (Mean Radius, Mean Texture) so the model can be drawn in 2D
X = X_full[:, 0:2]

# Recode the targets for the SVM:
#   0 (Malignant) -> -1
#   1 (Benign)    -> +1
y = np.where(y_full != 0, 1, -1)

# Keep 200 randomly chosen samples so training stays fast
# (the kernel matrix alone grows as N^2). The seed fixes the selection.
np.random.seed(42)
indices = np.random.choice(y.shape[0], size=200, replace=False)
X, y = X[indices], y[indices]

print(f"Data Shape: {X.shape}")
print(f"Example Sample (Raw): {X[0]}")

## 3. Standardization (Normalization)

Here we implement the crucial step of scaling our data. We transform the data so that each feature has a mean of 0 and a standard deviation of 1.

$$ x_{scaled} = \frac{x - \mu}{\sigma} $$

In [None]:
# Per-feature (column-wise) mean and standard deviation, computed by hand
mean = X.mean(axis=0)
std = X.std(axis=0)

# Z-score: shift every column to mean 0 and rescale it to standard deviation 1
X_scaled = (X - mean) / std

print(f"Example Sample (Scaled): {X_scaled[0]}")

## 4. Training the SVM

We use the **Gaussian Kernel** with `sigma=0.7` and `C=2.0`.  

Because the classes overlap significantly (cancer diagnosis is rarely clear-cut on just two features), we expect a large number of support vectors.

In [None]:
# Pin the kernel bandwidth so the SVM can call the kernel with just two arguments
def rbf_wrapper(x1, x2):
    """Gaussian kernel between ``x1`` and ``x2`` with sigma fixed at 0.7."""
    return gaussian_kernel(x1, x2, 0.7)

# Train SVM
# C=2.0 allows some flexibility for misclassified points (Soft Margin).
# The value passed below now matches the documented setting (it was 1.0).
svm = SVM(kernel=rbf_wrapper, C=2.0)
svm.fit(X_scaled, y)

print(f"Training Complete. Found {len(svm.support_vectors)} support vectors out of {len(y)} samples.")

## 5. Visualizing the Decision Boundary

The plot below visualizes the diagnosis logic.
* **Red Regions**: Predicted Malignant (-1)
* **Blue Regions**: Predicted Benign (+1)
* **Circled Points**: Support Vectors (the ambiguous cases that define the boundary)

In [None]:
def plot_boundary(model, X, y):
    """Plot the samples, the support vectors and the model's decision regions.

    The area where the decision score is negative is shaded red and the area
    where it is positive is shaded blue. The solid contour is the decision
    boundary (score 0); the dashed contours are the margins (score -1 / +1).

    Args:
        model: fitted model exposing ``alpha``, ``support_vectors``,
            ``support_vector_labels``, ``kernel`` and ``b``.
        X: 2D array of samples; column 0 is drawn on the x-axis and
            column 1 on the y-axis.
        y: labels used to color the points (-1 red, +1 blue).
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    # Scatter Plot
    # Red for Malignant (-1), Blue for Benign (1)
    ax.scatter(X[:, 0], X[:, 1], c=y, cmap='coolwarm_r', s=50, edgecolors='k')

    # Highlight Support Vectors
    ax.scatter(model.support_vectors[:, 0], model.support_vectors[:, 1],
               s=150, facecolors='none', edgecolors='k', linewidth=1.5,
               label='Support Vectors')

    # Create a 40 x 40 grid covering the area the scatter plot occupies
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()
    xx = np.linspace(xlim[0], xlim[1], 40)
    yy = np.linspace(ylim[0], ylim[1], 40)
    YY, XX = np.meshgrid(yy, xx)
    xy = np.vstack([XX.ravel(), YY.ravel()]).T

    # Raw decision score (not just its sign) at every grid point, so that
    # the margin contours at -1 and +1 can be drawn as well.
    Z = np.array([
        sum(model.alpha[i] * model.support_vector_labels[i] *
            model.kernel(model.support_vectors[i], x_sample)
            for i in range(len(model.alpha))) + model.b
        for x_sample in xy
    ]).reshape(XX.shape)

    # Shade the two predicted regions. The outer levels are padded so that
    # they stay strictly increasing even when every score has the same sign.
    lowest = min(Z.min(), 0.0) - 1.0
    highest = max(Z.max(), 0.0) + 1.0
    ax.contourf(XX, YY, Z, levels=[lowest, 0.0, highest],
                colors=['#b40426', '#3b4cc0'], alpha=0.15, zorder=0)

    # Decision boundary and margins
    ax.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.5,
               linestyles=['--', '-', '--'])

    # Keep the view the scatter plot established
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    ax.set_title('Breast Cancer Diagnosis (SVM RBF Kernel)\nMean Radius vs. Mean Texture')
    ax.set_xlabel('Mean Radius (Standardized)')
    ax.set_ylabel('Mean Texture (Standardized)')
    ax.legend(loc='upper right')
    plt.show()

plot_boundary(svm, X_scaled, y)

## Conclusion

You can see that the SVM has learned a **curved decision boundary**. 

It has effectively identified a region (bottom-left) corresponding to smaller, less textured cells which are typically **Benign** (Blue), and separated them from the larger, more textured cells which are typically **Malignant** (Red).

The **Standardization** step was critical here; it puts "Radius" and "Texture" on a common, unit-variance scale, so that neither axis dominates the distance calculation and the kernel width `sigma=0.7` means the same thing along both axes.