In [7]:
# Breast Cancer Prediction using KNN - Complete Pipeline
# Wisconsin Diagnostic Breast Cancer (WDBC) Dataset

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

# Use matplotlib's stock style sheet for every figure in this notebook
plt.style.use('default')

# Confirmation banner: the message followed by a 50-character '=' rule
print("Libraries imported successfully!", "=" * 50, sep="\n")

Libraries imported successfully!


# Step 1: Data Loading and Exploration

The Wisconsin Diagnostic Breast Cancer (WDBC) dataset contains features computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. The features describe characteristics of the cell nuclei present in the image.

In [8]:
# Load the WDBC dataset via scikit-learn's load_breast_cancer()
print("Loading the Wisconsin Diagnostic Breast Cancer Dataset...")
data = load_breast_cancer()

# Wrap the feature matrix in a DataFrame (one column per feature name) and
# append the integer class label as a 'target' column.
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

print(f"Dataset Shape: {df.shape}")
print(f"Number of features: {len(data.feature_names)}")
print(f"Target classes: {data.target_names}")

# Count each label once and reuse the result for both reports below,
# instead of re-scanning the column with Python's builtin sum() per class.
target_counts = df['target'].value_counts()
print("Target distribution:")
print(target_counts)

# Per-class count and share of the dataset; label i is reported under
# data.target_names[i]. A label with no rows is reported as 0.
print("\nClass distribution:")
for i, class_name in enumerate(data.target_names):
    count = target_counts.get(i, 0)
    percentage = count / len(df) * 100
    print(f"{class_name}: {count} samples ({percentage:.1f}%)")

Loading the Wisconsin Diagnostic Breast Cancer Dataset...
Dataset Shape: (569, 31)
Number of features: 30
Target classes: ['malignant' 'benign']
Target distribution:
target
1    357
0    212
Name: count, dtype: int64

Class distribution:
malignant: 212 samples (37.3%)
benign: 357 samples (62.7%)


# Step 2: Data Preprocessing

Since the data is already clean (no missing values) and all features are numeric, we'll prepare the features and target variables for the KNN model.

In [13]:
# Separate the feature matrix from the class label
print("Preparing features and target variables...")

# X: every column of df except 'target'; y: the 'target' column itself
X = df.drop(columns='target')
y = df['target']

# The notebook text states the data has no missing values; check that
# explicitly so a violated assumption fails here, with a clear message.
missing_total = int(X.isna().sum().sum()) + int(y.isna().sum())
assert missing_total == 0, f"Expected no missing values, found {missing_total}"

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
# Prints the complete column list; the trailing '...' is literal text in the
# message, not a sign that the list was truncated.
print(f"Feature names: {list(X.columns)}...")


Preparing features and target variables...
Features shape: (569, 30)
Target shape: (569,)
Feature names: ['mean radius', 'mean texture', 'mean perimeter', 'mean area', 'mean smoothness', 'mean compactness', 'mean concavity', 'mean concave points', 'mean symmetry', 'mean fractal dimension', 'radius error', 'texture error', 'perimeter error', 'area error', 'smoothness error', 'compactness error', 'concavity error', 'concave points error', 'symmetry error', 'fractal dimension error', 'worst radius', 'worst texture', 'worst perimeter', 'worst area', 'worst smoothness', 'worst compactness', 'worst concavity', 'worst concave points', 'worst symmetry', 'worst fractal dimension']...


# Step 3: Train-Test Split

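We'll split the data before any feature scaling to avoid data leakage: a scaler must be fit on the training set only, so that statistics from the test set never influence the model.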

In [10]:
# Hold out a test set from the full feature matrix and target
print("Splitting data into train and test sets...")

# 80% train / 20% test. stratify=y keeps the class proportions the same in
# both partitions; random_state=42 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")

# Report class proportions (rounded to 3 decimals) for each partition
for split_name, split_target in (("Training", y_train), ("Testing", y_test)):
    print(f"{split_name} target distribution:")
    print(split_target.value_counts(normalize=True).round(3))

Splitting data into train and test sets...
Training set size: (455, 30)
Testing set size: (114, 30)
Training target distribution:
target
1    0.626
0    0.374
Name: proportion, dtype: float64
Testing target distribution:
target
1    0.632
0    0.368
Name: proportion, dtype: float64


# Step 4: KNN Model Implementation

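Let's implement KNN with several different values of k to get an initial view of which performs best. Because KNN is distance-based, features measured on larger scales dominate the distance, so the features should be standardized (using training-set statistics only) before fitting.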

In [11]:
# YOUR CODE HERE

# Step 5: Model Evaluation

Now let's evaluate our best KNN model using various metrics to understand its performance comprehensively.

In [12]:
# YOUR CODE HERE