# Breast Cancer Prediction System - Model Development

**DISCLAIMER:** This system is strictly for educational purposes and must not be used as a medical diagnostic tool.

This notebook builds a machine learning model to predict breast cancer diagnosis (Benign/Malignant) using the Wisconsin Breast Cancer Dataset.

## Step 1: Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import joblib
import warnings
warnings.filterwarnings('ignore')

print("All libraries imported successfully!")

## Step 2: Load the Dataset

In [None]:
# Load Breast Cancer Wisconsin (Diagnostic) dataset
data = load_breast_cancer()


def _to_suffix_style(name):
    """Convert a scikit-learn feature name to ``<measure>_<stat>`` form.

    scikit-learn labels the columns like 'mean radius', 'radius error' and
    'worst radius', while the feature list used later in this notebook
    expects names such as 'radius_mean'. The statistic is moved to the end
    and spaces become underscores; 'error' is shortened to 'se'
    (NOTE(review): 'se' is assumed to match the CSV release of the dataset;
    confirm before relying on those columns).
    """
    if name.startswith('mean '):
        measure, stat = name[len('mean '):], 'mean'
    elif name.startswith('worst '):
        measure, stat = name[len('worst '):], 'worst'
    elif name.endswith(' error'):
        measure, stat = name[:-len(' error')], 'se'
    else:
        return name.replace(' ', '_')
    return f"{measure.replace(' ', '_')}_{stat}"


df = pd.DataFrame(
    data.data,
    columns=[_to_suffix_style(name) for name in data.feature_names],
)

# scikit-learn ships this dataset with 0 = malignant and 1 = benign.
# Re-encode it so that the positive class (1) is Malignant, which is what the
# labels, metrics and confusion-matrix captions in this notebook assume.
malignant_code = list(data.target_names).index('malignant')
df['diagnosis'] = (data.target == malignant_code).astype(int)  # 1 = Malignant, 0 = Benign

print("Dataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nDataset info:")
df.info()  # info() prints its report itself and returns None
print("\nDiagnosis distribution:")
print(df['diagnosis'].value_counts())

## Step 3: Check and Handle Missing Values

In [None]:
# Check for missing values (count once, reuse for every report below)
missing_per_column = df.isnull().sum()
total_missing = missing_per_column.sum()

print("Missing values:")
print(missing_per_column)
print("\nTotal missing values:", total_missing)

if total_missing == 0:
    print("✓ No missing values found!")
else:
    print("Handling missing values...")
    # A missing label cannot be imputed: filling 'diagnosis' with its mean
    # would create fractional class labels, so such rows are dropped instead.
    df = df.dropna(subset=['diagnosis'])
    # Passing a Series of per-column means fills only the columns it names,
    # which leaves the 'diagnosis' column untouched.
    # NOTE: the means are taken over all rows, i.e. before the train/test split.
    feature_means = df.drop(columns='diagnosis').mean()
    df = df.fillna(feature_means)

## Step 4: Select 5 Input Features

In [None]:
# The five measurements used as model inputs (all taken from the allowed list)
selected_features = [
    'radius_mean', 'texture_mean', 'perimeter_mean',
    'area_mean', 'smoothness_mean',
]

# Numbered listing of the chosen inputs, one per line
print("Selected Features:")
print("\n".join(
    f"{rank}. {name}" for rank, name in enumerate(selected_features, start=1)
))

# Feature matrix (selected columns only) and target vector
X, y = df[selected_features], df['diagnosis']

print(f"\nInput features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print("\nFeatures statistics:")
print(X.describe())

## Step 5: Encode Target Variable (Benign/Malignant)

In [None]:
# Human-readable name for each numeric class code.
# NOTE(review): this is only correct if `y` really encodes malignant as 1;
# scikit-learn's raw `target` uses the opposite convention (0 = malignant),
# so confirm that the loading step converts it.
target_mapping = {0: 'Benign', 1: 'Malignant'}

print("Target Variable Encoding:")
print("0 = Benign (absence of cancer)")
print("1 = Malignant (presence of cancer)")

# Number of samples per class, reported in the order of the mapping above
print("\nTarget distribution:")
for code, label in target_mapping.items():
    print(f"{label} ({code}): {(y == code).sum()} samples")

## Step 6: Split the Data and Apply Feature Scaling (Mandatory)

In [None]:
# Hold out 20% of the rows for testing; stratify on y and fix the seed so the
# split is reproducible.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

for subset_name, subset in (("Training", X_train), ("Testing", X_test)):
    print(f"{subset_name} set size: {subset.shape[0]}")

# Standardize the features: the scaler is fitted on the training rows only and
# then applied unchanged to the test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\n✓ Feature scaling completed using StandardScaler")

# Overall mean / standard deviation across all scaled values, as a sanity check
train_mean, train_std = X_train_scaled.mean(), X_train_scaled.std()
test_mean, test_std = X_test_scaled.mean(), X_test_scaled.std()
print(f"\nScaled training data - Mean: {train_mean:.4f}, Std: {train_std:.4f}")
print(f"Scaled testing data - Mean: {test_mean:.4f}, Std: {test_std:.4f}")

## Step 7: Train Machine Learning Model (Logistic Regression)

In [None]:
# Fit a Logistic Regression classifier on the scaled training data
print("Training Logistic Regression Model...\n")

# Hyper-parameters kept together so they are easy to review and change
model_settings = {
    'solver': 'lbfgs',
    'max_iter': 1000,
    'random_state': 42,
}
model = LogisticRegression(**model_settings)
model.fit(X_train_scaled, y_train)

print("✓ Model training completed!")
print("\nModel Algorithm: Logistic Regression")
print("Model Parameters:")

# Binary problem: a single intercept and a single row of coefficients
intercept = model.intercept_[0]
coefficients = model.coef_[0]
print(f"  - Intercept: {intercept:.4f}")
print(f"  - Coefficients: {coefficients}")

## Step 8: Evaluate Model Performance

In [None]:
def _compute_scores(y_true, y_pred):
    """Return (accuracy, precision, recall, f1) for one set of predictions."""
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        f1_score(y_true, y_pred),
    )


def _print_section(title):
    """Print a section title followed by a 60-character rule."""
    print(f"\n{title}")
    print("-" * 60)


def _print_scores(accuracy, precision, recall, f1):
    """Print the four scores in the notebook's fixed-width layout."""
    print(f"Accuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1-Score:  {f1:.4f}")


# Predict on both splits so training and testing scores can be compared
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)

banner = "=" * 60
print(banner)
print("MODEL EVALUATION METRICS")
print(banner)

_print_section("TRAINING SET PERFORMANCE:")
train_accuracy, train_precision, train_recall, train_f1 = _compute_scores(
    y_train, y_train_pred
)
_print_scores(train_accuracy, train_precision, train_recall, train_f1)

_print_section("TESTING SET PERFORMANCE:")
test_accuracy, test_precision, test_recall, test_f1 = _compute_scores(
    y_test, y_test_pred
)
_print_scores(test_accuracy, test_precision, test_recall, test_f1)

# Rows are the actual classes and columns the predicted classes
_print_section("CONFUSION MATRIX (Test Set):")
cm = confusion_matrix(y_test, y_test_pred)
print("\n[[True Negative  False Positive]")
print(" [False Negative True Positive ]]")
print(f"\n{cm}")

_print_section("CLASSIFICATION REPORT (Test Set):")
class_names = ['Benign', 'Malignant']
print(classification_report(y_test, y_test_pred, target_names=class_names))

## Step 9: Save Model Using Joblib

In [None]:
import os

# Create the model directory if it doesn't exist. exist_ok=True replaces the
# separate "check, then create" step, so there is no window in which the
# directory can appear between the check and the creation.
model_dir = 'model'
os.makedirs(model_dir, exist_ok=True)

# Save the trained classifier
model_path = os.path.join(model_dir, 'breast_cancer_model.pkl')
joblib.dump(model, model_path)

print(f"✓ Model saved successfully at: {model_path}")
print(f"Model file size: {os.path.getsize(model_path) / 1024:.2f} KB")

# Also save the fitted scaler: new inputs must be scaled with it before they
# are passed to the saved model.
scaler_path = os.path.join(model_dir, 'scaler.pkl')
joblib.dump(scaler, scaler_path)
print(f"✓ Scaler saved successfully at: {scaler_path}")

## Step 10: Reload Model and Scaler (Without Retraining)

In [None]:
# Read both persisted artifacts back from disk; nothing is refitted here.
# joblib files are pickle-based, so only load files you created yourself.
loaded_model, loaded_scaler = (
    joblib.load(artifact_path) for artifact_path in (model_path, scaler_path)
)

print("✓ Model and scaler loaded successfully!")

# Show the class of each restored object as a quick sanity check
model_kind = type(loaded_model).__name__
scaler_kind = type(loaded_scaler).__name__
print(f"\nLoaded Model Type: {model_kind}")
print(f"Loaded Scaler Type: {scaler_kind}")

## Step 11: Make Predictions Using Reloaded Model

In [None]:
# Test predictions with the reloaded model
print("="*60)
print("TESTING RELOADED MODEL WITH SAMPLE PREDICTIONS")
print("="*60)

# Scale the raw test features with the *reloaded* scaler, so that both
# persisted artifacts (scaler and model) are actually exercised instead of
# reusing arrays produced by the in-memory scaler.
X_test_reloaded = loaded_scaler.transform(X_test)

# Select some test samples (positions within the test split)
sample_indices = [0, 10, 20, 30, 40]
X_test_samples = X_test_reloaded[sample_indices]
y_test_samples = y_test.iloc[sample_indices].values

# Get class predictions and per-class probabilities from the reloaded model
y_pred_samples = loaded_model.predict(X_test_samples)
y_pred_proba = loaded_model.predict_proba(X_test_samples)

print("\nSample Predictions:")
print("-" * 60)
sample_rows = zip(y_test_samples, y_pred_samples, y_pred_proba)
for sample_number, (true_code, predicted_code, class_proba) in enumerate(sample_rows, start=1):
    actual = 'Malignant' if true_code == 1 else 'Benign'
    predicted = 'Malignant' if predicted_code == 1 else 'Benign'
    # Confidence is the probability of the more likely class
    confidence = class_proba.max() * 100
    match = "✓" if actual == predicted else "✗"
    print(f"{match} Sample {sample_number}: Actual={actual:10s} | Predicted={predicted:10s} | Confidence={confidence:.2f}%")

# Validate with the entire test set
y_pred_reload = loaded_model.predict(X_test_reloaded)
reload_accuracy = accuracy_score(y_test, y_pred_reload)

print("\n" + "="*60)
print(f"Reloaded Model Test Accuracy: {reload_accuracy:.4f} ({reload_accuracy*100:.2f}%)")
print("✓ Predictions successful without model retraining!")
print("="*60)

## Summary

### Project Completion Status:

✓ **Dataset Loaded**: Breast Cancer Wisconsin (Diagnostic) dataset  
✓ **Missing Values**: Checked and handled (none found)  
✓ **Features Selected**: 5 features - radius_mean, texture_mean, perimeter_mean, area_mean, smoothness_mean  
✓ **Target Encoding**: 0=Benign, 1=Malignant  
✓ **Feature Scaling**: StandardScaler applied  
✓ **Algorithm**: Logistic Regression  
✓ **Model Evaluation**: Accuracy, Precision, Recall, F1-Score calculated  
✓ **Model Saved**: breast_cancer_model.pkl (and the fitted scaler as scaler.pkl) using Joblib  
✓ **Model Reloaded**: Successfully demonstrated without retraining  

### DISCLAIMER:
**This system is strictly for educational purposes and must not be used as a medical diagnostic tool.**