# UCI Breast Cancer Dataset - Preprocessing

**Project**: Breast Cancer Detection using Machine Learning

**Dataset**: UCI Breast Cancer Wisconsin (Diagnostic) Dataset

This notebook performs the following preprocessing steps:
1. Load the dataset
2. Exploratory Data Analysis (EDA)
3. Check for missing and inconsistent values
4. Encode target labels (M=Malignant, B=Benign)
5. Split dataset into training and testing sets
6. Standardize features using StandardScaler (fit on the training set only)


## 1. Import Required Libraries


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import joblib
import os
import warnings
# NOTE(review): this silences every warning category for the whole session,
# including deprecation notices; narrow the filter if any of them matter.
warnings.filterwarnings(action='ignore')

# Display configuration: never truncate columns, cap printed rows at 100
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

print("Libraries imported successfully!")


## 2. Load the Dataset

We'll load the UCI Breast Cancer Wisconsin (Diagnostic) dataset from the wdbc.data file.


In [None]:
# Column layout (per the UCI dataset documentation): 10 base measurements,
# each reported as mean, standard error and worst value -> 30 features.
base_features = [
    'radius', 'texture', 'perimeter', 'area', 'smoothness',
    'compactness', 'concavity', 'concave points', 'symmetry', 'fractal dimension',
]

# Statistic-major ordering: every *_mean name first, then *_se, then *_worst
feature_names = [
    f"{feature}_{stat}"
    for stat in ('mean', 'se', 'worst')
    for feature in base_features
]

# Full header: ID, diagnosis label, then the 30 feature columns
column_names = ['id', 'diagnosis', *feature_names]

# header=None: the first line of the file is read as data and the column
# names are taken from `column_names` instead.
df = pd.read_csv(
    'uci_breast_cancer_dataset/wdbc.data',
    header=None,
    names=column_names,
)

print("✓ Dataset loaded successfully!")
print(f"Shape: {df.shape}")
print(f"\nColumns: {len(df.columns)}")
print("  - ID: 1")
print("  - Diagnosis: 1")
print(f"  - Features: {len(feature_names)}")
print(f"\nTotal samples: {len(df)}")
print("\nFirst few rows:")
df.head()


## 3. Exploratory Data Analysis (EDA)


In [None]:
# Preview the top of the frame; the bare last expression gets the rich table display
print("First 5 rows of the dataset:")
df.head(5)


In [None]:
# Display dataset info
# df.info() writes its own summary of the frame to stdout, so it is called
# as a statement rather than wrapped in print().
print("Dataset Information:")
df.info()


In [None]:
# Statistical summary
# describe() summarises numeric columns by default, so the 'diagnosis' labels
# (M/B) are not part of this table. The 'id' column presumably is, although it
# is an identifier rather than a measurement -- read its statistics accordingly.
print("Statistical Summary:")
df.describe()


In [None]:
# Class balance: absolute counts and percentage share per diagnosis label
diagnosis_counts = df['diagnosis'].value_counts()
diagnosis_share = df['diagnosis'].value_counts(normalize=True) * 100

print("Target Distribution:")
print(diagnosis_counts)
print("\nPercentage:")
print(diagnosis_share)

# Bar chart of the counts. value_counts() orders labels by frequency, so the
# two colours follow that order rather than being tied to a specific label.
fig, ax = plt.subplots(figsize=(8, 6))
diagnosis_counts.plot(kind='bar', color=['#ff6b6b', '#4ecdc4'], ax=ax)
ax.set_title('Distribution of Diagnosis', fontsize=14, fontweight='bold')
ax.set_xlabel('Diagnosis (M=Malignant, B=Benign)', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
plt.setp(ax.get_xticklabels(), rotation=0)  # keep the M/B tick labels horizontal
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()


## 4. Data Cleaning - Check for Missing and Inconsistent Values


In [None]:
# Count missing entries per column, then report the overall total
print("Missing Values Check:")
missing_values = df.isna().sum()
total_missing = missing_values.sum()
print(f"\nTotal missing values: {total_missing}")

if total_missing == 0:
    print("\n✓ No missing values found!")
else:
    # List only the columns that actually contain gaps
    print("\nColumns with missing values:")
    print(missing_values.loc[missing_values.gt(0)])


In [None]:
# Count fully identical rows (every column, including 'id', must match)
print("Duplicate Rows Check:")
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

if not duplicates:
    print("✓ No duplicate rows found!")
else:
    # `df` is rebound to the de-duplicated frame; the first occurrence of each
    # repeated row is the one that is kept.
    print(f"\nRemoving {duplicates} duplicate rows...")
    df = df.drop_duplicates()
    print(f"New shape: {df.shape}")


In [None]:
# Sanity check on the feature columns: count values below zero per column
print("Checking for negative values (which shouldn't exist in this dataset):")
negative_counts = df[feature_names].lt(0).sum()

if negative_counts.sum() == 0:
    print("✓ No negative values found!")
else:
    # Show only the offending columns together with their counts
    print("\nColumns with negative values:")
    print(negative_counts.loc[negative_counts.gt(0)])


In [None]:
# Report how many columns carry each dtype
print("Data Types:")
print(df.dtypes.value_counts())

# Verify the claim instead of printing it unconditionally: the confirmation
# line is shown only when every feature column really is float64; otherwise
# the offending columns and their dtypes are listed.
non_float_features = [
    col for col in feature_names
    if df[col].dtype != np.float64
]

if non_float_features:
    print("\n✗ Feature columns that are not float64:")
    print(df[non_float_features].dtypes)
else:
    print("\n✓ All feature columns are numeric (float64)")


## 5. Feature Correlation Analysis


In [None]:
# Restrict the heatmap to the "*_mean" feature columns so the annotated
# cells stay readable.
mean_features = list(filter(lambda col: 'mean' in col, feature_names))

# Numeric copy of the label (M=1, B=0), stored on df as 'diagnosis_numeric'
# so that the diagnosis can take part in the correlation.
df['diagnosis_numeric'] = df['diagnosis'].map({'M': 1, 'B': 0})

correlation_columns = mean_features + ['diagnosis_numeric']
correlation_matrix = df[correlation_columns].corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    ax=ax,
)
ax.set_title('Correlation Matrix - Mean Features', fontsize=14, fontweight='bold')
fig.tight_layout()
plt.show()


## 6. Encode Target Labels

Convert diagnosis labels from categorical (M/B) to numeric:
- M (Malignant) → 1
- B (Benign) → 0


In [None]:
# Fit the encoder on the raw diagnosis labels.
# LabelEncoder numbers its classes in sorted order, so the intended mapping
# (B -> 0, M -> 1) only holds when the column contains exactly 'B' and 'M'.
label_encoder = LabelEncoder()
label_encoder.fit(df['diagnosis'])

# Fail loudly on unexpected labels (stray whitespace, lowercase, extra
# classes) instead of silently producing a different integer mapping.
observed_classes = list(label_encoder.classes_)
if observed_classes != ['B', 'M']:
    raise ValueError(
        f"Unexpected diagnosis labels {observed_classes}; expected ['B', 'M'] "
        "so that B (Benign) -> 0 and M (Malignant) -> 1."
    )

# Encode the diagnosis column
# M (Malignant) = 1, B (Benign) = 0
df['diagnosis_encoded'] = label_encoder.transform(df['diagnosis'])

# Show each distinct (label, code) pair as a visual confirmation
print("Label Encoding Verification:")
print(df[['diagnosis', 'diagnosis_encoded']].drop_duplicates().sort_values('diagnosis'))
print("\n✓ Labels encoded successfully!")
print("  M (Malignant) → 1")
print("  B (Benign) → 0")


## 7. Prepare Features and Target


In [None]:
# Feature matrix: the 30 measurement columns, in `feature_names` order
X = df.loc[:, feature_names]
# Target vector: encoded diagnosis (0=Benign, 1=Malignant)
y = df.loc[:, 'diagnosis_encoded']

print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")

# Numbered listing of the feature columns, starting at 1
print(f"\nFeature names ({len(feature_names)} total):")
for position, name in enumerate(feature_names, start=1):
    print(f"{position:2d}. {name}")


## 8. Split Dataset into Training and Testing Sets


In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; random_state pins the shuffle so
# the split is reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

n_total = len(X)
print("Dataset Split:")
for label, part in (("Training", X_train), ("Testing", X_test)):
    n_rows = part.shape[0]
    print(f"{label} set size: {n_rows} samples ({n_rows/n_total*100:.1f}%)")

# Per-class counts in each partition, to confirm the stratification
for label, target in (("training", y_train), ("testing", y_test)):
    print(f"\nClass distribution in {label} set:")
    print(target.value_counts())


## 9. Standardize Features using StandardScaler

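Standardization transforms each feature to have mean 0 and standard deviation 1. The scaler is fit on the training set only and then applied to both the training and testing sets, so no information from the test set leaks into the preprocessing.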


In [None]:
# Learn the scaling statistics from the training rows only, then apply the
# same fitted transform to both partitions.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Wrap the scaled arrays back into DataFrames, restoring the feature names
# and the original row index of each partition.
X_train_scaled_df, X_test_scaled_df = (
    pd.DataFrame(values, columns=feature_names, index=source.index)
    for values, source in ((X_train_scaled, X_train), (X_test_scaled, X_test))
)

print("✓ Features standardized successfully!")
print(f"\nScaled training data shape: {X_train_scaled.shape}")
print(f"Scaled testing data shape: {X_test_scaled.shape}")


In [None]:
# Aggregate check: per-feature mean and std of the scaled training array,
# each averaged over all features.
mean_of_means = X_train_scaled.mean(axis=0).mean()
mean_of_stds = X_train_scaled.std(axis=0).mean()

print("Verification of Standardization (Training Set):")
print(f"Mean of scaled features: {mean_of_means:.6f} (should be ≈ 0)")
print(f"Std of scaled features: {mean_of_stds:.6f} (should be ≈ 1)")

# Side-by-side look at the first five features.
# describe() reports the sample standard deviation (ddof=1), while the numpy
# .std() above uses ddof=0, so the AFTER table can show a std slightly
# different from exactly 1.
print("\nStatistics for first 5 features (before and after scaling):")
for heading, frame in (("BEFORE", X_train), ("AFTER", X_train_scaled_df)):
    print(f"\n{heading} Scaling:")
    print(frame.iloc[:, :5].describe().loc[['mean', 'std']])


## 10. Visualize Feature Distributions (Before vs After Scaling)


In [None]:
# Four representative features, one column of panels each:
# top row = raw training values, bottom row = standardized values.
features_to_plot = ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean']

fig, axes = plt.subplots(2, 4, figsize=(16, 8))
fig.suptitle('Feature Distributions: Before vs After Standardization', fontsize=16, fontweight='bold')

# (grid row, data source, bar colour, title suffix)
panels = [
    (0, X_train, 'skyblue', 'Before'),
    (1, X_train_scaled_df, 'lightcoral', 'After'),
]

for idx, feature in enumerate(features_to_plot):
    for row, frame, colour, stage in panels:
        ax = axes[row, idx]
        ax.hist(frame[feature], bins=30, color=colour, edgecolor='black', alpha=0.7)
        ax.set_title(f'{feature}\n({stage})', fontsize=10)
        if row == 1:
            # Only the bottom row carries an x-axis label
            ax.set_xlabel('Value')
        ax.set_ylabel('Frequency')
        ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()


## 11. Save Preprocessed Data


In [None]:
# Make sure the output directory exists (no error if it is already there)
os.makedirs('data/processed', exist_ok=True)

# CSV artifacts, written in this order. index=False drops the original row
# index, so features and targets line up by row position only.
csv_outputs = [
    ('data/processed/X_train_scaled.csv', X_train_scaled_df),
    ('data/processed/y_train.csv', y_train),
    ('data/processed/X_test_scaled.csv', X_test_scaled_df),
    ('data/processed/y_test.csv', y_test),
]
for csv_path, table in csv_outputs:
    table.to_csv(csv_path, index=False, header=True)

# Persist the fitted scaler so the same transform can be applied later
joblib.dump(scaler, 'data/processed/scaler.pkl')

# One feature name per line, in column order, for reference
with open('data/processed/feature_names.txt', 'w') as f:
    f.writelines(f"{feature}\n" for feature in feature_names)

saved_paths = [csv_path for csv_path, _ in csv_outputs]
saved_paths += ['data/processed/scaler.pkl', 'data/processed/feature_names.txt']

print("✓ Preprocessed data saved successfully!")
print("\nSaved files:")
for saved_path in saved_paths:
    print(f"  - {saved_path}")


## 12. Summary of Preprocessing Steps


In [None]:
# Final recap of the preprocessing run.
# The cleaning figures and split percentages are computed from the objects
# currently in memory instead of being hard-coded, so the summary cannot
# contradict the data. The cleaning counts describe the current state of `df`
# (restricted to the originally loaded columns).
raw_columns = df[column_names]
n_missing = int(raw_columns.isnull().sum().sum())
n_duplicates = int(raw_columns.duplicated().sum())
n_negative = int((df[feature_names] < 0).sum().sum())

# Actual share of rows in each partition, relative to the full feature matrix
train_pct = len(X_train) / len(X) * 100
test_pct = len(X_test) / len(X) * 100

print("="*60)
print("PREPROCESSING SUMMARY")
print("="*60)
print("\n1. Dataset Loaded:")
print("   - Source: uci_breast_cancer_dataset/wdbc.data")
print(f"   - Total samples: {len(df)}")
print(f"   - Total features: {len(feature_names)}")
print(f"   - Malignant cases: {(df['diagnosis'] == 'M').sum()}")
print(f"   - Benign cases: {(df['diagnosis'] == 'B').sum()}")

print("\n2. Data Cleaning:")
print(f"   - Missing values: {n_missing}")
print(f"   - Duplicate rows: {n_duplicates}")
print(f"   - Negative values: {n_negative}")

print("\n3. Label Encoding:")
print("   - M (Malignant) → 1")
print("   - B (Benign) → 0")

print("\n4. Train-Test Split:")
print(f"   - Training samples: {len(X_train)} ({train_pct:.0f}%)")
print(f"   - Testing samples: {len(X_test)} ({test_pct:.0f}%)")

print("\n5. Feature Standardization:")
print("   - Method: StandardScaler")
print("   - Mean ≈ 0, Std ≈ 1")

print("\n6. Data Saved:")
print("   - Location: data/processed/")
print("   - Files: CSV format + scaler pickle")

print("\n" + "="*60)
print("✓ PREPROCESSING COMPLETE - Ready for Model Training!")
print("="*60)
