# Breast Cancer Dataset - Exploratory Data Analysis (EDA)

This notebook contains detailed analysis of the breast cancer dataset.

## Contents:
1. Data Loading and Initial Inspection
2. Data Quality Analysis
3. Statistical Analysis
4. Visualizations
5. Correlation Analysis
6. Feature Distributions - Interactive Visualization
7. Statistical Tests
8. Conclusions and Recommendations

In [None]:
# Import required libraries
import json
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots
from scipy import stats

# Notebook-wide configuration
warnings.simplefilter('ignore')  # suppress all warning categories in cell output
plt.style.use('seaborn-v0_8')  # matplotlib style sheet used for every figure
sns.set_palette("husl")  # default seaborn colour cycle
plt.rcParams.update({'figure.figsize': (12, 8)})  # default figure size
pd.options.display.max_columns = None  # show every column when a frame is displayed

print("✅ Libraries loaded successfully!")

## 1. Data Loading and Initial Inspection

In [None]:
# Read the breast-cancer CSV (path is relative to the notebook's working directory)
data = pd.read_csv('../data/breast-cancer.csv')

# Report the table dimensions, then preview the first rows as the cell output
print(f"📊 Data shape: {data.shape}")
print(f"📝 Number of columns: {len(data.columns)}")
print(f"📈 Number of rows: {len(data)}")
print("\n📋 First 5 rows:")
data.head(5)

In [None]:
# Schema overview as reported by DataFrame.info()
print("📊 Data Types and Basic Information:")
print("=" * 50)
data.info()

# Numbered listing of every column name (1-based, right-aligned to two digits)
print("\n📊 Column Names:")
print("=" * 30)
for position, column_name in enumerate(data.columns, start=1):
    print(f"{position:2d}. {column_name}")

## 2. Data Quality Analysis

In [None]:
# Missing value analysis: absolute count and percentage of rows, per column
missing_values = data.isna().sum()
missing_percentage = missing_values / len(data) * 100

missing_df = pd.DataFrame({
    'Column': data.columns,
    'Missing_Values': missing_values,
    'Percentage': missing_percentage
})

# Keep only the columns that actually have gaps, worst first
has_gaps = missing_df['Missing_Values'] > 0
missing_df = missing_df.loc[has_gaps].sort_values('Missing_Values', ascending=False)

if missing_df.empty:
    print("✅ No missing values!")
else:
    print("⚠️ Missing Values:")
    print(missing_df)

# Target variable distribution: raw counts, then each class as a share of all rows
print("\n🎯 Target Variable (diagnosis) Distribution:")
print("=" * 40)
diagnosis_counts = data['diagnosis'].value_counts()
print(diagnosis_counts)
total_rows = len(data)
print(f"\nM (Malignant): {diagnosis_counts['M']} ({diagnosis_counts['M'] / total_rows * 100:.1f}%)")
print(f"B (Benign): {diagnosis_counts['B']} ({diagnosis_counts['B'] / total_rows * 100:.1f}%)")

In [None]:
# Target variable visualization
#
# value_counts() orders the classes by frequency, not in a fixed M/B order, so
# labels and colours are looked up from the class codes in diagnosis_counts.index
# instead of being hard-coded positionally (which mislabels the wedges whenever
# Benign is the larger class).
class_labels = {'M': 'Malignant', 'B': 'Benign'}
class_colors = {'M': '#ff9999', 'B': '#66b3ff'}

class_codes = list(diagnosis_counts.index)
plot_labels = [class_labels.get(code, str(code)) for code in class_codes]
plot_colors = [class_colors.get(code, '#cccccc') for code in class_codes]
# Pull out only the first (largest) wedge, whatever the number of classes is
explode = [0.05 if position == 0 else 0 for position in range(len(class_codes))]

fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Pie chart: share of each diagnosis
wedges, texts, autotexts = axes[0].pie(diagnosis_counts.values,
                                      labels=plot_labels,
                                      autopct='%1.1f%%',
                                      startangle=90,
                                      colors=plot_colors,
                                      explode=explode)
axes[0].set_title('Diagnosis Distribution (share)')

# Bar chart: absolute number of samples per diagnosis (fills the second panel,
# which was previously created but left empty)
axes[1].bar(plot_labels, diagnosis_counts.values, color=plot_colors)
axes[1].set_title('Diagnosis Distribution (count)')
axes[1].set_xlabel('Diagnosis')
axes[1].set_ylabel('Number of samples')
axes[1].grid(True, axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## 3. Statistical Analysis

In [None]:
# Collect the numeric columns, leaving out the 'id' column when it is present
numeric_columns = [
    column_name
    for column_name in data.select_dtypes(include=[np.number]).columns
    if column_name != 'id'
]

print(f"📊 Number of numeric columns: {len(numeric_columns)}")
print("\n📈 Basic Statistics:")
print("=" * 50)
# Descriptive statistics rounded to three decimals, shown as the cell output
data[numeric_columns].describe().round(3)

In [None]:
# Per-diagnosis summary statistics
print("🎯 Statistics by Diagnosis Type:")
print("=" * 40)

# Only the first 10 numeric features are summarised
sample_features = numeric_columns[:10]
grouped_stats = (
    data.groupby('diagnosis')[sample_features]
    .agg(['mean', 'median', 'std'])
    .round(3)
)
print(grouped_stats)

## 4. Visualizations

In [None]:
# Feature distributions (first 12 features), one panel per feature with the
# Malignant and Benign histograms overlaid
fig, axes = plt.subplots(3, 4, figsize=(20, 15))
axes = axes.ravel()

features_to_plot = numeric_columns[:12]

# Row masks are computed once instead of re-filtering the frame for every feature
is_malignant = data['diagnosis'] == 'M'
is_benign = data['diagnosis'] == 'B'

for idx, feature in enumerate(features_to_plot):
    # NaNs are dropped because the histogram range cannot be derived from them
    malignant_data = data.loc[is_malignant, feature].dropna()
    benign_data = data.loc[is_benign, feature].dropna()

    # Both classes share one set of bin edges; binning each class separately
    # gives different bar widths/positions and makes the overlay misleading
    pooled_values = pd.concat([malignant_data, benign_data])
    bin_edges = np.histogram_bin_edges(pooled_values, bins=30)

    axes[idx].hist(malignant_data, alpha=0.7, label='Malignant', bins=bin_edges, color='red')
    axes[idx].hist(benign_data, alpha=0.7, label='Benign', bins=bin_edges, color='blue')

    axes[idx].set_title(f'{feature}', fontsize=10)
    axes[idx].set_xlabel(feature)
    axes[idx].set_ylabel('Count')
    axes[idx].legend()
    axes[idx].grid(True, alpha=0.3)

# Hide panels that received no feature (fewer than 12 numeric columns)
for unused_ax in axes[len(features_to_plot):]:
    unused_ax.set_visible(False)

plt.suptitle('Feature Distributions - By Diagnosis Type', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Box plot comparisons: one panel per hand-picked feature, split by diagnosis
important_features = [
    'radius_mean', 'texture_mean', 'perimeter_mean',
    'area_mean', 'smoothness_mean', 'compactness_mean',
]

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.ravel()

for panel, feature in zip(axes, important_features):
    # A feature missing from the frame leaves its panel empty
    if feature not in data.columns:
        continue
    sns.boxplot(data=data, x='diagnosis', y=feature, ax=panel)
    panel.set_title(f'{feature} - Box Plot')
    panel.grid(True, alpha=0.3)

plt.suptitle('Box Plot Comparisons of Important Features', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

## 5. Correlation Analysis

In [None]:
# Convert target variable to numeric (M -> 0, B -> 1). The ranking below uses
# absolute correlations, so the direction of this encoding does not affect it.
data_numeric = data.copy()
data_numeric['target'] = data_numeric['diagnosis'].map({'M': 0, 'B': 1})

# Correlation matrix over all numeric features plus the encoded target
correlation_matrix = data_numeric[numeric_columns + ['target']].corr()

# Absolute correlation of every column with the target, strongest first.
# 'target' itself (|r| = 1) is deliberately kept in this Series: other cells
# slice it positionally (e.g. head(16) "including target").
target_corr = correlation_matrix['target'].abs().sort_values(ascending=False)

print("🎯 Features with highest correlation to target variable:")
print("=" * 60)
# Drop the target's self-correlation BEFORE taking the top 15; taking head(15)
# first and skipping 'target' afterwards would list only 14 features.
for feature, corr in target_corr.drop('target').head(15).items():
    print(f"{feature:<25}: {corr:.4f}")

In [None]:
# Correlation heatmap restricted to the first 16 entries of target_corr
top_features = target_corr.index[:16].tolist()  # including target
top_corr_matrix = correlation_matrix.loc[top_features, top_features]

# Mask the diagonal and the upper triangle so each pair is drawn only once
mask = np.triu(np.ones(top_corr_matrix.shape, dtype=bool))

plt.figure(figsize=(14, 12))
sns.heatmap(
    top_corr_matrix,
    mask=mask,
    annot=True,
    cmap='RdYlBu_r',
    center=0,
    square=True,
    fmt='.3f',
    cbar_kws={"shrink": 0.8},
)

plt.title('Correlation Matrix of Most Important Features', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# High correlation feature pairs (multicollinearity check)
CORR_THRESHOLD = 0.8  # |r| above this marks a pair as highly correlated

# Only feature-feature pairs are of interest here: the encoded 'target' column
# is removed so a strong feature-target correlation is not reported as a
# redundant feature pair.
feature_corr_matrix = correlation_matrix.drop(index='target', columns='target', errors='ignore')
feature_names = feature_corr_matrix.columns
corr_values = feature_corr_matrix.to_numpy()
n_features = len(feature_names)

# Visit each unordered pair exactly once (strict upper triangle, j > i).
# NaN correlations compare False against the threshold and are skipped.
high_corr_pairs = [
    {
        'Feature 1': feature_names[i],
        'Feature 2': feature_names[j],
        'Correlation': corr_values[i, j],
    }
    for i in range(n_features)
    for j in range(i + 1, n_features)
    if abs(corr_values[i, j]) > CORR_THRESHOLD
]

if high_corr_pairs:
    high_corr_df = pd.DataFrame(high_corr_pairs)
    # Rank by magnitude so strong negative correlations are not pushed to the end
    high_corr_df = high_corr_df.sort_values('Correlation', key=abs, ascending=False)

    print(f"⚠️ High Correlation Feature Pairs (|r| > {CORR_THRESHOLD}): {len(high_corr_df)}")
    print("=" * 70)
    print(high_corr_df.head(10))
else:
    print(f"✅ No feature pairs with correlation higher than {CORR_THRESHOLD}.")

## 6. Feature Distributions - Interactive Visualization

In [None]:
# Interactive scatter plot (Plotly)
# Entries 1 and 2 of target_corr are used as the axes; entry 0 is skipped
top_2_features = target_corr.index[1:3].tolist()  # excluding target, first 2
x_feature = top_2_features[0]
y_feature = top_2_features[1]

fig = px.scatter(
    data,
    x=x_feature,
    y=y_feature,
    color='diagnosis',
    title=f'{x_feature} vs {y_feature}',
    color_discrete_map={'M': 'red', 'B': 'blue'},
    hover_data=list(data.columns),  # every column is shown in the hover tooltip
)

fig.update_layout(width=800, height=600)
fig.show()

In [None]:
# 3D scatter plot; drawn only when target_corr holds at least four entries
# (entry 0 is skipped, entries 1-3 become the three axes)
if len(target_corr) >= 4:
    top_3_features = target_corr.index[1:4].tolist()  # excluding target, first 3
    axis_x, axis_y, axis_z = top_3_features

    fig = px.scatter_3d(
        data,
        x=axis_x,
        y=axis_y,
        z=axis_z,
        color='diagnosis',
        title=f'3D Scatter Plot: {" vs ".join(top_3_features)}',
        color_discrete_map={'M': 'red', 'B': 'blue'},
    )

    fig.update_layout(width=900, height=700)
    fig.show()

## 7. Statistical Tests

In [None]:
# Test group differences (Malignant vs Benign) with a two-sample t-test.
# `stats` (scipy.stats) is imported in the notebook's import cell.
#
# Welch's variant (equal_var=False) is used because the two diagnosis groups
# differ in size and are not guaranteed to share a variance.
# NOTE: no multiple-comparison correction is applied across the tested features.
ALPHA = 0.05  # significance level

print("📊 T-Test Results (Malignant vs Benign):")
print("=" * 50)
print(f"{'Feature':<25} {'t-statistic':<15} {'p-value':<15} {'Significant':<10}")
print("-" * 65)

significant_features = []

# Row masks are computed once instead of re-filtering the frame for every feature
is_malignant = data['diagnosis'] == 'M'
is_benign = data['diagnosis'] == 'B'

for feature in numeric_columns[:10]:  # For first 10 features
    # Missing values are dropped so a single NaN does not turn the result into NaN
    malignant_group = data.loc[is_malignant, feature].dropna()
    benign_group = data.loc[is_benign, feature].dropna()

    t_stat, p_value = stats.ttest_ind(malignant_group, benign_group, equal_var=False)

    is_significant = p_value < ALPHA
    if is_significant:
        significant_features.append(feature)

    # Scientific notation keeps very small p-values readable; a fixed '.6f'
    # format prints them all as 0.000000
    print(f"{feature:<25} {t_stat:<15.4f} {p_value:<15.3e} {'Yes' if is_significant else 'No':<10}")

print(f"\n✅ Number of features with significant difference: {len(significant_features)}")

## 8. Conclusions and Recommendations

In [None]:
# Textual recap built from the variables computed in the earlier cells
total_samples = len(data)

print("📋 EDA RESULTS AND RECOMMENDATIONS")
print("=" * 50)

print("\n📊 DATASET SUMMARY:")
print(f"• Total number of samples: {total_samples}")
print(f"• Number of features: {len(numeric_columns)}")
if missing_values.sum() > 0:
    missing_flag = 'Yes'
else:
    missing_flag = 'No'
print(f"• Missing values: {missing_flag}")

print("\n🎯 TARGET VARIABLE:")
malignant_total = diagnosis_counts['M']
print(f"• Malignant (M): {malignant_total} ({malignant_total / total_samples * 100:.1f}%)")
benign_total = diagnosis_counts['B']
print(f"• Benign (B): {benign_total} ({benign_total / total_samples * 100:.1f}%)")
# Classes count as balanced when their sizes differ by less than 10% of all rows
if abs(malignant_total - benign_total) < total_samples * 0.1:
    balance_status = 'Balanced'
else:
    balance_status = 'Imbalanced'
print(f"• Balance status: {balance_status}")

print("\n🔗 CORRELATION:")
# Position 1 of target_corr is reported; position 0 is skipped
print(f"• Highest correlated feature: {target_corr.index[1]} ({target_corr.iloc[1]:.4f})")
print(f"• Number of high correlation pairs: {len(high_corr_pairs)}")

print("\n📈 STATISTICAL SIGNIFICANCE:")
print(f"• Number of features with significant difference: {len(significant_features)}")

print("\n💡 RECOMMENDATIONS:")
for recommendation in (
    "• Use correlation analysis results for feature selection",
    "• Review highly correlated features again",
    "• Use stratified sampling if data imbalance exists",
    "• Prioritize most important features in machine learning models",
    "• Perform outlier analysis to check for anomalous values",
):
    print(recommendation)

In [None]:
# Save summary statistics to file.
# `json` and `Path` are imported in the notebook's import cell.

# The target's own |r| = 1 entry is not a feature, so it is excluded from the
# exported ranking (otherwise only 9 of the 10 entries are real features).
feature_target_corr = target_corr.drop('target', errors='ignore')

# numpy scalars are converted to native Python numbers explicitly; otherwise
# json.dump falls back to default=str and writes them as strings (e.g. "0").
summary_stats = {
    'dataset_shape': data.shape,
    'missing_values': int(missing_values.sum()),
    'target_distribution': {str(label): int(count) for label, count in diagnosis_counts.items()},
    'top_correlated_features': {
        str(name): float(value) for name, value in feature_target_corr.head(10).items()
    },
    'high_correlation_pairs': len(high_corr_pairs),
    'significant_features': list(significant_features)
}

summary_path = Path('../results/eda_summary.json')
# Create the output directory on first run instead of failing with FileNotFoundError
summary_path.parent.mkdir(parents=True, exist_ok=True)
with summary_path.open('w', encoding='utf-8') as f:
    # default=str is kept as a safety net for any remaining non-JSON value
    json.dump(summary_stats, f, indent=2, default=str)

print("✅ EDA summary saved to '../results/eda_summary.json'!")