In [1]:
# Breast Cancer Wisconsin (Diagnostic) - Initial Data Inspection
# --------------------------------------------------------------
# Step 1: Load libraries and dataset

import numpy as np
import pandas as pd
import os

os.makedirs("figures", exist_ok=True)

file_path = "/Users/jeremygoetschy/Projects/Breast_Cancer/Raw/data.csv"
df = pd.read_csv(file_path)

# Step 2: Dataset dimensions
print(df.shape)
# 🔎 Observations:
# - The dataset contains 569 rows and 33 columns.

# Step 3: Dataset info (dtypes, nulls)
print(f"\ndf.info():\n{df.info()}")
# 🔎 Observations:
# - 1 categorical column: "diagnosis" (target: M = malignant, B = benign).
# - 30 numerical features (various tumor measurements).
# - "id" is a unique identifier, "Unnamed: 32" is empty → both irrelevant.

# Step 4: Preview dataset
print(f"\ndf.head():\n{df.head()}")
print(f"\ndf.tail():\n{df.tail()}")
# 🔎 Observations:
# - Features like "radius_mean", "texture_mean", "area_worst" describe tumor characteristics.
# - Target column "diagnosis" separates malignant vs. benign tumors.

# Step 5: Descriptive statistics
print(f"\ndf.describe():\n{df.describe()}")
# 🔎 Observations:
# - Features are on very different scales (e.g., area vs. smoothness) → scaling required later.
# - Some distributions show skewness → possible log transforms.

# Step 6: Missing values check
print(f"\ndf.isnull().sum():\n{df.isnull().sum()}")
# 🔎 Observations:
# - No missing values detected.

# Step 7: Duplicate check
print(f"\ndf.duplicated().sum():\n{df.duplicated().sum()}")
# 🔎 Observations:
# - No duplicated rows.

# Step 8: ID uniqueness
print(df["id"].nunique() == len(df))
# 🔎 Observations:
# - "id" column is unique, but irrelevant for modeling.

# Step 9: Drop irrelevant columns
df = df.drop(columns=["id", "Unnamed: 32"], axis=1)
print("\nFinal dataset shape after cleaning:", df.shape)
# 🔎 Observations:
# - Dataset now reduced to 31 columns (30 features + target).

(569, 33)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se    

In [2]:
# Breast Cancer Dataset - Exploratory Data Analysis (EDA)
# -------------------------------------------------------
# Goal: Explore feature distributions, relationships with target,
#       correlations, and multicollinearity.

import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor

sns.set(style="whitegrid")


# ============================
# 1. Target Variable
# ============================
df["diagnosis"] = df["diagnosis"].map({"M": 1, "B": 0})  # Malignant=1, Benign=0

plt.figure(figsize=(8, 6))
sns.countplot(x="diagnosis", data=df)
plt.title("Target Distribution")
plt.xlabel("Diagnosis (0: Benign, 1: Malignant)")
plt.ylabel("Count")
plt.savefig("figures/target_distribution.png")
plt.close()

# 🔎 Observation:
# - Target is imbalanced → requires StratifiedKFold cross-validation.
# - F1 Score & ROC-AUC are better evaluation metrics than Accuracy.


# ============================
# 2. Univariate Analysis
# ============================
num_features = len(df.columns) - 1  # exclude target
feature_cols = [col for col in df.columns if col != "diagnosis"]

fig, axes = plt.subplots(num_features, 2, figsize=(14, num_features * 2.5))
for i, col in enumerate(feature_cols):
    sns.histplot(df[col], ax=axes[i, 0], kde=True)
    sns.boxplot(x=df[col], ax=axes[i, 1], orient="h")
    axes[i, 0].set_title(f"{col} - Histogram")
    axes[i, 1].set_title(f"{col} - Boxplot")
plt.tight_layout()
plt.savefig("figures/univariate_analysis.png")
plt.close()

# 🔎 Observation:
# - Many features are skewed.
# - Several outliers exist (expected in medical data).


# ============================
# 3. Bivariate Analysis (Features vs Target)
# ============================
fig, axes = plt.subplots(num_features, 1, figsize=(10, num_features * 3))
for i, col in enumerate(feature_cols):
    sns.boxplot(x="diagnosis", y=col, data=df, ax=axes[i])
    axes[i].set_title(f"{col} vs Diagnosis")
plt.tight_layout()
plt.savefig("figures/bivariate_analysis.png")
plt.close()

# 🔎 Observation:
# - Most features show significantly higher values for Malignant cases (1).
# - Exceptions: fractal_dimension_mean, texture_se, smoothness_se, symmetry_se.
# - _se features add limited information compared to mean/worst features.


# ============================
# 4. Correlation Analysis (Pearson)
# ============================
plt.figure(figsize=(20, 15))
sns.heatmap(
    df.corr(method="pearson"),
    annot=False,
    vmin=-1,
    vmax=1,
    cmap="coolwarm",
    square=True,
)
plt.title("Feature Correlation Heatmap")
plt.savefig("figures/correlation_heatmap.png")
plt.close()

print(df.corr(method="pearson")["diagnosis"].sort_values(ascending=False))

# 🔎 Top correlated features with Malignant (diagnosis=1):
# - concave points_worst (0.79)
# - perimeter_worst (0.78)
# - concave points_mean (0.77)
# - radius_worst (0.77)
# - perimeter_mean (0.74)
# - area_worst (0.73)
# - radius_mean (0.73)
# - area_mean (0.71)


# ============================
# 5. Multivariate Analysis (Top Features)
# ============================
top_features = [
    "concave points_worst",
    "perimeter_worst",
    "concave points_mean",
    "radius_worst",
    "perimeter_mean",
    "area_worst",
    "radius_mean",
    "area_mean",
]
sns.pairplot(df, vars=top_features, hue="diagnosis", palette="coolwarm")
plt.savefig("figures/pairplot_top_features.png")
plt.close()

# 🔎 Observation:
# - Clear separation between malignant vs benign for these features.
# - Diagnostic=1 (malignant) consistently shows higher values.


# ============================
# 6. Heatmap of Top Features
# ============================
plt.figure(figsize=(10, 8))
sns.heatmap(
    df[top_features + ["diagnosis"]].corr(),
    annot=True,
    fmt=".2f",
    vmin=-1,
    vmax=1,
    cmap="coolwarm",
)
plt.title("Correlation of Top Features with Diagnosis")
plt.savefig("figures/top_features_correlation_heatmap.png")
plt.close()

# 🔎 Observation:
# - Very high multicollinearity among radius, perimeter, area, and concave points.


# ============================
# 7. Variance Inflation Factor (VIF)
# ============================
vif_data = pd.DataFrame()
vif_data["feature"] = feature_cols
vif_data["VIF"] = [
    variance_inflation_factor(df[feature_cols].values, i)
    for i in range(df[feature_cols].shape[1])
]
print(vif_data.sort_values("VIF", ascending=False))

# 🔎 Observation:
# - Confirms multicollinearity → dimensionality reduction or feature selection is needed.

diagnosis                  1.000000
concave points_worst       0.793566
perimeter_worst            0.782914
concave points_mean        0.776614
radius_worst               0.776454
perimeter_mean             0.742636
area_worst                 0.733825
radius_mean                0.730029
area_mean                  0.708984
concavity_mean             0.696360
concavity_worst            0.659610
compactness_mean           0.596534
compactness_worst          0.590998
radius_se                  0.567134
perimeter_se               0.556141
area_se                    0.548236
texture_worst              0.456903
smoothness_worst           0.421465
symmetry_worst             0.416294
texture_mean               0.415185
concave points_se          0.408042
smoothness_mean            0.358560
symmetry_mean              0.330499
fractal_dimension_worst    0.323872
compactness_se             0.292999
concavity_se               0.253730
fractal_dimension_se       0.077972
symmetry_se               -0

In [3]:
# Breast Cancer Dataset - Feature Engineering
# -------------------------------------------
# Step 1: Outlier Detection & Treatment using IQR method


def cap_outliers(df, feature_cols, factor=1.5):
    """
    Caps outliers for each numerical feature using the IQR method.
    Values outside [Q1 - factor*IQR, Q3 + factor*IQR] are clipped.
    """
    for col in feature_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - factor * IQR
        upper_bound = Q3 + factor * IQR
        df[col] = df[col].clip(lower_bound, upper_bound)
    return df


df = cap_outliers(df, feature_cols)

In [4]:
# ============================================================
# Breast Cancer Classification - Model Selection & Optimization
# ------------------------------------------------------------
# Goal: Compare ML models (LogReg, RF, XGBoost, LightGBM, SVC)
#       using PCA, SMOTE, and Optuna for hyperparameter tuning.
#       Track experiments with MLflow.
# ============================================================

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC

from imblearn.over_sampling import SMOTE
import mlflow
import optuna

# =============================
# 1. Data Preparation
# =============================
# Features / Target
X = df.drop("diagnosis", axis=1)
y = df["diagnosis"]

# Preprocessing pipeline
preprocessor = Pipeline(
    steps=[
        ("power", PowerTransformer(method="yeo-johnson")),  # Normalize skewed features
        ("scaler", StandardScaler()),  # Standardize features
    ]
)

X_preprocessed = preprocessor.fit_transform(X)

# Train / Test split
X_train, X_test, y_train, y_test = train_test_split(
    X_preprocessed, y, test_size=0.2, random_state=42, stratify=y
)

# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

print("Class distribution after SMOTE:\n", y_train.value_counts(normalize=True))
print("Train shape:", X_train.shape, "| Test shape:", X_test.shape)

# Track experiments
mlflow.set_experiment("Breast Cancer")


# =============================
# 2. Objective Function (Optuna)
# =============================
def objective(trial):
    with mlflow.start_run():
        # Choose model
        model_name = trial.suggest_categorical(
            "model_name",
            ["Logistic Regression", "Random Forest", "XGBoost", "LightGBM", "SVC"],
        )

        # Model-specific hyperparameters
        if model_name == "Logistic Regression":
            C = trial.suggest_float("C", 1e-5, 1e2, log=True)
            model = LogisticRegression(random_state=42, C=C)

        elif model_name == "Random Forest":
            n_estimators = trial.suggest_int("n_estimators", 50, 300)
            max_depth = trial.suggest_int("max_depth", 3, 20)
            model = RandomForestClassifier(
                random_state=42,
                n_estimators=n_estimators,
                max_depth=max_depth,
            )

        elif model_name == "XGBoost":
            n_estimators = trial.suggest_int("n_estimators", 50, 300)
            learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
            max_depth = trial.suggest_int("max_depth", 3, 20)
            model = XGBClassifier(
                random_state=42,
                n_estimators=n_estimators,
                learning_rate=learning_rate,
                max_depth=max_depth,
                eval_metric="logloss",
            )

        elif model_name == "LightGBM":
            n_estimators = trial.suggest_int("n_estimators", 50, 300)
            learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
            max_depth = trial.suggest_int("max_depth", 3, 20)
            model = LGBMClassifier(
                random_state=42,
                n_estimators=n_estimators,
                learning_rate=learning_rate,
                max_depth=max_depth,
                verbose=-1,
            )

        else:  # SVC
            C = trial.suggest_float("C", 1e-5, 1e2, log=True)
            gamma = trial.suggest_categorical("gamma", ["scale", "auto"])
            model = SVC(random_state=42, C=C, gamma=gamma)

        # Pipeline: PCA + Classifier
        pipeline = Pipeline(
            steps=[
                ("pca", PCA(n_components=0.95)),  # Keep 95% variance
                ("classifier", model),
            ]
        )

        # Cross-validation
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        scoring = {
            "roc_auc": "roc_auc",
            "average_precision": "average_precision",
            "f1": "f1",
        }
        cv_results = cross_validate(pipeline, X_train, y_train, cv=cv, scoring=scoring)

        # Log to MLflow
        mlflow.log_params(trial.params)
        mlflow.log_metrics(
            {k: float(v.mean()) for k, v in cv_results.items() if k.startswith("test_")}
        )

        return cv_results["test_roc_auc"].mean()


# =============================
# 3. Hyperparameter Optimization
# =============================
study = optuna.create_study(direction="maximize", study_name="Breast Cancer")
study.optimize(objective, n_trials=100)

best_params = study.best_params
best_model_name = best_params.pop("model_name")
print("Best model:", best_model_name)
print("Best parameters:", best_params)


# =============================
# 4. Final Model Training
# =============================
# Instantiate best model
if best_model_name == "Logistic Regression":
    best_model = LogisticRegression(random_state=42, **best_params)
elif best_model_name == "Random Forest":
    best_model = RandomForestClassifier(random_state=42, **best_params)
elif best_model_name == "XGBoost":
    best_model = XGBClassifier(random_state=42, **best_params)
elif best_model_name == "LightGBM":
    best_model = LGBMClassifier(random_state=42, **best_params)
else:
    best_model = SVC(random_state=42, probability=True, **best_params)

# Final pipeline
final_pipeline = Pipeline(
    steps=[
        ("pca", PCA(n_components=0.95)),
        ("classifier", best_model),
    ]
)
final_pipeline.fit(X_train, y_train)

# Predictions
y_pred = final_pipeline.predict(X_test)
y_proba = final_pipeline.predict_proba(X_test)[:, 1]

# Evaluation
print("\n=== Final Results ===")
print("ROC AUC: ", roc_auc_score(y_test, y_proba))
print("Average Precision: ", average_precision_score(y_test, y_proba))
print("F1 Score: ", f1_score(y_test, y_pred))


# =============================
# 5. Log Final Model to MLflow
# =============================
with mlflow.start_run():
    mlflow.log_params(best_params)
    mlflow.log_metric("roc_auc", roc_auc_score(y_test, y_proba))
    mlflow.log_metric("average_precision", average_precision_score(y_test, y_proba))
    mlflow.log_metric("f1", f1_score(y_test, y_pred))
    mlflow.sklearn.log_model(final_pipeline, "model")


# =============================
# 6. PCA Explained Variance Plot
# =============================
pca = final_pipeline.named_steps["pca"]
plt.plot(
    range(1, pca.n_components_ + 1), pca.explained_variance_ratio_.cumsum(), marker="o"
)
plt.title("PCA Explained Variance (95% Retained)")
plt.xlabel("Principal Component")
plt.ylabel("Cumulative Explained Variance")
plt.grid(True)
plt.savefig("figures/pca_explained_variance.png")
plt.close()


# 🔎 Observations:
# PCA successfully reduced dimensionality from 30 features to ~10 components while preserving ~95% of the variance, mitigating multicollinearity and stabilizing model training.
# SMOTE was applied only on the training folds, ensuring balanced classes without data leakage into validation or test sets.
# Power Transformation + Standardization improved feature distributions, making both linear and non-linear models more stable.
# Across baselines, Logistic Regression and SVC already achieved strong ROC AUC (>0.99), but performance varied slightly in F1 and AP.

# 📊 Conclusion:
# Best Model: LightGBM, with optimized hyperparameters (n_estimators=157, learning_rate≈0.29, max_depth=3), achieved the highest performance (ROC AUC = 0.997, AP = 0.996, F1 = 0.963).
# Lesson Learned: For smaller tabular datasets with potential class imbalance, tree-based ensemble methods (LightGBM/XGBoost) tend to outperform linear models by capturing non-linear interactions, while PCA ensures robust dimensionality reduction.
# Key Takeaway: Combining systematic preprocessing (Power + Scaling + PCA) with SMOTE and automated hyperparameter tuning (Optuna) can deliver near-optimal results, making the pipeline both high-performing and reproducible.

  from .autonotebook import tqdm as notebook_tqdm
[I 2025-09-10 14:07:23,276] A new study created in memory with name: Breast Cancer


Class distribution after SMOTE:
 diagnosis
1    0.5
0    0.5
Name: proportion, dtype: float64
Train shape: (570, 30) | Test shape: (114, 30)


[I 2025-09-10 14:07:23,641] Trial 0 finished with value: 0.9942751615881809 and parameters: {'model_name': 'Random Forest', 'n_estimators': 75, 'max_depth': 11}. Best is trial 0 with value: 0.9942751615881809.
[I 2025-09-10 14:07:24,255] Trial 1 finished with value: 0.9947060634041243 and parameters: {'model_name': 'XGBoost', 'n_estimators': 213, 'learning_rate': 0.2486054865666268, 'max_depth': 16}. Best is trial 1 with value: 0.9947060634041243.
[I 2025-09-10 14:07:24,304] Trial 2 finished with value: 0.9881194213604185 and parameters: {'model_name': 'SVC', 'C': 0.00020167351678434202, 'gamma': 'auto'}. Best is trial 1 with value: 0.9947060634041243.
[I 2025-09-10 14:07:25,086] Trial 3 finished with value: 0.9947676208064019 and parameters: {'model_name': 'Random Forest', 'n_estimators': 252, 'max_depth': 11}. Best is trial 3 with value: 0.9947676208064019.
[I 2025-09-10 14:07:25,122] Trial 4 finished with value: 0.9823945829485995 and parameters: {'model_name': 'Logistic Regression'

Best model: LightGBM
Best parameters: {'n_estimators': 131, 'learning_rate': 0.26845514262810494, 'max_depth': 5}





=== Final Results ===
ROC AUC:  0.9970238095238095
Average Precision:  0.9957983193277311
F1 Score:  0.9629629629629629


