
# Early Breast Cancer Prediction — Final Notebook (Clean & Concise)

**Updated:** 2025-08-08 21:05

Streamlined, high-level notebook using a 10% dataset sample.  
Run on the full dataset later for final results.

**Sections**
1. Configuration
2. Load & Inspect
3. Missing Values
4. Exploratory Charts
5. Train/Test Split
6. Preprocessing
7. Class Imbalance (SMOTE)
8. Baseline Models + Cross‑Validation
9. Test‑Set Evaluation
10. (Optional) Hyperparameter Tuning


## 1. Configuration

In [None]:

# Path of the CSV read in Section 2; currently points at the 10% sample.
DATA_PATH = "/mnt/data/sample_10percent.csv"   # change to full dataset later
# Name of the label column; Section 2 checks that it exists in the loaded frame.
TARGET_COL = "cancer"                          # set your label column
# Fraction of rows held out as the test set in Section 5.
TEST_SIZE = 0.2
# Number of stratified folds used for cross-validation (Sections 8 and 10).
CV_FOLDS = 5
# Seed passed to the split, the CV folds, SMOTE and the models for reproducibility.
RANDOM_STATE = 42
# Passed as n_jobs to the estimators and CV helpers; -1 requests all available cores.
N_JOBS = -1

import warnings; warnings.filterwarnings("ignore")
import numpy as np, pandas as pd, matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
                             roc_curve, precision_recall_curve, average_precision_score, ConfusionMatrixDisplay)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# xgboost is optional: Section 8 only adds the XGB pipeline when this flag is True.
try:
    from xgboost import XGBClassifier
    XGB_AVAILABLE = True
except Exception:
    # Covers a missing package as well as a broken native library, but unlike
    # a bare ``except`` it lets KeyboardInterrupt / SystemExit propagate.
    XGB_AVAILABLE = False

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Widen pandas' rendering limits so wide frames are shown without truncation.
pd.set_option(
    "display.max_columns", 120,
    "display.width", 160,
)
print("Configuration ready.")

## 2. Load & Inspect

In [None]:

# Read the dataset and show its size plus the first and last few rows.
df = pd.read_csv(DATA_PATH)
print("Shape:", df.shape)
display(df.head(5))
display(df.tail(3))

# Column dtypes: df.info() prints its own report; the one-row frame lists the
# dtypes side by side.
_ = df.info()
display(pd.DataFrame(df.dtypes, columns=["dtype"]).T)

# describe() raises ValueError when no column matches ``include`` (for example
# an all-numeric file has no object columns), so summarise only kinds present.
for dtype_kind in ("number", "object"):
    if df.select_dtypes(include=dtype_kind).shape[1]:
        display(df.describe(include=dtype_kind).T.head(20))

# Explicit raise rather than ``assert`` so the check also runs under ``python -O``.
if TARGET_COL not in df.columns:
    raise ValueError(f"TARGET_COL '{TARGET_COL}' not found. Please set it in Section 1.")
print("Target column:", TARGET_COL)

## 3. Missing Values

In [None]:

# NaN count per column, columns with the most gaps first.
missing = df.isna().sum().sort_values(ascending=False)
# The same counts expressed as a percentage of all rows.
missing_pct = (missing / len(df) * 100).round(2)
missing_tbl = missing.to_frame("missing_count").assign(missing_pct=missing_pct)
display(missing_tbl.head(30))

# Dataset-wide totals derived from the per-column counts.
print("Total NaNs:", int(missing.sum()))
print("Any fully empty columns?:", bool(missing.eq(len(df)).any()))

## 4. Exploratory Charts

In [None]:

NUM_SHOW = 6   # number of numeric columns to histogram
CAT_SHOW = 6   # number of non-numeric columns to bar-chart

num_cols = list(df.select_dtypes(include=[np.number]).columns)
cat_cols = list(df.select_dtypes(exclude=[np.number]).columns)

# Histogram for each of the first NUM_SHOW numeric columns.
for numeric_name in num_cols[:NUM_SHOW]:
    plt.figure()
    df[numeric_name].hist(bins=40)
    plt.title(f"Distribution: {numeric_name}")
    plt.xlabel(numeric_name)
    plt.ylabel("count")
    plt.show()

# Bar chart of the 15 most frequent values for each of the first CAT_SHOW
# non-numeric columns; values are cast to str first.
for category_name in cat_cols[:CAT_SHOW]:
    plt.figure()
    top_values = df[category_name].astype(str).value_counts().head(15)
    top_values.plot(kind='bar')
    plt.title(f"Top categories: {category_name}")
    plt.xlabel(category_name)
    plt.ylabel("count")
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

# Correlation heatmap over at most the first 12 numeric columns; skipped when
# fewer than two numeric columns exist.
subset = num_cols[:12]
if len(subset) >= 2:
    corr = df[subset].corr()
    tick_positions = range(len(subset))
    plt.figure(figsize=(7,5))
    im = plt.imshow(corr, aspect='auto')
    plt.colorbar(im)
    plt.title("Correlation heatmap (subset)")
    plt.xticks(tick_positions, subset, rotation=90)
    plt.yticks(tick_positions, subset)
    plt.tight_layout()
    plt.show()

## 5. Train/Test Split

In [None]:

# Identifier columns to exclude from the features (none by default).
ID_COLUMNS = []

# Features are every column except the target and the listed identifiers;
# errors='ignore' makes identifiers that are absent from the frame harmless.
features = df.drop(columns=[TARGET_COL, *ID_COLUMNS], errors='ignore')
has_label = df[TARGET_COL].notna()
n_unlabelled = int((~has_label).sum())

print("NaNs in X:", int(features.isna().sum().sum()), "| NaNs in y:", n_unlabelled)

# Rows without a label cannot be stratified on or used for fitting, so they
# are removed here instead of failing later inside the split or the models.
if n_unlabelled:
    print(f"Dropping {n_unlabelled} row(s) with a missing '{TARGET_COL}' label before splitting.")
X = features.loc[has_label]
y = df.loc[has_label, TARGET_COL]

# Stratified hold-out split so both parts keep the original class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)
print("Train:", X_train.shape, "| Test:", X_test.shape)

print("Class balance (train) %:")
display((y_train.value_counts(normalize=True)*100).round(2))

## 6. Preprocessing

In [None]:

# Column lists are recomputed from the training features so the target is
# no longer among them.
num_cols = list(X_train.select_dtypes(include=[np.number]).columns)
cat_cols = list(X_train.select_dtypes(exclude=[np.number]).columns)

# Numeric columns: fill gaps with the median, then standardise.
numeric_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Other columns: fill gaps with the most frequent value, then one-hot encode;
# categories unseen during fit are ignored at transform time.
categorical_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_pipe, num_cols),
    ("cat", categorical_pipe, cat_cols),
])

# Sanity check only: fit on the training part, apply to both parts, and
# confirm that no NaN survives. The modelling cells refit inside pipelines.
Xt_train = preprocessor.fit_transform(X_train)
Xt_test = preprocessor.transform(X_test)
print("Shapes:", Xt_train.shape, Xt_test.shape)
train_has_nan = np.isnan(Xt_train).any()
test_has_nan = np.isnan(Xt_test).any()
print("NaNs after preprocess (train)?", train_has_nan)
print("NaNs after preprocess (test)?", test_has_nan)

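## 7. Class Imbalance (SMOTE)

SMOTE itself is applied inside the modelling pipelines of Section 8, so oversampling happens only on the training folds; this cell just reports the class balance beforehand.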

In [None]:

# Percentage share of each class among the training labels.
print("Class distribution before SMOTE (train) %:")
display(y_train.value_counts(normalize=True).mul(100).round(2))


## 8. Baseline Models + Cross‑Validation

In [None]:

# Metrics collected on every fold; each entry maps a result name to the
# scikit-learn scorer string of the same name.
scorer = {
    metric: metric
    for metric in ("accuracy", "precision", "recall", "f1", "roc_auc", "average_precision")
}

cv = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)


def _smote_pipeline(classifier):
    """Return preprocess -> SMOTE -> classifier as an imbalanced-learn pipeline."""
    return ImbPipeline([
        ("prep", preprocessor),
        ("smote", SMOTE(random_state=RANDOM_STATE)),
        ("clf", classifier),
    ])


pipe_logreg = _smote_pipeline(LogisticRegression(max_iter=1000, n_jobs=N_JOBS))
pipe_rf = _smote_pipeline(
    RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE, n_jobs=N_JOBS)
)

models = {"LogReg": pipe_logreg, "RF": pipe_rf}

# The XGBoost candidate is added only when the optional import succeeded.
if XGB_AVAILABLE:
    pipe_xgb = _smote_pipeline(
        XGBClassifier(
            n_estimators=400,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            max_depth=6,
            random_state=RANDOM_STATE,
            n_jobs=N_JOBS,
            eval_metric="logloss",
        )
    )
    models["XGB"] = pipe_xgb

# Cross-validate every candidate on the training split and keep the mean of
# each returned array (timings as well as test_<metric> scores).
cv_results = {}
for model_name, model_pipe in models.items():
    fold_scores = cross_validate(
        model_pipe, X_train, y_train, cv=cv, scoring=scorer, n_jobs=N_JOBS
    )
    cv_results[model_name] = {
        key: float(np.mean(values)) for key, values in fold_scores.items()
    }
    print(model_name, "→", cv_results[model_name])

# One row per model, best mean ROC-AUC first.
cv_df = pd.DataFrame(cv_results).transpose()
cv_df = cv_df.sort_values("test_roc_auc", ascending=False)
display(cv_df)

best_name = cv_df.index[0]
print("Best model by CV ROC‑AUC:", best_name)

## 9. Test‑Set Evaluation

In [None]:

# Refit the CV winner on the whole training split, then score the held-out test split.
best_pipe = models[best_name]
best_pipe.fit(X_train, y_train)

# Column 1 of predict_proba is used as the positive-class score.
proba = best_pipe.predict_proba(X_test)[:,1]
# Hard predictions at a 0.5 threshold.
# NOTE(review): this assumes the labels are encoded as 0/1 — confirm for the dataset.
pred = (proba >= 0.5).astype(int)

print("Accuracy:", accuracy_score(y_test, pred))
print("Precision:", precision_score(y_test, pred, zero_division=0))
print("Recall:", recall_score(y_test, pred, zero_division=0))
print("F1:", f1_score(y_test, pred, zero_division=0))
print("ROC‑AUC:", roc_auc_score(y_test, proba))
print("PR‑AUC:", average_precision_score(y_test, proba))

# from_predictions creates its own figure, so no plt.figure() beforehand
# (that would leave an empty extra figure); title the display's own axes.
cm_display = ConfusionMatrixDisplay.from_predictions(y_test, pred)
cm_display.ax_.set_title(f"Confusion Matrix — {best_name}")
plt.show()

# ROC curve with the chance diagonal for reference.
fpr, tpr, _ = roc_curve(y_test, proba)
plt.figure(); plt.plot(fpr, tpr, label="ROC"); plt.plot([0,1],[0,1],'--')
plt.xlabel("FPR"); plt.ylabel("TPR"); plt.title(f"ROC — {best_name}"); plt.legend(); plt.show()

# Precision-recall curve.
prec, rec, _ = precision_recall_curve(y_test, proba)
plt.figure(); plt.plot(rec, prec, label="PR")
plt.xlabel("Recall"); plt.ylabel("Precision"); plt.title(f"PR — {best_name}"); plt.legend(); plt.show()

## 10. (Optional) Hyperparameter Tuning

In [None]:

# Set to True to run a randomized search over the model that won in Section 8.
RUN_TUNING = False

if RUN_TUNING:
    import scipy.stats as st

    # Search space per model name; keys address the "clf" step of the pipeline.
    search_spaces = {
        "RF": {
            "clf__n_estimators": st.randint(300, 900),
            "clf__max_depth": st.randint(3, 20),
            "clf__min_samples_split": st.randint(2, 20),
            "clf__min_samples_leaf": st.randint(1, 20),
            "clf__max_features": ["sqrt", "log2", None],
        },
        "LogReg": {
            "clf__C": st.loguniform(1e-3, 1e2),
            "clf__solver": ["lbfgs", "liblinear"],
            "clf__penalty": ["l2"],
        },
    }
    # The XGBoost space is offered only when the optional import succeeded.
    if XGB_AVAILABLE:
        search_spaces["XGB"] = {
            "clf__n_estimators": st.randint(300, 900),
            "clf__max_depth": st.randint(3, 12),
            "clf__learning_rate": st.uniform(0.01, 0.2),
            "clf__subsample": st.uniform(0.6, 0.4),
            "clf__colsample_bytree": st.uniform(0.6, 0.4),
            "clf__gamma": st.uniform(0.0, 5.0),
            "clf__reg_alpha": st.uniform(0.0, 1.0),
            "clf__reg_lambda": st.uniform(0.5, 1.5),
        }

    # None when the winning model has no search space defined.
    dist = search_spaces.get(best_name)

    if dist:
        tuning_cv = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)
        tuner = RandomizedSearchCV(
            models[best_name],
            param_distributions=dist,
            n_iter=25,
            scoring="roc_auc",
            cv=tuning_cv,
            random_state=RANDOM_STATE,
            n_jobs=N_JOBS,
            verbose=1,
        )
        tuner.fit(X_train, y_train)
        print("Best params:", tuner.best_params_)
        print("Best CV ROC‑AUC:", tuner.best_score_)