
# Breast Cancer Risk Analysis using Machine Learning

This notebook demonstrates how to load and preprocess breast‑cancer risk data, train machine‑learning models to predict five‑year breast‑cancer risk using non‑invasive health variables, evaluate performance, assess fairness across subgroups, and interpret model predictions using SHAP.

Set the `data_path` variable in the first code cell to the path of your dataset (a CSV file containing the `cancer_5yr` target column and the feature columns used below). If `data_path` is left as `None`, a synthetic dataset is generated for demonstration purposes.


In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, brier_score_loss, confusion_matrix, roc_curve, precision_recall_curve
import shap
import matplotlib.pyplot as plt
import seaborn as sns

# Path to the input CSV file. Leave as None to fall back to the synthetic
# dataset produced by generate_synthetic_data().
data_path = None  # e.g., 'breast_cancer_data.csv'


In [None]:

# Synthetic data generation (same function as in the python script)

def generate_synthetic_data(n_samples=100000, seed=42, intercept=0.0):
    """Generate a synthetic breast-cancer risk dataset for demonstration.

    Categorical risk factors are drawn with fixed probabilities, ``age`` is the
    midpoint assigned to the sampled 5-year age group, and ``BMI`` is drawn from
    a normal distribution selected by the sampled BMI group. The binary outcome
    ``cancer_5yr`` is a Bernoulli draw from a logistic model over those factors.

    Parameters
    ----------
    n_samples : int
        Number of rows to generate.
    seed : int
        Seed for ``np.random.default_rng``; the same seed reproduces the same frame.
    intercept : float
        Constant added to the log-odds before the sigmoid. The default of 0.0
        reproduces the original behaviour exactly.
        NOTE(review): with no intercept the BMI term alone contributes roughly
        +2 to the log-odds, so positives appear to be the large majority of
        labels; pass a negative value for a rarer outcome. Confirm the
        intended prevalence.

    Returns
    -------
    pandas.DataFrame
        One row per sample with the feature columns and the ``cancer_5yr`` target.
    """
    rng = np.random.default_rng(seed)

    # NOTE: the order of the rng calls below is part of the function's
    # behaviour; reordering them changes the generated data for a given seed.
    age_group = rng.integers(1, 14, size=n_samples)
    # Midpoints for age groups 1..13, looked up by (group - 1).
    age_midpoints = np.array(
        [23.5, 32, 37, 42, 47, 52, 57, 62, 67, 72, 77, 82, 87], dtype=float
    )
    age = age_midpoints[age_group - 1]

    bmi_group = rng.choice([1, 2, 3, 4], size=n_samples, p=[0.35, 0.35, 0.2, 0.1])
    # One candidate BMI per sample for each group (rows = groups 1..4), then
    # pick, for every sample, the candidate matching its own group.
    bmi_candidates = np.stack([
        rng.normal(21, 2, n_samples),
        rng.normal(27, 1.5, n_samples),
        rng.normal(32, 1.5, n_samples),
        rng.normal(37, 1.5, n_samples),
    ])
    bmi = bmi_candidates[bmi_group - 1, np.arange(n_samples)]

    race_eth = rng.choice([1, 2, 3, 4, 5, 6], size=n_samples, p=[0.65, 0.15, 0.05, 0.01, 0.1, 0.04])
    first_degree_hx = rng.choice([0, 1], size=n_samples, p=[0.85, 0.15])
    age_menarche = rng.choice([0, 1, 2], size=n_samples, p=[0.1, 0.6, 0.3])
    age_first_birth = rng.choice([0, 1, 2, 3, 4], size=n_samples, p=[0.2, 0.25, 0.3, 0.15, 0.1])
    BIRADS_breast_density = rng.choice([1, 2, 3, 4], size=n_samples, p=[0.2, 0.4, 0.25, 0.15])
    current_hrt = rng.choice([0, 1], size=n_samples, p=[0.7, 0.3])
    menopaus = rng.choice([1, 2, 3], size=n_samples, p=[0.5, 0.4, 0.1])
    biophx = rng.choice([0, 1], size=n_samples, p=[0.9, 0.1])
    breast_cancer_history = rng.choice([0, 1], size=n_samples, p=[0.95, 0.05])
    year = rng.integers(2005, 2018, size=n_samples)

    # Logistic model for the outcome; race_eth, menopaus and year do not
    # enter the log-odds.
    log_odds = intercept + (
        (age - 50) * 0.05 + bmi * 0.08 + first_degree_hx * 1.2 +
        (age_menarche == 2) * 0.3 +
        (age_first_birth == 3) * 0.5 +
        (age_first_birth == 4) * 0.6 +
        (BIRADS_breast_density == 4) * 0.7 +
        (BIRADS_breast_density == 3) * 0.4 +
        current_hrt * 0.5 +
        biophx * 0.4 +
        breast_cancer_history * 2.0
    )
    probs = 1 / (1 + np.exp(-log_odds))
    cancer_5yr = rng.binomial(1, probs)

    return pd.DataFrame({
        'year': year,
        'age_group_5_years': age_group,
        'race_eth': race_eth,
        'first_degree_hx': first_degree_hx,
        'age_menarche': age_menarche,
        'age_first_birth': age_first_birth,
        'BIRADS_breast_density': BIRADS_breast_density,
        'current_hrt': current_hrt,
        'menopaus': menopaus,
        'bmi_group': bmi_group,
        'biophx': biophx,
        'breast_cancer_history': breast_cancer_history,
        'age': age,
        'BMI': bmi,
        'cancer_5yr': cancer_5yr
    })

# Read the user-supplied CSV when a path is configured; otherwise build the
# synthetic demonstration dataset.
data = pd.read_csv(data_path) if data_path else generate_synthetic_data()

data.head()


In [None]:

# Target is the 5-year outcome; every remaining column is a feature
y = data['cancer_5yr'].copy()
X = data.drop(columns='cancer_5yr')

# Columns passed through unchanged vs. columns that get one-hot encoded.
# The list order determines the column order of the transformed matrix.
numeric_features = ['year', 'age', 'BMI']
categorical_features = [
    'age_group_5_years',
    'race_eth',
    'first_degree_hx',
    'age_menarche',
    'age_first_birth',
    'BIRADS_breast_density',
    'current_hrt',
    'menopaus',
    'bmi_group',
    'biophx',
    'breast_cancer_history',
]

# Preprocessor: one-hot encode the categoricals (categories unseen at fit
# time are ignored at transform time), keep the numeric columns as-is
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
numeric_step = ('num', 'passthrough', numeric_features)
preprocessor = ColumnTransformer(transformers=[categorical_step, numeric_step])

# 80/20 train/test split, stratified on the target so both parts keep the
# same class balance
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

X_train.shape, X_test.shape


In [None]:

# Define and train three classification pipelines (preprocessing + estimator)
lr_clf = Pipeline([
    ('preprocess', preprocessor),
    ('clf', LogisticRegression(max_iter=1000, solver='lbfgs', class_weight='balanced', n_jobs=-1))
])
rf_clf = Pipeline([
    ('preprocess', preprocessor),
    ('clf', RandomForestClassifier(n_estimators=500, class_weight='balanced', n_jobs=-1, random_state=42))
])

# XGBoost's documented heuristic for class imbalance is
# sum(negative instances) / sum(positive instances). The previous value,
# 1 / y_train.mean(), is total / positives and therefore over-weights the
# positive class (it can never drop below 1, even when positives dominate).
n_positive = int((y_train == 1).sum())
n_negative = int((y_train == 0).sum())
pos_weight = n_negative / n_positive if n_positive else 1.0

# `use_label_encoder` is no longer passed: the argument is deprecated in
# recent XGBoost releases and is not needed for integer 0/1 labels.
xgb_clf = Pipeline([
    ('preprocess', preprocessor),
    ('clf', xgb.XGBClassifier(
        n_estimators=300, max_depth=4, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8,
        eval_metric='logloss', scale_pos_weight=pos_weight, n_jobs=-1, random_state=42
    ))
])

# Fit models
lr_clf.fit(X_train, y_train)
rf_clf.fit(X_train, y_train)
xgb_clf.fit(X_train, y_train)

# Display name -> fitted pipeline, used by the evaluation cells below
models = {
    'Logistic Regression': lr_clf,
    'Random Forest': rf_clf,
    'Gradient Boosting': xgb_clf
}


In [None]:

# Evaluate every fitted model on the held-out test set.
# Metrics are grouped by the kind of prediction they consume.
label_metrics = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1': f1_score,
}
probability_metrics = {
    'AUC': roc_auc_score,
    'Brier': brier_score_loss,
}

performance_records = []
for name, model in models.items():
    y_pred = model.predict(X_test)
    # Second column of predict_proba is the positive-class probability
    y_proba = model.predict_proba(X_test)[:, 1]

    record = {'Model': name}
    for metric_name, metric_fn in label_metrics.items():
        record[metric_name] = metric_fn(y_test, y_pred)
    for metric_name, metric_fn in probability_metrics.items():
        record[metric_name] = metric_fn(y_test, y_proba)
    performance_records.append(record)

perf_df = pd.DataFrame(performance_records)
perf_df


In [None]:

# Draw one test-set ROC curve per model on a shared axis
fig, ax = plt.subplots(figsize=(8, 6))
for name, model in models.items():
    y_proba = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _thresholds = roc_curve(y_test, y_proba)
    auc_value = roc_auc_score(y_test, y_proba)
    ax.plot(fpr, tpr, label=f'{name} (AUC = {auc_value:.3f})')

# Diagonal reference line (FPR == TPR)
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves')
ax.legend()
plt.show()


In [None]:

# Compute fairness metrics: false positive and false negative rates by group

def compute_group_rates(model, X_test, y_test, group_col):
    """Compute false positive and false negative rates per subgroup.

    Parameters
    ----------
    model : object
        Fitted classifier exposing ``predict(X_test)`` that returns 0/1 labels.
    X_test : pandas.DataFrame
        Feature frame passed to ``model.predict``; must contain ``group_col``.
    y_test : array-like
        True 0/1 labels, in the same row order as ``X_test``.
    group_col : str
        Name of the column in ``X_test`` that defines the subgroups.

    Returns
    -------
    pandas.DataFrame
        One row per group with columns ``group_col``, ``'FPR'``, ``'FNR'`` and
        ``'Count'``. A rate whose denominator is zero (no negatives for FPR,
        no positives for FNR in that group) is reported as 0.
    """
    # Build the frame positionally: predictions come back as a plain array, so
    # converting every column to an array avoids pandas index alignment
    # silently mismatching rows when X_test and y_test carry different indexes.
    df_res = pd.DataFrame({
        group_col: np.asarray(X_test[group_col]),
        'y_true': np.asarray(y_test),
        'y_pred': np.asarray(model.predict(X_test)),
    })
    results = []
    for group, subset in df_res.groupby(group_col):
        # labels=[0, 1] forces a 2x2 matrix even when a group contains a single
        # class; without it the 4-way unpacking below raises ValueError.
        tn, fp, fn, tp = confusion_matrix(
            subset['y_true'], subset['y_pred'], labels=[0, 1]
        ).ravel()
        fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
        fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
        results.append({group_col: group, 'FPR': fpr, 'FNR': fnr, 'Count': len(subset)})
    return pd.DataFrame(results)

# Per-group error rates of the gradient boosting model, by age group and by race/ethnicity
gb_model = models['Gradient Boosting']
fairness_age, fairness_race = (
    compute_group_rates(gb_model, X_test, y_test, group_column)
    for group_column in ('age_group_5_years', 'race_eth')
)

fairness_age, fairness_race


In [None]:

# Explain the gradient boosting model using SHAP on a random sample of up to 200 test rows
gb_pipeline = models['Gradient Boosting']
gb_preprocess = gb_pipeline.named_steps['preprocess']
explainer = shap.Explainer(gb_pipeline.named_steps['clf'])

# Transform data to model input (the classifier was trained on the encoded matrix)
X_transformed = gb_preprocess.transform(X_test)
# Column names of the encoded matrix. The explainer is built from the bare
# model, so it carries no feature names of its own; take them from the
# fitted preprocessor instead so the plot is labelled.
feature_names = list(gb_preprocess.get_feature_names_out())

# Choose a sample for explanation; a seeded generator keeps the plot
# reproducible across Restart & Run All
sample_rng = np.random.default_rng(42)
sample_indices = sample_rng.choice(len(X_test), size=min(200, len(X_test)), replace=False)
X_sample = X_transformed[sample_indices]
# The ColumnTransformer may return a SciPy sparse matrix when the one-hot
# columns dominate; densify the (small) sample before explaining and plotting.
if hasattr(X_sample, 'toarray'):
    X_sample = X_sample.toarray()
shap_values = explainer(X_sample)

# Summary plot
shap.summary_plot(shap_values, features=X_sample, feature_names=feature_names)
