# Assignment 2 — Logistic Regression with Threshold Tuning (Breast Cancer)
*Prepared:* 2025-10-11

**Goal:** Binary classification with Logistic Regression; analyze threshold trade-offs via ROC and PR curves.

**Dataset:** `sklearn.datasets.load_breast_cancer()`

In [None]:
# Setup
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             confusion_matrix, roc_curve, roc_auc_score, precision_recall_curve)

# One seed reused for NumPy's global RNG and every random_state argument below.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Default size (width, height in inches) for every figure in the notebook.
plt.rcParams.update({'figure.figsize': (7, 4)})

In [None]:
# Load the breast-cancer dataset and mirror it in a DataFrame for exploration
data = load_breast_cancer()
X = data.data
y = data.target

# Feature columns first, then the label appended as a 'target' column.
df = pd.DataFrame(X, columns=data.feature_names)
df['target'] = y

display(df.head())
display(df['target'].value_counts())

In [None]:
# Bar chart of how many samples fall in each class
# sort_index() orders the counts by class value (0, then 1) so they line up
# with the hand-written tick labels.
vc = df['target'].value_counts().sort_index()
class_labels = ['malignant(0)', 'benign(1)']
plt.bar(class_labels, vc.values)
plt.title('Class Balance')
plt.show()

In [None]:
# Ten features with the largest absolute correlation to the target
target_column = df.corr(numeric_only=True)['target']
# Drop the target's self-correlation and ignore sign before ranking.
feature_strength = target_column.drop('target').abs()
corr = feature_strength.sort_values(ascending=False).head(10)

corr.plot(kind='barh')
plt.title('Top 10 |corr(feature,target)|')
plt.show()

In [None]:
# Stratified 80/20 split, then a scale -> logistic-regression pipeline
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

scaler_step = StandardScaler()
logreg_step = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
pipe = Pipeline(steps=[('scaler', scaler_step), ('logreg', logreg_step)])
pipe.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of class 1, which the
# class-balance chart labels as benign.
proba = pipe.predict_proba(X_test)[:, 1]
# Default decision rule: predict class 1 when its probability is at least 0.5.
pred_default = (proba >= 0.5).astype(int)

def report(y_true, y_pred, name='Model', pos_label=1):
    """Print accuracy, precision, recall, F1 and the confusion matrix.

    Args:
        y_true: Ground-truth binary labels.
        y_pred: Predicted binary labels, same length as ``y_true``.
        name: Heading printed above the metrics.
        pos_label: Class treated as "positive" for precision, recall and F1.
            Defaults to 1 (scikit-learn's default), so existing calls print
            the same numbers as before; pass 0 to score the other class.

    Returns:
        None. All output goes to stdout.
    """
    # zero_division=0 keeps the score at 0.0 (the value scikit-learn already
    # falls back to) without emitting an UndefinedMetricWarning when a
    # threshold yields no predictions or no samples of the positive class.
    score_kwargs = {'pos_label': pos_label, 'zero_division': 0}
    print(name)
    print('Acc:', accuracy_score(y_true, y_pred),
          'Prec:', precision_score(y_true, y_pred, **score_kwargs),
          'Rec:', recall_score(y_true, y_pred, **score_kwargs),
          'F1:', f1_score(y_true, y_pred, **score_kwargs))
    # Rows are true classes, columns are predicted classes.
    print('Confusion matrix:\n', confusion_matrix(y_true, y_pred))

# Baseline metrics at the default 0.5 probability cutoff.
report(y_true=y_test, y_pred=pred_default, name='LogReg @0.50')

In [None]:
# ROC curve (with AUC in the title) followed by the precision-recall curve
fpr, tpr, _ = roc_curve(y_test, proba)
auc = roc_auc_score(y_test, proba)

plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], '--')  # diagonal reference line
plt.title(f'ROC (AUC={auc:.3f})')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.show()

# th holds the thresholds matching the precision/recall points; it is not
# used in this cell.
prec, rec, th = precision_recall_curve(y_test, proba)

plt.plot(rec, prec)
plt.title('Precision-Recall Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.show()

In [None]:
# Threshold tuning — choose two thresholds
# Precision and recall are reported for class 1, which the class-balance chart
# labels as benign: a lower cutoff predicts class 1 more often (higher benign
# recall), a higher cutoff less often (tends to give higher benign precision).
for thr, label in [(0.30, 'High Recall'), (0.70, 'High Precision')]:
    pred = (proba >= thr).astype(int)
    # The label was previously unpacked but never shown; include it so each
    # printed report states which trade-off the threshold illustrates.
    report(y_test, pred, f'LogReg @{thr:.2f} ({label})')

In [None]:
# Show the 10 most negative and 10 most positive coefficients.
# Selection is by signed value after sorting, not by absolute magnitude.
# The model is fitted after StandardScaler, so coefficients refer to the
# standardized features.
lr = pipe.named_steps['logreg']
coef = pd.Series(lr.coef_.ravel(), index=data.feature_names).sort_values()

lowest_ten = coef.iloc[:10]
highest_ten = coef.iloc[-10:]
top = pd.concat([lowest_ten, highest_ten])

top.plot(kind='barh')
plt.title('Top ±10 Coefficients')
plt.show()

**TODOs:**
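- Write 5 bullets deciding which threshold you’d use in screening vs diagnostics. Note that the positive class in this dataset is benign (1), so the precision and recall printed above describe benign predictions; take that into account when reasoning about missed malignant cases.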
- Interpret two strong features in plain language.