In [2]:
# PCA and Logistic Regression on Breast Cancer Dataset

# Import Required Libraries

import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


# Load the Cancer Dataset

cancer = load_breast_cancer()

X = cancer.data
y = cancer.target
feature_names = cancer.feature_names

# Convert to DataFrame for clarity
df = pd.DataFrame(X, columns=feature_names)
df['target'] = y


# Standardize the Features

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


# Apply PCA (2 Components)

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Create PCA DataFrame
pca_df = pd.DataFrame(
    data=X_pca,
    columns=['Principal Component 1', 'Principal Component 2']
)

# Explained variance
explained_variance = pca.explained_variance_ratio_

print("Explained Variance Ratio:")
print(f"PC1: {explained_variance[0]:.4f}")
print(f"PC2: {explained_variance[1]:.4f}")
print(f"Total Variance Explained: {explained_variance.sum():.4f}")


# Train-Test Split

X_train, X_test, y_train, y_test = train_test_split(
    X_pca,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y
)


# Logistic Regression Model

log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Predictions
y_pred = log_reg.predict(X_test)


# Model Evaluation

accuracy = accuracy_score(y_test, y_pred)
print("\nModel Accuracy:", round(accuracy, 4))

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=cancer.target_names))


# PCA Loadings (Feature Importance)

loadings = pd.DataFrame(
    pca.components_.T,
    columns=['PC1', 'PC2'],
    index=feature_names
)

print("\nTop variables contributing to PC1:")
print(loadings['PC1'].abs().sort_values(ascending=False).head(5))

print("\nTop variables contributing to PC2:")
print(loadings['PC2'].abs().sort_values(ascending=False).head(5))


Explained Variance Ratio:
PC1: 0.4427
PC2: 0.1897
Total Variance Explained: 0.6324

Model Accuracy: 0.9591

Classification Report:
              precision    recall  f1-score   support

   malignant       0.95      0.94      0.94        64
      benign       0.96      0.97      0.97       107

    accuracy                           0.96       171
   macro avg       0.96      0.95      0.96       171
weighted avg       0.96      0.96      0.96       171


Top variables contributing to PC1:
mean concave points     0.260854
mean concavity          0.258400
worst concave points    0.250886
mean compactness        0.239285
worst perimeter         0.236640
Name: PC1, dtype: float64

Top variables contributing to PC2:
mean fractal dimension     0.366575
fractal dimension error    0.280092
worst fractal dimension    0.275339
mean radius                0.233857
compactness error          0.232716
Name: PC2, dtype: float64
