In [3]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

# Load the Wisconsin Diagnostic Breast Cancer (WDBC) data file. It has no
# header row, so columns are addressed by position.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
df = pd.read_csv(url, header=None)

# Column 1 holds the diagnosis label; columns 2 onward are the features
# (column 0 is not used).
X = df.iloc[:, 2:].values
y = df.iloc[:, 1].map({'M': 1, 'B': 0}).values  # Convert labels to binary (M = 1, B = 0)

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# test rows' mean/variance leak into the training features and make the
# reported test metrics optimistic. With the same random_state and number of
# samples, the rows assigned to train/test are the same as when splitting the
# pre-scaled array.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Learn the standardization statistics from the training rows only, then
# apply that same transform to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full dataset under the train-fitted scaler. Kept so any later code that
# refers to X_scaled still works; it is not used for model fitting here.
X_scaled = scaler.transform(X)

# Baseline: a shallow (depth-2) Gini decision tree fitted directly on the
# standardized features, with no dimensionality reduction.
clf_original = DecisionTreeClassifier(
    criterion='gini',
    max_depth=2,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
)
clf_original.fit(X_train, y_train)
y_pred_original = clf_original.predict(X_test)

# Precision, recall and F1 of the baseline on the held-out test set
# (positive class is label 1, i.e. 'M').
precision_original, recall_original, f1_original = (
    metric(y_test, y_pred_original)
    for metric in (precision_score, recall_score, f1_score)
)

# Project onto the first principal component. The projection is learned from
# the training split only and then applied unchanged to the test split.
pca = PCA(n_components=1)
X_train_pca1 = pca.fit_transform(X_train)
X_test_pca1 = pca.transform(X_test)

# Same tree hyperparameters as the baseline, but trained on the single
# principal-component feature.
clf_pca1 = DecisionTreeClassifier(
    criterion='gini',
    max_depth=2,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
)
clf_pca1.fit(X_train_pca1, y_train)
y_pred_pca1 = clf_pca1.predict(X_test_pca1)

# Precision, recall and F1 on the test set for the 1-component model
precision_pca1, recall_pca1, f1_pca1 = (
    metric(y_test, y_pred_pca1)
    for metric in (precision_score, recall_score, f1_score)
)

# Project onto the first two principal components. Note that `pca` is
# rebound here, so the 1-component fit is no longer reachable by that name.
pca = PCA(n_components=2)
X_train_pca2 = pca.fit_transform(X_train)
X_test_pca2 = pca.transform(X_test)

# Same tree hyperparameters as the baseline, trained on the two
# principal-component features.
clf_pca2 = DecisionTreeClassifier(
    criterion='gini',
    max_depth=2,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
)
clf_pca2.fit(X_train_pca2, y_train)
y_pred_pca2 = clf_pca2.predict(X_test_pca2)

# Precision, recall and F1 on the test set for the 2-component model
precision_pca2, recall_pca2, f1_pca2 = (
    metric(y_test, y_pred_pca2)
    for metric in (precision_score, recall_score, f1_score)
)

# Confusion matrix for the 1-component PCA model. Rows are true labels and
# columns are predicted labels, so flattening the 2x2 matrix row by row
# yields TN, FP, FN, TP in that order.
cm_pca1 = confusion_matrix(y_test, y_pred_pca1)
TN_pca1, FP_pca1, FN_pca1, TP_pca1 = cm_pca1.ravel()

# Rates derived from the counts:
#   TPR = TP / (TP + FN)  -- share of actual positives that were detected
#   FPR = FP / (FP + TN)  -- share of actual negatives that were flagged
TPR_pca1 = TP_pca1 / (TP_pca1 + FN_pca1)
FPR_pca1 = FP_pca1 / (FP_pca1 + TN_pca1)

# Report the metrics for each model, one section per model. Each print call
# emits a whole section, with its lines joined by newlines.
print(
    "Original Data Performance:",
    f"Precision: {precision_original:.4f}, Recall: {recall_original:.4f}, F1 Score: {f1_original:.4f}\n",
    sep="\n",
)

# The 1-component section additionally shows raw counts and FPR/TPR.
print(
    "PCA (1 Component) Performance:",
    f"Precision: {precision_pca1:.4f}, Recall: {recall_pca1:.4f}, F1 Score: {f1_pca1:.4f}",
    f"False Positives: {FP_pca1}, True Positives: {TP_pca1}",
    f"False Positive Rate: {FPR_pca1:.4f}, True Positive Rate: {TPR_pca1:.4f}\n",
    sep="\n",
)

print(
    "PCA (2 Components) Performance:",
    f"Precision: {precision_pca2:.4f}, Recall: {recall_pca2:.4f}, F1 Score: {f1_pca2:.4f}",
    sep="\n",
)


Original Data Performance:
Precision: 0.9048, Recall: 0.9048, F1 Score: 0.9048

PCA (1 Component) Performance:
Precision: 0.8923, Recall: 0.9206, F1 Score: 0.9062
False Positives: 7, True Positives: 58
False Positive Rate: 0.0648, True Positive Rate: 0.9206

PCA (2 Components) Performance:
Precision: 0.9310, Recall: 0.8571, F1 Score: 0.8926
