# Principal Component Analysis (PCA)

This notebook covers:
- Applying PCA for dimensionality reduction
- Determining the optimal number of components
- Visualizing PCA results
- Analyzing explained variance


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import warnings
# Silence warnings so the notebook output stays readable.
# NOTE(review): this suppresses every warning category, including
# deprecation notices — narrow the filter if those should stay visible.
warnings.filterwarnings('ignore')

# Set style for better plots.
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*';
# older releases only know the un-suffixed 'seaborn' name. Pick whichever
# this installation ships so the cell does not fail with an OSError.
plot_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(plot_style)
sns.set_palette("husl")


In [None]:
# Read the preprocessed feature matrix and the target labels from disk
features_csv = 'data/X_scaled.csv'
target_csv = 'data/y_target.csv'

X_scaled = pd.read_csv(features_csv)
# Flatten the target file's values into a 1-D array
y = pd.read_csv(target_csv).to_numpy().ravel()

# Report what was loaded: shapes first, then the per-class counts
print("Data loaded successfully!")
print(f"Features shape: {X_scaled.shape}", f"Target shape: {y.shape}", sep="\n")
class_counts = np.bincount(y)
print(f"Target distribution: {class_counts}")


In [None]:
# Apply PCA
print("Applying Principal Component Analysis:")
print("=" * 40)


def _components_for_variance(cumulative_ratio, threshold):
    """Return the smallest number of leading components reaching *threshold*.

    ``cumulative_ratio`` is the running sum of explained-variance ratios.
    When no prefix reaches the threshold (NaN ratios, or a threshold the
    rounded cumulative sum falls just short of), all components are kept.
    A bare ``np.argmax(mask) + 1`` would silently answer 1 in that case,
    because ``argmax`` of an all-False mask is 0.
    """
    reached = np.flatnonzero(np.asarray(cumulative_ratio) >= threshold)
    if reached.size == 0:
        return len(cumulative_ratio)
    # Plain int so the value prints and serialises like a normal count.
    return int(reached[0]) + 1


# Fit PCA with all components first to analyze explained variance
pca_full = PCA()
X_pca_full = pca_full.fit_transform(X_scaled)

# Per-component share of the variance and its running total
explained_variance_ratio = pca_full.explained_variance_ratio_
cumulative_variance_ratio = np.cumsum(explained_variance_ratio)

print("Explained Variance Ratio for each component:")
for rank, (var_ratio, cum_var) in enumerate(
    zip(explained_variance_ratio, cumulative_variance_ratio), start=1
):
    print(f"PC{rank:2d}: {var_ratio:.4f} (Cumulative: {cum_var:.4f})")

# Find optimal number of components (retain 95% variance)
n_components_95 = _components_for_variance(cumulative_variance_ratio, 0.95)
print(f"\nNumber of components to retain 95% variance: {n_components_95}")

# Find optimal number of components (retain 90% variance)
n_components_90 = _components_for_variance(cumulative_variance_ratio, 0.90)
print(f"Number of components to retain 90% variance: {n_components_90}")


In [None]:
# Visualize PCA results on a 2x2 grid of panels
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
# Row-major order: top-left, top-right, bottom-left, bottom-right
ax_variance, ax_cumulative, ax_scatter, ax_loadings = axes.ravel()

# Both variance arrays have one entry per component, so they share an x-axis
component_numbers = range(1, len(explained_variance_ratio) + 1)

# Panel 1: variance explained by each individual component
ax_variance.bar(component_numbers, explained_variance_ratio)
ax_variance.set_xlabel('Principal Component')
ax_variance.set_ylabel('Explained Variance Ratio')
ax_variance.set_title('Explained Variance Ratio by Component')
ax_variance.grid(True, alpha=0.3)

# Panel 2: running total of explained variance with the 95% / 90% guides
ax_cumulative.plot(component_numbers, cumulative_variance_ratio, 'bo-')
for level, line_color, line_label in (
    (0.95, 'r', '95% Variance'),
    (0.90, 'g', '90% Variance'),
):
    ax_cumulative.axhline(y=level, color=line_color, linestyle='--', label=line_label)
ax_cumulative.set_xlabel('Number of Components')
ax_cumulative.set_ylabel('Cumulative Explained Variance Ratio')
ax_cumulative.set_title('Cumulative Explained Variance')
ax_cumulative.legend()
ax_cumulative.grid(True, alpha=0.3)

# Panel 3: samples projected onto the first two components, coloured by target
pca_2d = PCA(n_components=2)
X_pca_2d = pca_2d.fit_transform(X_scaled)
pc1_share, pc2_share = pca_2d.explained_variance_ratio_

scatter = ax_scatter.scatter(
    X_pca_2d[:, 0],
    X_pca_2d[:, 1],
    c=y,
    cmap='viridis',
    alpha=0.6,
)
ax_scatter.set_xlabel(f'PC1 ({pc1_share:.3f})')
ax_scatter.set_ylabel(f'PC2 ({pc2_share:.3f})')
ax_scatter.set_title('PCA: First 2 Components')
plt.colorbar(scatter, ax=ax_scatter, label='Heart Disease')

# Panel 4: each feature's weights on the first two components
loadings = pca_2d.components_.T  # one row per feature, one column per component
feature_names = X_scaled.columns

ax_loadings.scatter(loadings[:, 0], loadings[:, 1])
for feature, (pc1_weight, pc2_weight) in zip(feature_names, loadings):
    ax_loadings.annotate(feature, (pc1_weight, pc2_weight))
ax_loadings.set_xlabel('PC1 Loadings')
ax_loadings.set_ylabel('PC2 Loadings')
ax_loadings.set_title('Feature Loadings on First 2 Components')
ax_loadings.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# Refit PCA with the chosen number of components and persist the result
print("Applying PCA with optimal number of components:")
print("=" * 50)

# The 90%-variance count is used as the balance point
n_components_optimal = n_components_90
print(f"Using {n_components_optimal} components to retain 90% variance")

# Fit and project in one step
pca_optimal = PCA(n_components=n_components_optimal)
X_pca_optimal = pca_optimal.fit_transform(X_scaled)
retained_ratios = pca_optimal.explained_variance_ratio_

print(f"Original shape: {X_scaled.shape}")
print(f"PCA shape: {X_pca_optimal.shape}")
print(f"Variance retained: {retained_ratios.sum():.4f}")

# Label the projected columns PC1..PCn and write them out without the index
component_labels = [f'PC{k}' for k in range(1, n_components_optimal + 1)]
X_pca_df = pd.DataFrame(X_pca_optimal, columns=component_labels)
X_pca_df.to_csv('data/X_pca.csv', index=False)

print("\nPCA transformed data saved to 'data/X_pca.csv'")
print(f"Components: {list(X_pca_df.columns)}")
print(f"Explained variance per component: {retained_ratios}")

# Preview the transformed data
print("\nFirst 5 rows of PCA transformed data:")
print(X_pca_df.head())
