# 02 — PCA Analysis

- Fits PCA on the preprocessed numeric feature space
- Plots the cumulative explained variance
- Saves the fitted PCA object to `../models/pca.pkl` for downstream steps

In [None]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.exceptions import NotFittedError

# Seed passed to PCA below.
RANDOM_STATE = 42

# Fail early, with an actionable message, if a required input is missing.
train_path = Path("../data/processed/train.csv")
if not train_path.exists():
    raise FileNotFoundError("Run notebook 01 first to create processed data.")

preprocessor_path = Path("../models/preprocessor.pkl")
if not preprocessor_path.exists():
    raise FileNotFoundError(
        f"Missing {preprocessor_path}; run the earlier notebook that saves "
        "the preprocessor first."
    )

preprocessor = joblib.load(preprocessor_path)

train = pd.read_csv(train_path)

# The label column name is not fixed; take the first known candidate present.
possible_targets = ["target", "num", "condition", "disease"]
target = next((t for t in possible_targets if t in train.columns), None)
if target is None:
    raise ValueError("Target not found in processed train.csv")

X_train = train.drop(columns=[target])
y_train = train[target]

# Reuse the persisted fit when there is one, so the PCA below is learned on the
# same feature space that preprocessor.pkl produces elsewhere. Re-fitting here
# would silently discard that state. Only fit when the artifact was saved
# unfitted.
try:
    X_proc = preprocessor.transform(X_train)
except NotFittedError:
    X_proc = preprocessor.fit_transform(X_train)

# PCA
# Convert to a dense array when the preprocessor output exposes toarray()
# (e.g. a SciPy sparse matrix); otherwise use the output unchanged.
if hasattr(X_proc, "toarray"):
    pca_input = X_proc.toarray()
else:
    pca_input = X_proc

# No n_components is given, so scikit-learn's default component count applies.
pca = PCA(random_state=RANDOM_STATE)
X_pca = pca.fit_transform(pca_input)

# Explained variance plot
# Running total of the variance captured by the first k components.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Explicit 1-based x values: without them plt.plot would use 0..n-1, so the
# "Number of Components" axis would read one component too low at every point.
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.figure()
plt.plot(component_counts, cumulative_variance)
plt.xlabel("Number of Components")
plt.ylabel("Cumulative Explained Variance")
plt.title("PCA Cumulative Explained Variance")
plt.grid(True)
plt.show()

# Persist the fitted PCA object so later steps can load it instead of refitting.
pca_output = "../models/pca.pkl"
joblib.dump(pca, pca_output)
print(f"Saved {pca_output}")