# Comparing NipalsPCA and sklearn PCA Explained Variance

This notebook compares the explained variance delivered by NipalsPCA and sklearn's PCA when applied to the PCA test data from the repository.

## 1. Import Required Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from open_nipals.nipalsPCA import NipalsPCA

warnings.filterwarnings("ignore")

## 2. Load PCA Test Data

In [None]:
# Define path to data directory
work_dir = Path.cwd()
data_path = work_dir.joinpath("..", "..", "data")

# Load PCA test data from Excel file
pca_test_data = pd.read_excel(
    data_path.joinpath("PCATestData.xlsx"), header=None
).to_numpy()
pca_test_data

array([[6.70748286e-03, 9.67644339e-03, 7.07724403e-03, ...,
        2.07371436e-03, 1.86955582e-03, 6.46034406e-04],
       [5.33032712e-03, 6.09655360e-03, 7.18617507e-03, ...,
        5.43407590e-03, 7.28326025e-03, 2.91585148e-03],
       [5.29547887e-03, 8.42974850e-03, 1.06121059e-02, ...,
        3.53183554e-03, 2.81749636e-03, 4.83817120e-03],
       ...,
       [2.85331193e-03, 9.35229260e-03, 1.07547645e-02, ...,
        7.63081101e-03, 6.82906394e-05, 9.32609594e-03],
       [9.28893720e-03, 6.48875669e-03, 1.68109690e-03, ...,
        4.36248450e-03, 4.13362233e-03, 7.04727154e-03],
       [9.80009828e-03, 4.20128941e-03, 2.38190197e-03, ...,
        6.81898238e-03, 8.77817101e-03, 7.19562145e-03]], shape=(40, 1000))

## 3. Standardize the Data

In [4]:
# Standardize the data using StandardScaler
scaler = StandardScaler()
data_scaled = scaler.fit_transform(pca_test_data)

## 4. Train NipalsPCA Model

In [13]:
# Train NipalsPCA model with 5 components
n_components = 5
nipals_pca = NipalsPCA(n_components=n_components)
nipals_pca.fit(data_scaled)

# Get explained variance from NipalsPCA
nipals_explained_var = nipals_pca.explained_variance_

## 5. Train sklearn PCA Model

In [14]:
# Train sklearn PCA model with 5 components
sklearn_pca = PCA(n_components=n_components)
sklearn_pca.fit(data_scaled)

# Get explained variance from sklearn
sklearn_explained_var = sklearn_pca.explained_variance_ratio_

## 6. Compare Explained Variance

In [15]:
# Create comparison dataframe
comparison_df = pd.DataFrame(
    {
        "Component": [f"PC{i + 1}" for i in range(n_components)],
        "NipalsPCA": nipals_explained_var,
        "sklearn PCA": sklearn_explained_var,
        "Difference": nipals_explained_var - sklearn_explained_var,
        "Relative Error (%)": 100
        * np.abs(nipals_explained_var - sklearn_explained_var)
        / np.abs(sklearn_explained_var),
    }
)

print("\nComparison of Explained Variance:")
print(comparison_df.to_string(index=False))
print(
    f"\nMax absolute difference: {np.abs(nipals_explained_var - sklearn_explained_var).max():.2e}"
)
print(
    f"Max relative error: {(100 * np.abs(nipals_explained_var - sklearn_explained_var) / np.abs(sklearn_explained_var)).max():.4f}%"
)


Comparison of Explained Variance:
Component  NipalsPCA  sklearn PCA    Difference  Relative Error (%)
      PC1   0.466021     0.466021 -2.102762e-13        4.512163e-11
      PC2   0.261290     0.261290  2.156053e-13        8.251576e-11
      PC3   0.012234     0.012130  1.038205e-04        8.558874e-01
      PC4   0.011807     0.011583  2.240713e-04        1.934453e+00
      PC5   0.011539     0.011413  1.263234e-04        1.106881e+00

Max absolute difference: 2.24e-04
Max relative error: 1.9345%
