In [1]:
import numpy as np
from pandas import DataFrame
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
def calc_inertia(matrix):
    """Return the sum of the squared entries of ``matrix``.

    ``matrix`` must support ``** 2`` and a chained ``.sum().sum()``, e.g. a
    numpy array or a pandas DataFrame.
    """
    squared = matrix ** 2
    # For a DataFrame the first sum yields per-column totals and the second
    # collapses them; for a numpy array the first sum is already a scalar.
    partial_totals = squared.sum()
    return partial_totals.sum()


def sum_of_squared_errors(true, pred):
    """Residual sum of squares between ``true`` and ``pred``.

    Both arguments must support element-wise subtraction, ``** 2`` and a
    chained ``.sum().sum()`` (e.g. numpy arrays or pandas DataFrames).
    """
    residuals = true - pred
    squared_residuals = residuals ** 2
    partial_totals = squared_residuals.sum()
    return partial_totals.sum()

In [3]:
def calc_r2(pipeline: Pipeline, data, components_n=10):
    """Tabulate the cumulative R² of a PCA for 1..k retained components.

    Assumes that the PCA is the last step of the pipeline.

    The pipeline is fitted on ``data`` (so ``pipeline`` is left fitted as a
    side effect). For every ``k`` the data seen by the PCA is reconstructed
    from its first ``k`` components and three equivalent R² estimates are
    reported so they can be compared against each other.

    Parameters
    ----------
    pipeline : Pipeline
        Pipeline whose final step is a PCA; any preceding steps are treated
        as preprocessing.
    data : array-like
        Input passed to ``pipeline.fit``.
    components_n : int, default 10
        Maximum number of components to report; capped at the number of
        components the PCA actually fitted.

    Returns
    -------
    DataFrame
        One row per ``k`` with the columns ``n``, ``r2_ratio`` (inertia of
        the reconstruction / total inertia), ``r2_ress`` (1 - residual sum
        of squares / total inertia) and ``r2_sklearn`` (cumulative
        ``explained_variance_ratio_``).
    """
    pipeline.fit(data)

    pca = pipeline.steps[-1][1]
    loadings = pca.components_

    # Reuse the preprocessing steps that ``pipeline.fit`` just fitted instead
    # of fitting them a second time; a PCA-only pipeline has nothing to apply.
    if len(pipeline.steps) > 1:
        x_processed = pipeline[:-1].transform(data)
    else:
        x_processed = data

    # PCA centres its input internally, so the reconstruction below lives in
    # the centred space. Centre the reference data the same way; otherwise the
    # inertia and residuals are only right when preprocessing already centred.
    x_centered = np.asarray(x_processed) - pca.mean_
    total_inertia = calc_inertia(x_centered)

    # Project explicitly rather than via ``pipeline.transform`` so the scores
    # stay unscaled even when the PCA was created with ``whiten=True``.
    scores = x_centered @ loadings.T

    # Loop-invariant: cumulative explained variance ratio for every k.
    cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)

    components_n = min(components_n, loadings.shape[0])

    result = []
    for k in range(1, components_n + 1):
        # Rank-k reconstruction of the centred data.
        x_hat = scores[:, :k] @ loadings[:k, :]

        # Residual sum of squares of the rank-k reconstruction.
        ress = sum_of_squared_errors(x_centered, x_hat)
        explained = calc_inertia(x_hat)

        result.append({
            'n': k,
            'r2_ratio': explained / total_inertia,
            'r2_ress': 1 - ress / total_inertia,
            'r2_sklearn': cumulative_ratio[k - 1]
        })

    return DataFrame(result)

In [4]:
# Load the iris dataset bundled with scikit-learn.
iris = datasets.load_iris()

In [5]:
# Standardise the features, then run a PCA with default settings as the
# final step (calc_r2 expects the PCA to be last).
pipeline = make_pipeline(StandardScaler(), PCA())

In [6]:
# Fit the pipeline on the iris feature matrix and tabulate the cumulative R²
# per number of components; the bare last expression displays the table.
pca_r2 = calc_r2(pipeline, iris.data)
pca_r2

Unnamed: 0,n,r2_ratio,r2_ress,r2_sklearn
0,1,0.729624,0.729624,0.729624
1,2,0.958132,0.958132,0.958132
2,3,0.994821,0.994821,0.994821
3,4,1.0,1.0,1.0
